From 61cb608e90c311861ed396abdc18da89c2f1895d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2019 11:53:50 -0700 Subject: tools/memory-model: Make judgelitmus.sh note timeouts Currently, judgelitmus.sh treats timeouts (as in the "--timeout" argument) as "!!! Verification error". This can be misleading because it is quite possible that running the test longer would have produced a verification. This commit therefore changes judgelitmus.sh to check for timeouts and to report them with "!!! Timeout". Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 0cc63875e395..d3c313b9a458 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -42,6 +42,14 @@ grep '^Observation' $LKMM_DESTDIR/$litmus.out if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out then : +elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out +then + echo ' !!! Timeout' $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 124 else echo ' !!! Verification error' $litmus if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out -- cgit v1.2.1 From ab9ad415a4472de740e3bb3aeccec7b723d55f41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2019 13:07:46 -0700 Subject: tools/memory-model: Make cmplitmushist.sh note timeouts Currently, cmplitmushist.sh treats timeouts (as in the "--timeout" argument) as "Missing Observation line". This can be misleading because it is quite possible that running the test longer would have produced a verification. This commit therefore changes cmplitmushist.sh to check for timeouts and to report them with "Timed out". Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/cmplitmushist.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh index 0f498aeeccf5..b9c174dd8004 100755 --- a/tools/memory-model/scripts/cmplitmushist.sh +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -12,12 +12,30 @@ trap 'rm -rf $T' 0 mkdir $T # comparetest oldpath newpath +timedout=0 perfect=0 obsline=0 noobsline=0 obsresult=0 badcompare=0 comparetest () { + if grep -q '^Command exited with non-zero status 124' $1 || + grep -q '^Command exited with non-zero status 124' $2 + then + if grep -q '^Command exited with non-zero status 124' $1 && + grep -q '^Command exited with non-zero status 124' $2 + then + echo Both runs timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $1 + then + echo Old run timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $2 + then + echo New run timed out: $2 + fi + timedout=`expr "$timedout" + 1` + return 0 + fi grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1 @@ -78,6 +96,10 @@ if test "$obsresult" -ne 0 then echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2 fi +if test "$timedout" -ne 0 +then + echo "!!!" Timed out: $timedout 1>&2 +fi if test "$badcompare" -ne 0 then echo "!!!" Result changed: $badcompare 1>&2 -- cgit v1.2.1 From fd99ec8d8ed510f4ecae60f8091aa97d42395cb0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 18 Mar 2019 13:40:57 -0700 Subject: tools/memory-model: Make judgelitmus.sh identify bad macros Currently, judgelitmus.sh treats use of unknown primitives (such as srcu_read_lock() prior to SRCU support) as "!!! Verification error". This can be misleading because it fails to call out typos and running a version LKMM on a litmus test requiring a feature not provided by that version. This commit therefore changes judgelitmus.sh to check for unknown primitives and to report them, for example, with: '!!! Current LKMM version does not know "rcu_write_lock"'. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/cmplitmushist.sh | 31 +++++++++++++++++++++++++---- tools/memory-model/scripts/judgelitmus.sh | 12 +++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh index b9c174dd8004..ca1ac8b64614 100755 --- a/tools/memory-model/scripts/cmplitmushist.sh +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -12,6 +12,7 @@ trap 'rm -rf $T' 0 mkdir $T # comparetest oldpath newpath +badmacnam=0 timedout=0 perfect=0 obsline=0 @@ -19,8 +20,26 @@ noobsline=0 obsresult=0 badcompare=0 comparetest () { - if grep -q '^Command exited with non-zero status 124' $1 || - grep -q '^Command exited with non-zero status 124' $2 + if grep -q ': Unknown macro ' $1 || grep -q ': Unknown macro ' $2 + then + if grep -q ': Unknown macro ' $1 + then + badname=`grep ': Unknown macro ' $1 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $1 + fi + if grep -q ': Unknown macro ' $2 + then + badname=`grep ': Unknown macro ' $2 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $2 + fi + badmacnam=`expr "$badmacnam" + 1` + return 0 + elif grep -q '^Command exited with non-zero status 124' $1 || + grep -q '^Command exited with non-zero status 124' $2 then if grep -q '^Command exited with non-zero status 124' $1 && grep -q '^Command exited with non-zero status 124' $2 @@ -56,7 +75,7 @@ comparetest () { return 0 fi else - echo Missing Observation line "(e.g., herd7 timeout)": $2 + echo Missing Observation line "(e.g., syntax error)": $2 noobsline=`expr "$noobsline" + 1` return 0 fi @@ -90,7 +109,7 @@ then fi if test "$noobsline" -ne 0 then - echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2 + echo Missing Observation line "(e.g., syntax error)": $noobsline 1>&2 fi if test "$obsresult" -ne 0 then @@ -100,6 +119,10 @@ if test "$timedout" -ne 0 then echo "!!!" Timed out: $timedout 1>&2 fi +if test "$badmacnam" -ne 0 +then + echo "!!!" Unknown primitive: $badmacnam 1>&2 +fi if test "$badcompare" -ne 0 then echo "!!!" Result changed: $badcompare 1>&2 diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d3c313b9a458..d40439c7b71e 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -42,6 +42,18 @@ grep '^Observation' $LKMM_DESTDIR/$litmus.out if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out then : +elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out +then + badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus" + echo $badmsg + if ! grep -q '!!!' 
$LKMM_DESTDIR/$litmus.out + then + echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 254 elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out then echo ' !!! Timeout' $litmus -- cgit v1.2.1 From 30aee4f9caf8f5783970c8231f552714e03df723 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 14:27:06 -0700 Subject: tools/memory-model: Make judgelitmus.sh detect hard deadlocks If a litmus test specifies "Result: Never" and if it contains an unconditional ("hard") deadlock, then running checklitmus.sh on it will not flag any errors, despite the fact that there are no executions. This commit therefore updates judgelitmus.sh to complain about tests with no executions that are marked, but not as "Result: DEADLOCK". Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d40439c7b71e..84c62eee321b 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -83,6 +83,14 @@ then fi ret=1 fi +elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' +then + echo " !!! Unexpected non-$outcome deadlock" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe then ret=0 -- cgit v1.2.1 From d5c771df186390fc5dc5d2609aae20a2f56c2182 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 07:33:18 -0700 Subject: tools/memory-model: Fix paulmck email address on pre-existing scripts Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 2 +- tools/memory-model/scripts/checklitmus.sh | 2 +- tools/memory-model/scripts/checklitmushist.sh | 2 +- tools/memory-model/scripts/judgelitmus.sh | 2 +- tools/memory-model/scripts/newlitmushist.sh | 2 +- tools/memory-model/scripts/parseargs.sh | 2 +- tools/memory-model/scripts/runlitmushist.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 3c0c7fbbd223..10e14d94acee 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -17,7 +17,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . scripts/parseargs.sh diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 11461ed40b5e..638b8c610894 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -15,7 +15,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney litmus=$1 herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh index 1d210ffb7c8a..406ecfc0aee4 100755 --- a/tools/memory-model/scripts/checklitmushist.sh +++ b/tools/memory-model/scripts/checklitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . 
scripts/parseargs.sh diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 84c62eee321b..d82133e75580 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -13,7 +13,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney litmus=$1 diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh index 991f8f814881..3f4b06e29988 100755 --- a/tools/memory-model/scripts/newlitmushist.sh +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . scripts/parseargs.sh diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 40f52080fdbd..afe7bd23de6b 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -9,7 +9,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney T=/tmp/parseargs.sh.$$ mkdir $T diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index 6ed376f495bb..852786fef179 100755 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -13,7 +13,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney T=/tmp/runlitmushist.sh.$$ trap 'rm -rf $T' 0 -- cgit v1.2.1 From a9ce6100e18b5862484301ef36fea926fc04e7f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 15:59:26 -0700 Subject: tools/memory-model: Update parseargs.sh for hardware verification This commit adds a --hw argument to parseargs.sh to specify the CPU family for a hardware verification. For example, "--hw AArch64" will specify that a C-language litmus test is to be translated to ARMv8 and the result verified. This will set the LKMM_HW_MAP_FILE environment variable accordingly. If there is no --hw argument, this environment variable will be set to the empty string. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index afe7bd23de6b..5f016fc3f3af 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -27,6 +27,7 @@ initparam () { initparam LKMM_DESTDIR "." initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg" +initparam LKMM_HW_MAP_FILE "" initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN` initparam LKMM_PROCS "3" initparam LKMM_TIMEOUT "1m" @@ -37,10 +38,11 @@ usagehelp () { echo "Usage $scriptname [ arguments ]" echo " --destdir path (place for .litmus.out, default by .litmus)" echo " --herdopts -conf linux-kernel.cfg ..." 
+ echo " --hw AArch64" echo " --jobs N (number of jobs, default one per CPU)" echo " --procs N (litmus tests with at most this many processes)" echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')" - echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" + echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --hw '$LKMM_HW_MAP_FILE' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" exit 1 } @@ -95,6 +97,11 @@ do LKMM_HERD_OPTIONS="$2" shift ;; + --hw) + checkarg --hw "(.map file architecture name)" "$#" "$2" '^[A-Za-z0-9_-]\+' '^--' + LKMM_HW_MAP_FILE="$2" + shift + ;; -j[1-9]*) njobs="`echo $1 | sed -e 's/^-j//'`" trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`" -- cgit v1.2.1 From 95ea741f39244b691a48a8c101687844087be750 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 14:39:10 -0700 Subject: tools/memory-model: Make judgelitmus.sh handle hardware verifications This commit makes the judgelitmus.sh script check the --hw argument (AKA the LKMM_HW_MAP_FILE environment variable) and to adjust its judgment for a run where a C-language litmus test has been translated to assembly and the assembly version verified. In this case, the assembly verification output is checked against the C-language script's "Result:" comment. However, because hardware can be stronger than LKMM requires, the judgelitmus.sh script forgives verification mismatches featuring a "Sometimes" in the C-language script and an "Always" or "Never" assembly-language verification. Note that deadlock is not forgiven, however, this should not normally be an issue given that C-language tests containing locking, RCU, or SRCU cannot be translated to assembly. However, this issue can crop up in litmus tests that mimic deadlock by using the "filter" clause to ignore all executions. It can also crop up when certain herd arguments are used to autofilter everything that does not match the "exists" clause in cases where the "exists" clause cannot be satisfied. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/README | 8 ++-- tools/memory-model/scripts/judgelitmus.sh | 75 +++++++++++++++++++------------ 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 095c7eb36f9f..0e29a52044c1 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -43,10 +43,10 @@ initlitmushist.sh judgelitmus.sh - Given a .litmus file and its .litmus.out herd7 output, check the - .litmus.out file against the .litmus file's "Result:" comment to - judge whether the test ran correctly. Not normally run manually, - provided instead for use by other scripts. + Given a .litmus file and its herd7 output, check the output file + against the .litmus file's "Result:" comment to judge whether + the test ran correctly. Not normally run manually, provided + instead for use by other scripts. 
newlitmushist.sh diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d82133e75580..6f3c60065c8b 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -1,9 +1,14 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Given a .litmus test and the corresponding .litmus.out file, check -# the .litmus.out file against the "Result:" comment to judge whether -# the test ran correctly. +# Given a .litmus test and the corresponding litmus output file, check +# the .litmus.out file against the "Result:" comment to judge whether the +# test ran correctly. If the --hw argument is omitted, check against the +# LKMM output, which is assumed to be in file.litmus.out. If this argument +# is provided, this is assumed to be a hardware test, and the output is +# assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. +# In addition, non-Sometimes verification results will be noted, but +# forgiven. # # Usage: # judgelitmus.sh file.litmus @@ -24,11 +29,18 @@ else echo ' --- ' error: \"$litmus\" is not a readable file exit 255 fi -if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out +if test -z "$LKMM_HW_MAP_FILE" +then + litmusout=$litmus.out +else + litmusout="`echo $litmus | + sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" +fi +if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" then : else - echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file + echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file exit 255 fi if grep -q '^ \* Result: ' $litmus @@ -38,69 +50,76 @@ else outcome=specified fi -grep '^Observation' $LKMM_DESTDIR/$litmus.out -if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out +grep '^Observation' $LKMM_DESTDIR/$litmusout +if grep -q '^Observation' $LKMM_DESTDIR/$litmusout then : -elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out +elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout then - badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out | + badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout | sed -e 's/^.*: Unknown macro //' | sed -e 's/ (User error).*$//'` badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus" echo $badmsg - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 254 -elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out +elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmusout then echo ' !!! Timeout' $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 124 else echo ' !!! Verification error' $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 255 fi if test "$outcome" = DEADLOCK then - if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' + if grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' then ret=0 else echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' 
$LKMM_DESTDIR/$litmusout then - echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi ret=1 fi -elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' then echo " !!! Unexpected non-$outcome deadlock" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmusout 2>&1 fi ret=1 -elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$outcome" = Maybe then ret=0 else - echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if test -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes then - echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + flag="--- Forgiven" + ret=0 + else + flag="!!! Unexpected" + ret=1 + fi + echo " $flag non-$outcome verification" $litmus + if ! grep -qe "$flag" $LKMM_DESTDIR/$litmusout + then + echo " $flag non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi - ret=1 fi -tail -2 $LKMM_DESTDIR/$litmus.out | head -1 +tail -2 $LKMM_DESTDIR/$litmusout | head -1 exit $ret -- cgit v1.2.1 From 8f74402562817c5cb2d4285c7e16b8a39751becd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 16:21:09 -0700 Subject: tools/memory-model: Add simpletest.sh to check locking, RCU, and SRCU This commit abstracts out common function to check a given litmus test for locking, RCU, and SRCU in order to avoid duplicating code. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/simpletest.sh | 35 ++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 tools/memory-model/scripts/simpletest.sh diff --git a/tools/memory-model/scripts/simpletest.sh b/tools/memory-model/scripts/simpletest.sh new file mode 100755 index 000000000000..7edc5d361665 --- /dev/null +++ b/tools/memory-model/scripts/simpletest.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Give zero status if this is a simple test and non-zero otherwise. +# Simple tests do not contain locking, RCU, or SRCU. +# +# Usage: +# simpletest.sh file.litmus +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney + + +litmus=$1 + +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi +exclude="^[[:space:]]*\(" +exclude="${exclude}spin_lock(\|spin_unlock(\|spin_trylock(\|spin_is_locked(" +exclude="${exclude}\|rcu_read_lock(\|rcu_read_unlock(" +exclude="${exclude}\|synchronize_rcu(\|synchronize_rcu_expedited(" +exclude="${exclude}\|srcu_read_lock(\|srcu_read_unlock(" +exclude="${exclude}\|synchronize_srcu(\|synchronize_srcu_expedited(" +exclude="${exclude}\)" +if grep -q $exclude $litmus +then + exit 255 +fi +exit 0 -- cgit v1.2.1 From 69e0a5867dae05872702e4e30202dc0cacd7964f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 19 Mar 2019 16:37:01 -0700 Subject: tools/memory-model: Fix checkalllitmus.sh comment The checkalllitmus.sh runs litmus tests in the litmus-tests directory, not those in the github archive, so this commit updates the comment to reflect this reality. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 10e14d94acee..54d8da8c338e 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -30,8 +30,8 @@ else exit 255 fi -# Create any new directories that have appeared in the github litmus -# repo since the last run. +# Create any new directories that have appeared in the litmus-tests +# directory since the last run. if test "$LKMM_DESTDIR" != "." then find $litmusdir -type d -print | -- cgit v1.2.1 From c19b8534e9187b6eff782e5f74cdd71f7dc93330 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 12:39:27 -0700 Subject: tools/memory-model: Hardware checking for check{,all}litmus.sh This commit makes checklitmus.sh and checkalllitmus.sh check to see if a hardware verification was specified (via the --hw command-line argument, which sets the LKMM_HW_MAP_FILE environment variable). If so, the C-language litmus test is converted to the specified type of assembly-language litmus test and herd is run on it. Hardware is permitted to be stronger than LKMM requires, so "Always" and "Never" verifications of "Sometimes" C-language litmus tests are forgiven. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 23 ++++++++------- tools/memory-model/scripts/checklitmus.sh | 42 +++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 54d8da8c338e..2d3ee850a839 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # # Run herd7 tests on all .litmus files in the litmus-tests directory @@ -8,6 +8,11 @@ # "^^^". It also outputs verification results to a file whose name is # that of the specified litmus test, but with ".out" appended. # +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# # Usage: # checkalllitmus.sh # @@ -38,21 +43,15 @@ then ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Find the checklitmus script. If it is not where we expect it, then -# assume that the caller has the PATH environment variable set -# appropriately. -if test -x scripts/checklitmus.sh -then - clscript=scripts/checklitmus.sh -else - clscript=checklitmus.sh -fi - # Run the script on all the litmus tests in the specified directory ret=0 for i in $litmusdir/*.litmus do - if ! $clscript $i + if test -n "$LKMM_HW_MAP_FILE" && ! scripts/simpletest.sh $i + then + continue + fi + if ! 
scripts/checklitmus.sh $i then ret=1 fi diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 638b8c610894..42ff11869cd6 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -6,6 +6,11 @@ # results to a file whose name is that of the specified litmus test, but # with ".out" appended. # +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# # Usage: # checklitmus.sh file.litmus # @@ -18,8 +23,6 @@ # Author: Paul E. McKenney litmus=$1 -herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} - if test -f "$litmus" -a -r "$litmus" then : @@ -28,7 +31,38 @@ else exit 255 fi -echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +if test -z "$LKMM_HW_MAP_FILE" +then + # LKMM run + herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} + echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +else + # Hardware run + + T=/tmp/checklitmushw.sh.$$ + trap 'rm -rf $T' 0 2 + mkdir $T + + # Generate filenames + catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" + mapfile="Linux2${LKMM_HW_MAP_FILE}.map" + themefile="$T/${LKMM_HW_MAP_FILE}.theme" + herdoptions="-model $LKMM_HW_CAT_FILE" + hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` + hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` + + # Don't run on litmus tests with complex synchronization + if ! scripts/simpletest.sh $litmus + then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 + fi + + # Generate the assembly code and run herd7 on it. + gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile + jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +fi scripts/judgelitmus.sh $litmus -- cgit v1.2.1 From a05fec8cfaeae52bc34187ec8acf53151404568e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 14:37:46 -0700 Subject: tools/memory-model: Make judgelitmus.sh ransack .litmus.out files The judgelitmus.sh script currently relies solely on the "Result:" comment in the .litmus file. This is problematic when using the --hw argument, because it is necessary to check the hardware model against LKMM even in the absence of "Result:" comments. This commit therefore modifies judgelitmus.sh to check the observation in a .litmus.out file, in case one was generated by a previous LKMM run. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 6f3c60065c8b..fe9131f8eb96 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -8,7 +8,9 @@ # is provided, this is assumed to be a hardware test, and the output is # assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. # In addition, non-Sometimes verification results will be noted, but -# forgiven. +# forgiven. 
Furthermore, if there is no "Result:" comment but there is +# an LKMM .litmus.out file, the observation in that file will be used +# to judge the assembly-language verification. # # Usage: # judgelitmus.sh file.litmus @@ -32,9 +34,11 @@ fi if test -z "$LKMM_HW_MAP_FILE" then litmusout=$litmus.out + lkmmout= else litmusout="`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" + lkmmout=$litmus.out fi if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" then @@ -46,6 +50,9 @@ fi if grep -q '^ \* Result: ' $litmus then outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` +elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1 +then + outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'` else outcome=specified fi -- cgit v1.2.1 From bc99d9407f30816a1ce30b64428a5d41aafd4270 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 14:57:56 -0700 Subject: tools/memory-model: Split runlitmus.sh out of checklitmus.sh This commit prepares for adding --hw capability to github litmus-test scripts by splitting runlitmus.sh (which simply runs the verification) out of checklitmus.sh (which also judges the results). Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checklitmus.sh | 57 ++----------------------- tools/memory-model/scripts/runlitmus.sh | 69 +++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 53 deletions(-) create mode 100755 tools/memory-model/scripts/runlitmus.sh diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 42ff11869cd6..4c1d0cf0ddad 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -1,15 +1,8 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Run a herd7 test and invokes judgelitmus.sh to check the result against -# a "Result:" comment within the litmus test. It also outputs verification -# results to a file whose name is that of the specified litmus test, but -# with ".out" appended. -# -# If the --hw argument is specified, this script translates the .litmus -# C-language file to the specified type of assembly and verifies that. -# But in this case, litmus tests using complex synchronization (such as -# locking, RCU, and SRCU) are cheerfully ignored. +# Invokes runlitmus.sh and judgelitmus.sh on its arguments to run the +# specified litmus test and pass judgment on the results. # # Usage: # checklitmus.sh file.litmus @@ -22,47 +15,5 @@ # # Author: Paul E. McKenney -litmus=$1 -if test -f "$litmus" -a -r "$litmus" -then - : -else - echo ' --- ' error: \"$litmus\" is not a readable file - exit 255 -fi - -if test -z "$LKMM_HW_MAP_FILE" -then - # LKMM run - herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} - echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 -else - # Hardware run - - T=/tmp/checklitmushw.sh.$$ - trap 'rm -rf $T' 0 2 - mkdir $T - - # Generate filenames - catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" - mapfile="Linux2${LKMM_HW_MAP_FILE}.map" - themefile="$T/${LKMM_HW_MAP_FILE}.theme" - herdoptions="-model $LKMM_HW_CAT_FILE" - hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` - hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` - - # Don't run on litmus tests with complex synchronization - if ! 
scripts/simpletest.sh $litmus - then - echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU - exit 254 - fi - - # Generate the assembly code and run herd7 on it. - gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile - jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 -fi - -scripts/judgelitmus.sh $litmus +scripts/runlitmus.sh $1 +scripts/judgelitmus.sh $1 diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh new file mode 100755 index 000000000000..91af859c0e90 --- /dev/null +++ b/tools/memory-model/scripts/runlitmus.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Without the -hw argument, runs a herd7 test and outputs verification +# results to a file whose name is that of the specified litmus test, +# but with ".out" appended. +# +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# +# Either way, return the status of the herd7 command. +# +# Usage: +# runlitmus.sh file.litmus +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The caller is expected to have +# properly set up the LKMM environment variables. +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney + +litmus=$1 +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi + +if test -z "$LKMM_HW_MAP_FILE" +then + # LKMM run + herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} + echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +else + # Hardware run + + T=/tmp/checklitmushw.sh.$$ + trap 'rm -rf $T' 0 2 + mkdir $T + + # Generate filenames + catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" + mapfile="Linux2${LKMM_HW_MAP_FILE}.map" + themefile="$T/${LKMM_HW_MAP_FILE}.theme" + herdoptions="-model $LKMM_HW_CAT_FILE" + hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` + hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` + + # Don't run on litmus tests with complex synchronization + if ! scripts/simpletest.sh $litmus + then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 + fi + + # Generate the assembly code and run herd on it. + gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile + jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +fi + +exit $? -- cgit v1.2.1 From 3f15694e4087a27734b9526a3eabd0d7a38cab11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 16:41:41 -0700 Subject: tools/memory-model: Make runlitmus.sh generate .litmus.out for --hw In the absence of "Result:" comments, the runlitmus.sh script relies on litmus.out files from prior LKMM runs. This can be a bit user-hostile, so this commit makes runlitmus.sh generate any needed .litmus.out files that don't already exist. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/runlitmus.sh | 54 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 91af859c0e90..2865a9661b07 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -28,42 +28,48 @@ if test -f "$litmus" -a -r "$litmus" then : else - echo ' --- ' error: \"$litmus\" is not a readable file + echo ' !!! ' error: \"$litmus\" is not a readable file exit 255 fi -if test -z "$LKMM_HW_MAP_FILE" +if test -z "$LKMM_HW_MAP_FILE" -o ! -e $LKMM_DESTDIR/$litmus.out then # LKMM run herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 -else - # Hardware run + ret=$? + if test -z "$LKMM_HW_MAP_FILE" + then + exit $ret + fi + echo " --- " Automatically generated LKMM output for '"'--hw $LKMM_HW_MAP_FILE'"' run +fi - T=/tmp/checklitmushw.sh.$$ - trap 'rm -rf $T' 0 2 - mkdir $T +# Hardware run - # Generate filenames - catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" - mapfile="Linux2${LKMM_HW_MAP_FILE}.map" - themefile="$T/${LKMM_HW_MAP_FILE}.theme" - herdoptions="-model $LKMM_HW_CAT_FILE" - hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` - hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` +T=/tmp/checklitmushw.sh.$$ +trap 'rm -rf $T' 0 2 +mkdir $T - # Don't run on litmus tests with complex synchronization - if ! scripts/simpletest.sh $litmus - then - echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU - exit 254 - fi +# Generate filenames +catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" +mapfile="Linux2${LKMM_HW_MAP_FILE}.map" +themefile="$T/${LKMM_HW_MAP_FILE}.theme" +herdoptions="-model $LKMM_HW_CAT_FILE" +hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` +hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` - # Generate the assembly code and run herd on it. - gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile - jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +# Don't run on litmus tests with complex synchronization +if ! scripts/simpletest.sh $litmus +then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 fi +# Generate the assembly code and run herd7 on it. +gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile +jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 + exit $? -- cgit v1.2.1 From 082ab4692c45ff75cecf1ea3d521789c13187f9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 14:06:27 -0700 Subject: tools/memory-model: Move from .AArch64.litmus.out to .litmus.AArch.out When the github scripts see ".litmus.out", they assume that there must be a corresponding C-language ".litmus" file. Won't they be disappointed when they instead see nothing, or, worse yet, the corresponding assembly-language litmus test? This commit therefore swaps the hardware tag with the "litmus" to avoid this sort of disappointment. This commit also adjusts the .gitignore file so as to avoid adding these new ".out" files to git. 
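As a rough sketch of the resulting naming convention (the pathname and the choice of AArch64 are illustrative; the sed expression is the one the scripts use):

	litmus=litmus-tests/S+poonceonces.litmus
	LKMM_HW_MAP_FILE=AArch64
	hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`
	# hwlitmus is now litmus-tests/S+poonceonces.litmus.AArch64, and the
	# corresponding herd7 output goes to $LKMM_DESTDIR/$hwlitmus.out.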
[ paulmck: Apply Akira Yokosawa feedback. ] Signed-off-by: Paul E. McKenney --- tools/memory-model/litmus-tests/.gitignore | 2 +- tools/memory-model/scripts/judgelitmus.sh | 4 ++-- tools/memory-model/scripts/runlitmus.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore index 6e2ddc54152f..f47cb2045f13 100644 --- a/tools/memory-model/litmus-tests/.gitignore +++ b/tools/memory-model/litmus-tests/.gitignore @@ -1 +1 @@ -*.litmus.out +*.out diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index fe9131f8eb96..9abda72fe013 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -6,7 +6,7 @@ # test ran correctly. If the --hw argument is omitted, check against the # LKMM output, which is assumed to be in file.litmus.out. If this argument # is provided, this is assumed to be a hardware test, and the output is -# assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. +# assumed to be in file.litmus.HW.out, where "HW" is the --hw argument. # In addition, non-Sometimes verification results will be noted, but # forgiven. Furthermore, if there is no "Result:" comment but there is # an LKMM .litmus.out file, the observation in that file will be used @@ -37,7 +37,7 @@ then lkmmout= else litmusout="`echo $litmus | - sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" + sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`.out" lkmmout=$litmus.out fi if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 2865a9661b07..c84124b32bee 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -57,7 +57,7 @@ catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" mapfile="Linux2${LKMM_HW_MAP_FILE}.map" themefile="$T/${LKMM_HW_MAP_FILE}.theme" herdoptions="-model $LKMM_HW_CAT_FILE" -hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` +hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'` hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` # Don't run on litmus tests with complex synchronization -- cgit v1.2.1 From b9f66cb583e39ae8322722e96731a8fe5b388915 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 14:44:09 -0700 Subject: tools/memory-model: Keep assembly-language litmus tests This commit retains the assembly-language litmus tests generated from the C-language litmus tests, appending the hardware tag to the original C-language litmus test's filename. Thus, S+poonceonces.litmus.AArch64 contains the Armv8 assembly language corresponding to the C-language S+poonceonces.litmus test. This commit also updates the .gitignore to avoid committing these automatically generated assembly-language litmus tests. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/litmus-tests/.gitignore | 2 +- tools/memory-model/scripts/runlitmus.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore index f47cb2045f13..848e62d2a9b3 100644 --- a/tools/memory-model/litmus-tests/.gitignore +++ b/tools/memory-model/litmus-tests/.gitignore @@ -1 +1 @@ -*.out +*.litmus.* diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index c84124b32bee..62b47c7e1ba9 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -69,7 +69,7 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile -jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? -- cgit v1.2.1 From 276a1d50dd81e213011f8aca81becbdf40489ca5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Mar 2019 08:57:20 -0700 Subject: tools/memory-model: Allow herd to deduce CPU type Currently, the scripts specify the CPU's .cat file to herd. But this is pointless because herd will select a good and sufficient .cat file from the assembly-language litmus test itself. This commit therefore removes the -model argument to herd, allowing herd to figure the CPU family out itself. Note that the user can override herd's choice using the "--herdopts" argument to the scripts. Suggested-by: Luc Maranget Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 62b47c7e1ba9..afb196d7ef10 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -53,7 +53,6 @@ trap 'rm -rf $T' 0 2 mkdir $T # Generate filenames -catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" mapfile="Linux2${LKMM_HW_MAP_FILE}.map" themefile="$T/${LKMM_HW_MAP_FILE}.theme" herdoptions="-model $LKMM_HW_CAT_FILE" @@ -70,6 +69,6 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? -- cgit v1.2.1 From 1dd72050454f8528d472a1b35ae443696f13ef93 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Mar 2019 17:20:51 -0700 Subject: tools/memory-model: Make runlitmus.sh check for jingle errors It turns out that the jingle7 tool is currently a bit picky about the litmus tests it is willing to process. This commit therefore ensures that jingle7 failures are reported. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/runlitmus.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index afb196d7ef10..5f2d29b460ff 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -69,6 +69,11 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out +then + echo ' !!! ' jingle7 failed, no $hwlitmus generated + exit 253 +fi /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? -- cgit v1.2.1 From d2e6c738845c25cb65a70ab2e537793d54ad16e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Apr 2019 12:34:56 -0700 Subject: tools/memory-model: Add -v flag to jingle7 runs Adding the -v flag to jingle7 invocations gives much useful information on why jingle7 didn't like a given litmus test. This commit therefore adds this flag and saves off any such information into a .err file. Suggested-by: Luc Maranget Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 5f2d29b460ff..dfdb1f00fcc0 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -68,10 +68,11 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile -jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +jingle7 -v -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out then - echo ' !!! ' jingle7 failed, no $hwlitmus generated + echo ' !!! ' jingle7 failed, errors in $hwlitmus.err + cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err exit 253 fi /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 -- cgit v1.2.1 From 1aac9cca908f6d3495f14c37224ec459f065d406 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Mar 2019 17:18:43 -0700 Subject: tools/memory-model: Implement --hw support for checkghlitmus.sh This commits enables the "--hw" argument for the checkghlitmus.sh script, causing it to convert any applicable C-language litmus tests to the specified flavor of assembly language, to verify these assembly-language litmus tests, and checking compatibility of the outcomes. Note that the conversion does not yet handle locking, RCU, SRCU, plain C-language memory accesses, or casts. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkghlitmus.sh | 9 +++++---- tools/memory-model/scripts/hwfnseg.sh | 20 ++++++++++++++++++++ tools/memory-model/scripts/runlitmushist.sh | 27 +++++++++++++++++---------- 3 files changed, 42 insertions(+), 14 deletions(-) create mode 100755 tools/memory-model/scripts/hwfnseg.sh diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh index 6589fbb6f653..2ea220d2564b 100755 --- a/tools/memory-model/scripts/checkghlitmus.sh +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -10,6 +10,7 @@ # parseargs.sh scripts for arguments. . scripts/parseargs.sh +. 
scripts/hwfnseg.sh T=/tmp/checkghlitmus.sh.$$ trap 'rm -rf $T' 0 @@ -32,9 +33,9 @@ then ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Create a list of the C-language litmus tests previously run. -( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | - sed -e 's/\.out$//' | +# Create a list of the specified litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name "*.litmus${hwfnseg}.out" -print ) | + sed -e "s/${hwfnseg}"'\.out$//' | xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already @@ -44,7 +45,7 @@ find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short -# Form list of tests without corresponding .litmus.out files +# Form list of tests without corresponding .out files sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed # Run any needed tests. diff --git a/tools/memory-model/scripts/hwfnseg.sh b/tools/memory-model/scripts/hwfnseg.sh new file mode 100755 index 000000000000..580c3281181c --- /dev/null +++ b/tools/memory-model/scripts/hwfnseg.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Generate the hardware extension to the litmus-test filename, or the +# empty string if this is an LKMM run. The extension is placed in +# the shell variable hwfnseg. +# +# Usage: +# . hwfnseg.sh +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney + +if test -z "$LKMM_HW_MAP_FILE" +then + hwfnseg= +else + hwfnseg=".$LKMM_HW_MAP_FILE" +fi diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index 852786fef179..c6c2bdc67a50 100755 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -15,6 +15,8 @@ # # Author: Paul E. McKenney +. scripts/hwfnseg.sh + T=/tmp/runlitmushist.sh.$$ trap 'rm -rf $T' 0 mkdir $T @@ -30,15 +32,12 @@ fi # Prefixes for per-CPU scripts for ((i=0;i<$LKMM_JOBS;i++)) do - echo dir="$LKMM_DESTDIR" > $T/$i.sh echo T=$T >> $T/$i.sh - echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh cat << '___EOF___' >> $T/$i.sh runtest () { - echo ' ... ' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1' - if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1 + if scripts/runlitmus.sh $1 then - if ! grep -q '^Observation ' $dir/$1.out + if ! grep -q '^Observation ' $LKMM_DESTDIR/$1$2.out then echo ' !!! Herd failed, no Observation:' $1 fi @@ -47,10 +46,16 @@ do if test "$exitcode" -eq 124 then exitmsg="timed out" + elif test "$exitcode" -eq 253 + then + exitmsg= else exitmsg="failed, exit code $exitcode" fi - echo ' !!! Herd' ${exitmsg}: $1 + if test -n "$exitmsg" + then + echo ' !!! 
Herd' ${exitmsg}: $1 + fi fi } ___EOF___ @@ -59,11 +64,13 @@ done awk -v q="'" -v b='\\' ' { print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0 -}' | bash | -sort -k1n | -awk -v ncpu=$LKMM_JOBS -v t=$T ' +}' | sh | sort -k1n | +awk -v dq='"' -v hwfnseg="$hwfnseg" -v ncpu="$LKMM_JOBS" -v t="$T" ' { - print "runtest " $2 >> t "/" NR % ncpu ".sh"; + print "if test -z " dq hwfnseg dq " || scripts/simpletest.sh " dq $2 dq + print "then" + print "\techo runtest " dq $2 dq " " hwfnseg " >> " t "/" NR % ncpu ".sh"; + print "fi" } END { -- cgit v1.2.1 From 3db81939157a89189d9f94bd9045a010bae8aaa2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 11:47:14 -0700 Subject: tools/memory-model: Fix scripting --jobs argument The parseargs.sh regular expression for the --jobs argument incorrectly requires that the number of jobs be at least 10, that is, have at least two digits. This commit therefore adjusts this regular expression to allow single-digit numbers of jobs to be specified. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 5f016fc3f3af..25a81ac0dfdf 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -113,7 +113,7 @@ do LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`" ;; --jobs|--job|-j) - checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--' + checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]*$' '^--' LKMM_JOBS="$2" shift ;; -- cgit v1.2.1 From d72aebde6de59b063e81378afd86274f531fe968 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Apr 2019 09:27:28 -0700 Subject: tools/memory-model: Make checkghlitmus.sh use mselect7 The checkghlitmus.sh script currently uses grep to ignore non-C-language litmus tests, which is a bit fragile. This commit therefore enlists the aid of "mselect7 -arch C", given Luc Maraget's recent modifications that allow mselect7 to operate in filter mode. This change requires herdtools 7.52-32-g1da3e0e50977 or later. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkghlitmus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh index 2ea220d2564b..cedd0290b73f 100755 --- a/tools/memory-model/scripts/checkghlitmus.sh +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -41,7 +41,7 @@ fi # Create a list of C-language litmus tests with "Result:" commands and # no more than the specified number of processes. -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short -- cgit v1.2.1 From a80e47b8669da7b23c639b64387e0a99ffa84053 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Apr 2019 10:02:23 -0700 Subject: tools/memory-model: Make history-check scripts use mselect7 The history-check scripts currently use grep to ignore non-C-language litmus tests, which is a bit fragile. This commit therefore enlists the aid of "mselect7 -arch C", given Luc Maraget's recent modifications that allow mselect7 to operate in filter mode. 
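In filter mode, producing the list of C-language tests becomes a single pipeline of the following form (the output filename is simply the scripts' scratch file):

	find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C-all

which passes through only the C-language litmus tests, replacing the earlier per-file 'grep -l -m 1 "^C "' invocation.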
This change requires herdtools 7.52-32-g1da3e0e50977 or later. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/initlitmushist.sh | 2 +- tools/memory-model/scripts/newlitmushist.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh index 956b6957484d..31ea782955d3 100755 --- a/tools/memory-model/scripts/initlitmushist.sh +++ b/tools/memory-model/scripts/initlitmushist.sh @@ -60,7 +60,7 @@ fi # Create a list of the C-language litmus tests with no more than the # specified number of processes (per the --procs argument). -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short scripts/runlitmushist.sh < $T/list-C-short diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh index 3f4b06e29988..25235e2049cf 100755 --- a/tools/memory-model/scripts/newlitmushist.sh +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -43,7 +43,7 @@ fi # Form full list of litmus tests with no more than the specified # number of processes (per the --procs argument). -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C-all xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short # Form list of new tests. Note: This does not handle litmus-test deletion! -- cgit v1.2.1 From 4959d07db241a6a028fb01ab2e4637742050973b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 09:51:57 -0700 Subject: tools/memory-model: Add "--" to parseargs.sh for additional arguments Currently, parseargs.sh expects to consume all the command-line arguments, which prevents the calling script from having any of its own arguments. This commit therefore causes parseargs.sh to stop consuming arguments when it encounters a "--" argument, leaving any remaining arguments for the calling script. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 25a81ac0dfdf..7aa58755adfc 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -83,7 +83,7 @@ do echo "Cannot create directory --destdir '$LKMM_DESTDIR'" usage fi - if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" + if test -d "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" then : else @@ -127,6 +127,10 @@ do LKMM_TIMEOUT="$2" shift ;; + --) + shift + break + ;; *) echo Unknown argument $1 usage -- cgit v1.2.1 From 7bfb126843eae3d39ff2ab99f743660e10601baf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 10:03:29 -0700 Subject: tools/memory-model: Repair parseargs.sh header comment Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 7aa58755adfc..08ded5909860 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# the corresponding .litmus.out file, and does not judge the result. +# Parse arguments common to the various scripts. 
# # . scripts/parseargs.sh # -- cgit v1.2.1 From 461c078c85ca85ed4c3ea5c4cd93e3b4d396bcf5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 10:05:14 -0700 Subject: tools/memory-model: Add checktheselitmus.sh to run specified litmus tests This commit adds a checktheselitmus.sh script that runs the litmus tests specified on the command line. This is useful for verifying fixes to specific litmus tests. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/README | 8 +++++ tools/memory-model/scripts/checktheselitmus.sh | 43 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100755 tools/memory-model/scripts/checktheselitmus.sh diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 0e29a52044c1..cc2c4e5be9ec 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -27,6 +27,14 @@ checklitmushist.sh checklitmus.sh Check a single litmus test against its "Result:" expected result. + Not intended to for manual use. + +checktheselitmus.sh + + Check the specified list of litmus tests against their "Result:" + expected results. This takes optional parseargs.sh arguments, + followed by "--" followed by pathnames starting from the current + directory. cmplitmushist.sh diff --git a/tools/memory-model/scripts/checktheselitmus.sh b/tools/memory-model/scripts/checktheselitmus.sh new file mode 100755 index 000000000000..10eeb5ecea6d --- /dev/null +++ b/tools/memory-model/scripts/checktheselitmus.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Invokes checklitmus.sh on its arguments to run the specified litmus +# test and pass judgment on the results. +# +# Usage: +# checktheselitmus.sh -- [ file1.litmus [ file2.litmus ... ] ] +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The usual parseargs.sh arguments +# can be specified prior to the "--". +# +# This script is intended for use with pathnames that start from the +# tools/memory-model directory. If some of the pathnames instead start at +# the root directory, they all must do so and the "--destdir /" parseargs.sh +# argument must be specified prior to the "--". Alternatively, some other +# "--destdir" argument can be supplied as long as the needed subdirectories +# are populated. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney + +. scripts/parseargs.sh + +ret=0 +for i in "$@" +do + if scripts/checklitmus.sh $i + then + : + else + ret=1 + fi +done +if test "$ret" -ne 0 +then + echo " ^^^ VERIFICATION MISMATCHES" 1>&2 +else + echo All litmus tests verified as was expected. 1>&2 +fi +exit $ret -- cgit v1.2.1 From 232290bb8b670c66d48e83609894a470ac97c8aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 May 2019 07:34:20 -0700 Subject: tools/memory-model: Add data-race capabilities to judgelitmus.sh This commit adds functionality to judgelitmus.sh to allow it to handle both the "DATARACE" markers in the "Result:" comments in litmus tests and the "Flag data-race" markers in LKMM output. For C-language tests, if either marker is present, the other must also be as well, at least for litmus tests having a "Result:" comment. If the LKMM output indicates a data race, then failures of the Always/Sometimes/Never portion of the "Result:" prediction are forgiven. 
The reason for forgiving "Result:" mispredictions is that data races can result in "interesting" compiler optimizations, so that all bets are off in the data-race case. [ paulmck: Apply Akira Yokosawa feedback. ] Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 40 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 9abda72fe013..2700481d20f0 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -4,13 +4,19 @@ # Given a .litmus test and the corresponding litmus output file, check # the .litmus.out file against the "Result:" comment to judge whether the # test ran correctly. If the --hw argument is omitted, check against the -# LKMM output, which is assumed to be in file.litmus.out. If this argument -# is provided, this is assumed to be a hardware test, and the output is -# assumed to be in file.litmus.HW.out, where "HW" is the --hw argument. -# In addition, non-Sometimes verification results will be noted, but -# forgiven. Furthermore, if there is no "Result:" comment but there is -# an LKMM .litmus.out file, the observation in that file will be used -# to judge the assembly-language verification. +# LKMM output, which is assumed to be in file.litmus.out. If either a +# "DATARACE" marker in the "Result:" comment or a "Flag data-race" marker +# in the LKMM output is present, the other must also be as well, at least +# for litmus tests having a "Result:" comment. In this case, a failure of +# the Always/Sometimes/Never portion of the "Result:" prediction will be +# noted, but forgiven. +# +# If the --hw argument is provided, this is assumed to be a hardware +# test, and the output is assumed to be in file.litmus.HW.out, where +# "HW" is the --hw argument. In addition, non-Sometimes verification +# results will be noted, but forgiven. Furthermore, if there is no +# "Result:" comment but there is an LKMM .litmus.out file, the observation +# in that file will be used to judge the assembly-language verification. # # Usage: # judgelitmus.sh file.litmus @@ -47,9 +53,27 @@ else echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file exit 255 fi +if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout" +then + datarace_modeled=1 +fi if grep -q '^ \* Result: ' $litmus then outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` + if grep -m1 '^ \* Result: .* DATARACE' $litmus + then + datarace_predicted=1 + fi + if test -n "$datarace_predicted" -a -z "$datarace_modeled" -a -z "$LKMM_HW_MAP_FILE" + then + echo '!!! Predicted data race not modeled' $litmus + exit 252 + elif test -z "$datarace_predicted" -a -n "$datarace_modeled" + then + # Note that hardware models currently don't model data races + echo '!!! Unexpected data race modeled' $litmus + exit 253 + fi elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1 then outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'` @@ -114,7 +138,7 @@ elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$o then ret=0 else - if test -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes + if test \( -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes \) -o -n "$datarace_modeled" then flag="--- Forgiven" ret=0 -- cgit v1.2.1 From ada345f91158db06b6a4a47a46b2f4ee7d6ac742 Mon Sep 17 00:00:00 2001 From: "Paul E. 
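As a generic illustration of the idiom (a sketch only, not part of the patch; the diff below is authoritative), a locked updater and a lockless reader of such a field pair up as follows:

	/* Updater: ->lock held, but mark the store to prevent store tearing. */
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/* Lockless reader (for example, the stall-warning path): mark each load. */
	if (!(READ_ONCE(rnp->expmask) & mask))
		continue;	/* Bit already clear, so this CPU is not blocking the GP. */
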
McKenney" Date: Thu, 6 Jun 2019 02:13:27 -0700 Subject: tools/memory-model: Make judgelitmus.sh handle scripted Result: tag The scripts that generate the litmus tests in the "auto" directory of the https://github.com/paulmckrcu/litmus archive place the "Result:" tag into a single-line ocaml comment, which judgelitmus.sh currently does not recognize. This commit therefore makes judgelitmus.sh recognize both the multiline comment format that it currently does and the automatically generated single-line format. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 2700481d20f0..1ec5d89fcfbb 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -57,10 +57,10 @@ if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout" then datarace_modeled=1 fi -if grep -q '^ \* Result: ' $litmus +if grep -q '^[( ]\* Result: ' $litmus then - outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` - if grep -m1 '^ \* Result: .* DATARACE' $litmus + outcome=`grep -m 1 '^[( ]\* Result: ' $litmus | awk '{ print $3 }'` + if grep -m1 '^[( ]\* Result: .* DATARACE' $litmus then datarace_predicted=1 fi -- cgit v1.2.1 From 5887751baba7ce39d88b154d7c1a4d53f4608509 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jun 2019 22:30:32 -0700 Subject: tools/memory-model: Use "-unroll 0" to keep --hw runs finite Litmus tests involving atomic operations produce LL/SC loops on a number of architectures, and unrolling these loops can result in excessive verification times or even stack overflows. This commit therefore uses the "-unroll 0" herd7 argument to avoid unrolling, on the grounds that additional passes through an LL/SC loop should not change the verification. Note however, that certain bugs in the mapping of the LL/SC loop to machine instructions may go undetected. On the other hand, herd7 might not be the best vehicle for finding such bugs in any case. (You do stress-test your architecture-specific code, don't you?) Suggested-by: Luc Maranget Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index dfdb1f00fcc0..94608d4b6502 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -75,6 +75,6 @@ then cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err exit 253 fi -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -unroll 0 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? -- cgit v1.2.1 From 15c7c972cd26d89a26788e609c53b5a465324a6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Oct 2019 18:53:18 -0700 Subject: rcu: Use *_ONCE() to protect lockless ->expmask accesses The rcu_node structure's ->expmask field is accessed locklessly when starting a new expedited grace period and when reporting an expedited RCU CPU stall warning. This commit therefore handles the former by taking a snapshot of ->expmask while the lock is held and the latter by applying READ_ONCE() to lockless reads and WRITE_ONCE() to the corresponding updates. 
Link: https://lore.kernel.org/lkml/CANpmjNNmSOagbTpffHr4=Yedckx9Rm2NuGqC9UqE+AOz5f1-ZQ@mail.gmail.com Reported-by: syzbot+134336b86f728d6e55a0@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..69c5aa64fcfd 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void) rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; + WRITE_ONCE(rnp->expmask, rnp->expmaskinit); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); } } @@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -372,12 +372,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (!(mask_ofl_ipi & mask)) - continue; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; @@ -491,7 +489,7 @@ static void synchronize_sched_expedited_wait(void) struct rcu_data *rdp; mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); @@ -503,7 +501,8 @@ static void synchronize_sched_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + READ_ONCE(rnp_root->expmask), + ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -513,7 +512,7 @@ static void synchronize_sched_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, + READ_ONCE(rnp->expmask), ".T"[!!rnp->exp_tasks]); } pr_cont("\n"); @@ -521,7 +520,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; dump_cpu_task(cpu); } -- cgit v1.2.1 From 9f08cf088676c12a5b53bd5a29cf04f00c787b5d Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 8 Oct 2019 13:01:40 +0800 Subject: rcu: Avoid modifying mask_ofl_ipi in sync_rcu_exp_select_node_cpus() The "mask_ofl_ipi" is used to track which CPUs get IPIed, however in the IPI sending loop, "mask_ofl_ipi" along with another variable "mask_ofl_test" might also get modified to record which CPUs' quiesent states must be reported by the sync_rcu_exp_select_node_cpus() at the end 
of sync_rcu_exp_select_node_cpus(). This overlap of roles can be confusing, so this patch cleans things a little by using "mask_ofl_ipi" solely for determining which CPUs must be IPIed and "mask_ofl_test" for solely determining on behalf of which CPUs sync_rcu_exp_select_node_cpus() must report a quiscent state. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 69c5aa64fcfd..6a6f328a5f52 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -387,10 +387,10 @@ retry_ipi: } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); put_cpu(); - if (!ret) { - mask_ofl_ipi &= ~mask; + /* The CPU will report the QS in response to the IPI. */ + if (!ret) continue; - } + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rnp->qsmaskinitnext & mask) && @@ -401,13 +401,12 @@ retry_ipi: schedule_timeout_uninterruptible(1); goto retry_ipi; } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + /* CPU really is offline, so we must report its QS. */ + if (rnp->expmask & mask) + mask_ofl_test |= mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } -- cgit v1.2.1 From 6cf539a87a61a4fbc43f625267dbcbcf283872ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 9 Oct 2019 17:57:43 +0200 Subject: rcu: Fix data-race due to atomic_t copy-by-value This fixes a data-race where `atomic_t dynticks` is copied by value. The copy is performed non-atomically, resulting in a data-race if `dynticks` is updated concurrently. This data-race was found with KCSAN: ================================================================== BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3: atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline] rcu_dynticks_snap kernel/rcu/tree.c:310 [inline] dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline] rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799 kthread+0x1b5/0x200 kernel/kthread.c:255 read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7: rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline] rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870 irq_enter+0x5/0x50 kernel/softirq.c:347 Reported by Kernel Concurrency Sanitizer on: CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Workqueue: kblockd blk_mq_run_work_fn ================================================================== Signed-off-by: Marco Elver Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Ingo Molnar Cc: Dmitry Vyukov Cc: rcu@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 4 ++-- kernel/rcu/tree.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 66122602bd08..697e2c0624dc 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -449,7 +449,7 @@ TRACE_EVENT_RCU(rcu_fqs, */ TRACE_EVENT_RCU(rcu_dyntick, - TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks), + TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks), TP_ARGS(polarity, oldnesting, newnesting, dynticks), @@ -464,7 +464,7 @@ TRACE_EVENT_RCU(rcu_dyntick, __entry->polarity = polarity; __entry->oldnesting = oldnesting; __entry->newnesting = newnesting; - __entry->dynticks = atomic_read(&dynticks); + __entry->dynticks = dynticks; ), TP_printk("%s %lx %lx %#3x", __entry->polarity, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..6145e08a1407 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); -- cgit v1.2.1 From aca2991a25da03ca96127b1d21e1f4aba41f81a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 06:51:57 -0700 Subject: rcu: Substitute lookup for bit-twiddling in sync_rcu_exp_select_node_cpus() The code in sync_rcu_exp_select_node_cpus() calculates the current CPU's mask within its rcu_node structure's bitmasks, but this has already been computed in the ->grpmask field of that CPU's rcu_data structure. 
This commit therefore just uses this ->grpmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6a6f328a5f52..3b59c3ee42e5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -345,8 +345,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; int snap; if (raw_smp_processor_id() == cpu || @@ -373,8 +373,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { -- cgit v1.2.1 From fd6bc19d7676a060a171d1cf3dcbf6fd797eb05f Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 03:17:07 +0000 Subject: rcu: Fix missed wakeup of exp_wq waiters Tasks waiting within exp_funnel_lock() for an expedited grace period to elapse can be starved due to the following sequence of events: 1. Tasks A and B both attempt to start an expedited grace period at about the same time. This grace period will have completed when the lower four bits of the rcu_state structure's ->expedited_sequence field are 0b'0100', for example, when the initial value of this counter is zero. Task A wins, and thus does the actual work of starting the grace period, including acquiring the rcu_state structure's .exp_mutex and sets the counter to 0b'0001'. 2. Because task B lost the race to start the grace period, it waits on ->expedited_sequence to reach 0b'0100' inside of exp_funnel_lock(). This task therefore blocks on the rcu_node structure's ->exp_wq[1] field, keeping in mind that the end-of-grace-period value of ->expedited_sequence (0b'0100') is shifted down two bits before indexing the ->exp_wq[] field. 3. Task C attempts to start another expedited grace period, but blocks on ->exp_mutex, which is still held by Task A. 4. The aforementioned expedited grace period completes, so that ->expedited_sequence now has the value 0b'0100'. A kworker task therefore acquires the rcu_state structure's ->exp_wake_mutex and starts awakening any tasks waiting for this grace period. 5. One of the first tasks awakened happens to be Task A. Task A therefore releases the rcu_state structure's ->exp_mutex, which allows Task C to start the next expedited grace period, which causes the lower four bits of the rcu_state structure's ->expedited_sequence field to become 0b'0101'. 6. Task C's expedited grace period completes, so that the lower four bits of the rcu_state structure's ->expedited_sequence field now become 0b'1000'. 7. The kworker task from step 4 above continues its wakeups. Unfortunately, the wake_up_all() refetches the rcu_state structure's .expedited_sequence field: wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); This results in the wakeup being applied to the rcu_node structure's ->exp_wq[2] field, which is unfortunate given that Task B is instead waiting on ->exp_wq[1]. 
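To make the wait-queue indexing concrete (a worked example assuming the usual rcu_seq encoding, in which the low-order two bits carry grace-period state and rcu_seq_ctr() shifts them away):

	end value awaited by Task B:       s = 0b'0100'
	index Task B sleeps on:            (0b'0100' >> 2) & 0x3 = 1  ->  ->exp_wq[1]
	refetched ->expedited_sequence:    0b'1000'
	index actually awakened:           (0b'1000' >> 2) & 0x3 = 2  ->  ->exp_wq[2]
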
On a busy system, no harm is done (or at least no permanent harm is done). Some later expedited grace period will redo the wakeup. But on a quiet system, such as many embedded systems, it might be a good long time before there was another expedited grace period. On such embedded systems, this situation could therefore result in a system hang. This issue manifested as DPM device timeout during suspend (which usually qualifies as a quiet time) due to a SCSI device being stuck in _synchronize_rcu_expedited(), with the following stack trace: schedule() synchronize_rcu_expedited() synchronize_rcu() scsi_device_quiesce() scsi_bus_suspend() dpm_run_callback() __device_suspend() This commit therefore prevents such delays, timeouts, and hangs by making rcu_exp_wait_wake() use its "s" argument consistently instead of refetching from rcu_state.expedited_sequence. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b59c3ee42e5..fa143e40cd93 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -557,7 +557,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); mutex_unlock(&rcu_state.exp_wake_mutex); -- cgit v1.2.1 From 4bc6b745e5cbefed92c48071e28a5f41246d0470 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 11:50:52 -0800 Subject: rcu: Allow only one expedited GP to run concurrently with wakeups The current expedited RCU grace-period code expects that a task requesting an expedited grace period cannot awaken until that grace period has reached the wakeup phase. However, it is possible for a long preemption to result in the waiting task never sleeping. For example, consider the following sequence of events: 1. Task A starts an expedited grace period by invoking synchronize_rcu_expedited(). It proceeds normally up to the wait_event() near the end of that function, and is then preempted (or interrupted or whatever). 2. The expedited grace period completes, and a kworker task starts the awaken phase, having incremented the counter and acquired the rcu_state structure's .exp_wake_mutex. This kworker task is then preempted or interrupted or whatever. 3. Task A resumes and enters wait_event(), which notes that the expedited grace period has completed, and thus doesn't sleep. 4. Task B starts an expedited grace period exactly as did Task A, complete with the preemption (or whatever delay) just before the call to wait_event(). 5. The expedited grace period completes, and another kworker task starts the awaken phase, having incremented the counter. However, it blocks when attempting to acquire the rcu_state structure's .exp_wake_mutex because step 2's kworker task has not yet released it. 6. Steps 4 and 5 repeat, resulting in overflow of the rcu_node structure's ->exp_wq[] array. In theory, this is harmless. Tasks waiting on the various ->exp_wq[] array will just be spuriously awakened, but they will just sleep again on noting that the rcu_state structure's ->expedited_sequence value has not advanced far enough. In practice, this wastes CPU time and is an accident waiting to happen. 
This commit therefore moves the rcu_exp_gp_seq_end() call that officially ends the expedited grace period (along with associate tracing) until after the ->exp_wake_mutex has been acquired. This prevents Task A from awakening prematurely, thus preventing more than one expedited grace period from being in flight during a previous expedited grace period's wakeup phase. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay [ paulmck: Added updated comment. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fa143e40cd93..7a1f09376e62 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -539,14 +539,13 @@ static void rcu_exp_wait_wake(unsigned long s) struct rcu_node *rnp; synchronize_sched_expedited_wait(); - rcu_exp_gp_seq_end(); - trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ + // Switch over to wakeup mode, allowing the next GP to proceed. + // End the previous grace period only after acquiring the mutex + // to ensure that only one GP runs concurrently with wakeups. mutex_lock(&rcu_state.exp_wake_mutex); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { -- cgit v1.2.1 From 6c7d7dbf5b7f965eda0d39fbbb8fee005b08f340 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 13:59:37 -0800 Subject: rcu: Rename sync_rcu_preempt_exp_done() to sync_rcu_exp_done() Now that the RCU flavors have been consolidated, there is one common function for checking to see if an expedited RCU grace period has completed, namely sync_rcu_preempt_exp_done(). Because this function is no longer specific to RCU-preempt, this commit removes the "_preempt" from its name. This commit also changes sync_rcu_preempt_exp_done_unlocked() to sync_rcu_exp_done_unlocked() for the same reason. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 19 +++++++++---------- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 7a1f09376e62..3923c0743c3e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -148,7 +148,7 @@ static void __maybe_unused sync_exp_reset_tree(void) * * Caller must hold the specificed rcu_node structure's ->lock */ -static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); @@ -157,17 +157,16 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) } /* - * Like sync_rcu_preempt_exp_done(), but this function assumes the caller - * doesn't hold the rcu_node's ->lock, and will acquire and release the lock - * itself + * Like sync_rcu_exp_done(), but this function assumes the caller doesn't + * hold the rcu_node's ->lock, and will acquire and release the lock itself */ -static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { unsigned long flags; bool ret; raw_spin_lock_irqsave_rcu_node(rnp, flags); - ret = sync_rcu_preempt_exp_done(rnp); + ret = sync_rcu_exp_done(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ret; @@ -191,7 +190,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, unsigned long mask; for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { + if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else @@ -471,9 +470,9 @@ static void synchronize_sched_expedited_wait(void) for (;;) { ret = swait_event_timeout_exclusive( rcu_state.expedited_wq, - sync_rcu_preempt_exp_done_unlocked(rnp_root), + sync_rcu_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) + if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -507,7 +506,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done_unlocked(rnp)) + if (sync_rcu_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..6dbea4bcf065 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -485,7 +485,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); - empty_exp = sync_rcu_preempt_exp_done(rnp); + empty_exp = sync_rcu_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -509,7 +509,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = sync_rcu_preempt_exp_done(rnp); + empty_exp_now = sync_rcu_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gp_seq, -- cgit v1.2.1 From de8cd0a533bfb57ff4ec6c85e3bdca013a5adcb7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 27 Nov 2019 14:20:41 -0800 Subject: rcu: Update tree_exp.h function-header comments The function-header comments in kernel/rcu/tree_exp.h have gotten a bit out of date, so this commit updates a number of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3923c0743c3e..1eafbcd56679 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void) } /* - * Return then value that expedited-grace-period counter will have + * Return the value that the expedited-grace-period counter will have * at the end of the current grace period. */ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) @@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void) } /* - * Take a snapshot of the expedited-grace-period counter. + * Take a snapshot of the expedited-grace-period counter, which is the + * earliest value that will indicate that a full grace period has + * elapsed since the current time. */ static unsigned long rcu_exp_gp_seq_snap(void) { @@ -143,22 +145,18 @@ static void __maybe_unused sync_exp_reset_tree(void) * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the specificed rcu_node structure's ->lock + * for the current expedited grace period. */ static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* - * Like sync_rcu_exp_done(), but this function assumes the caller doesn't - * hold the rcu_node's ->lock, and will acquire and release the lock itself + * Like sync_rcu_exp_done(), but where the caller does not hold the + * rcu_node's ->lock. */ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { @@ -180,8 +178,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) * which the task was queued or to one of that rcu_node structure's ancestors, * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) - * - * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) @@ -189,6 +185,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, { unsigned long mask; + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) @@ -452,6 +449,10 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, issuing any needed + * RCU CPU stall warnings along the way. + */ static void synchronize_sched_expedited_wait(void) { int cpu; @@ -781,7 +782,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() + * your code to batch your updates, and then use a single synchronize_rcu() * instead. 
* * This has the same semantics as (but is more brutal than) synchronize_rcu(). -- cgit v1.2.1 From 28f0361fdfab267a392cd6a6401446c9ea64de95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 14:24:58 -0800 Subject: rcu: Replace synchronize_sched_expedited_wait() "_sched" with "_rcu" After RCU flavor consolidation, synchronize_sched_expedited_wait() does both RCU-preempt and RCU-sched, whichever happens to have been built into the running kernel. This commit therefore changes this function's name to synchronize_rcu_expedited_wait() to reflect its new generic nature. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1eafbcd56679..081a17942e57 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -453,7 +453,7 @@ static void sync_rcu_exp_select_cpus(void) * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ -static void synchronize_sched_expedited_wait(void) +static void synchronize_rcu_expedited_wait(void) { int cpu; unsigned long jiffies_stall; @@ -538,7 +538,7 @@ static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(); + synchronize_rcu_expedited_wait(); // Switch over to wakeup mode, allowing the next GP to proceed. // End the previous grace period only after acquiring the mutex -- cgit v1.2.1 From df1e849ae4559544ff00ff5052eefe2479750539 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 16:36:45 -0800 Subject: rcu: Enable tick for nohz_full CPUs slow to provide expedited QS An expedited grace period can be stalled by a nohz_full CPU looping in kernel context. This possibility is currently handled by some carefully crafted checks in rcu_read_unlock_special() that enlist help from ksoftirqd when permitted by the scheduler. However, it is exactly these checks that require the scheduler avoid holding any of its rq or pi locks across rcu_read_unlock() without also having held them across the entire RCU read-side critical section. It would therefore be very nice if expedited grace periods could handle nohz_full CPUs looping in kernel context without such checks. This commit therefore adds code to the expedited grace period's wait and cleanup code that forces the scheduler-clock interrupt on for CPUs that fail to quickly supply a quiescent state. "Quickly" is currently a hard-coded single-jiffy delay. Signed-off-by: Paul E. 
McKenney --- include/linux/tick.h | 5 ++++- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 7896f792d3b0..7340613c7eff 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -109,8 +109,10 @@ enum tick_dep_bits { TICK_DEP_BIT_PERF_EVENTS = 1, TICK_DEP_BIT_SCHED = 2, TICK_DEP_BIT_CLOCK_UNSTABLE = 3, - TICK_DEP_BIT_RCU = 4 + TICK_DEP_BIT_RCU = 4, + TICK_DEP_BIT_RCU_EXP = 5 }; +#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP #define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) @@ -118,6 +120,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) +#define TICK_DEP_MASK_RCU_EXP (1 << TICK_DEP_BIT_RCU_EXP) #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..f9253ed406ba 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -182,6 +182,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 081a17942e57..30b2a02aef39 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { + int cpu; unsigned long flags; + struct rcu_data *rdp; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { @@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, return; } WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = false; + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, within time limit. + * If the time limit is exceeded without the grace period elapsing, + * return false, otherwise return true. + */ +static bool synchronize_rcu_expedited_wait_once(long tlimit) +{ + int t; + struct rcu_node *rnp_root = rcu_get_root(); + + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, + sync_rcu_exp_done_unlocked(rnp_root), + tlimit); + // Workqueues should not be signaled. + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + return true; + WARN_ON(t < 0); /* workqueues should not be signaled. */ + return false; +} + /* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. 
@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_start; unsigned long mask; int ndetected; + struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); - int ret; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (synchronize_rcu_expedited_wait_once(1)) + return; + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = true; + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } + } + WARN_ON_ONCE(1); + } for (;;) { - ret = swait_event_timeout_exclusive( - rcu_state.expedited_wq, - sync_rcu_exp_done_unlocked(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) continue; panic_on_rcu_stall(); -- cgit v1.2.1 From a3246cafecaaf7dac3f4e17a2bdfe0b504cbf682 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 5 Aug 2019 18:22:27 -0400 Subject: rcu: Add basic support for kfree_rcu() batching Recently a discussion about stability and performance of a system involving a high rate of kfree_rcu() calls surfaced on the list [1] which led to another discussion how to prepare for this situation. This patch adds basic batching support for kfree_rcu(). It is "basic" because we do none of the slab management, dynamic allocation, code moving or any of the other things, some of which previous attempts did [2]. These fancier improvements can be follow-up patches and there are different ideas being discussed in those regards. This is an effort to start simple, and build up from there. In the future, an extension to use kfree_bulk and possibly per-slab batching could be done to further improve performance due to cache-locality and slab-specific bulk free optimizations. By using an array of pointers, the worker thread processing the work would need to read lesser data since it does not need to deal with large rcu_head(s) any longer. Torture tests follow in the next patch and show improvements of around 5x reduction in number of grace periods on a 16 CPU system. More details and test data are in that patch. There is an implication with rcu_barrier() with this patch. Since the kfree_rcu() calls can be batched, and may not be handed yet to the RCU machinery in fact, the monitor may not have even run yet to do the queue_rcu_work(), there seems no easy way of implementing rcu_barrier() to wait for those kfree_rcu()s that are already made. So this means a kfree_rcu() followed by an rcu_barrier() does not imply that memory will be freed once rcu_barrier() returns. Another implication is higher active memory usage (although not run-away..) until the kfree_rcu() flooding ends, in comparison to without batching. More details about this are in the second patch which adds an rcuperf test. Finally, in the near future we will get rid of kfree_rcu() special casing within RCU such as in rcu_do_batch and switch everything to just batching. Currently we don't do that since timer subsystem is not yet up and we cannot schedule the kfree_rcu() monitor as the timer subsystem's lock are not initialized. That would also mean getting rid of kfree_call_rcu_nobatch() entirely. 
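For context, a minimal caller-side sketch (illustrative only; the structure and field names below are made up) of the API whose requests this patch batches:

	struct foo {
		int data;
		struct rcu_head rh;	/* rcu_head embedded in the object */
	};

	static void release_foo(struct foo *fp)
	{
		/*
		 * Queue fp for kfree() after a grace period.  With this
		 * patch, such requests accumulate per CPU and each batch
		 * is handed to RCU at most once every KFREE_DRAIN_JIFFIES,
		 * greatly reducing the number of grace periods under heavy
		 * kfree_rcu() load.
		 */
		kfree_rcu(fp, rh);
	}
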
[1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-mst@kernel.org [2] https://lkml.org/lkml/2017/12/19/824 Cc: kernel-team@android.com Cc: kernel-team@lge.com Co-developed-by: Byungchul Park Signed-off-by: Byungchul Park Signed-off-by: Joel Fernandes (Google) [ paulmck: Applied 0day and Paul Walmsley feedback on ->monitor_todo. ] [ paulmck: Make it work during early boot. ] [ paulmck: Add a crude early boot self-test. ] [ paulmck: Style adjustments and experimental docbook structure header. ] Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.9999.1908161931110.32497@viisi.sifive.com/T/#me9956f66cb611b95d26ae92700e1d901f46e8c59 Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 ++ include/linux/rcutree.h | 2 + kernel/rcu/tree.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/update.c | 10 +++ 4 files changed, 206 insertions(+), 6 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 37b6f0c2b79d..1bd166aab6f3 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -39,6 +39,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) call_rcu(head, func); } +static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) +{ + call_rcu(head, func); +} + void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -85,6 +90,7 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } +static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c5147de885ec..6a65d3a16dbd 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,10 +34,12 @@ static inline void rcu_virt_note_context_switch(int cpu) void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); +void kfree_rcu_scheduler_running(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..0af016fdbf19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2683,19 +2683,187 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. 
Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_work rcu_work; + struct rcu_head *head; + struct rcu_head *head_free; + spinlock_t lock; + struct delayed_work monitor_work; + int monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + + krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + spin_lock_irqsave(&krcp->lock, flags); + head = krcp->head_free; + krcp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + next = head->next; + // Potentially optimize with kfree_bulk in future. + __rcu_reclaim(rcu_state.name, head); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + lockdep_assert_held(&krcp->lock); + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (krcp->head_free) + return false; + + krcp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krcp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + if (!xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (xchg(&krcp->monitor_todo, false)) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. + * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). + */ +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, 1); } +EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); + +/* + * Queue a request for lazy invocation of kfree() after a grace period. 
+ * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). + */ +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + head->func = func; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); +} EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPT. 
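
As an illustrative aside (not part of the patch; "struct foo" and foo_release() below are made-up names), the caller-side pattern that this batching optimizes is unchanged -- callers keep using kfree_rcu() exactly as before, only the implementation behind it now queues onto the per-CPU krc batch:

	struct foo {
		int data;
		struct rcu_head rh;	/* storage consumed by kfree_rcu() */
	};

	static void foo_release(struct foo *fp)
	{
		/*
		 * Queues fp on the per-CPU batch; after a later grace
		 * period the kfree_rcu_work() handler kfree()s the whole
		 * batch, rather than running one RCU callback per object.
		 */
		kfree_rcu(fp, rh);
	}
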
@@ -3557,12 +3725,26 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..196487762b96 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -218,6 +219,7 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } @@ -853,14 +855,22 @@ static void test_callback(struct rcu_head *r) DEFINE_STATIC_SRCU(early_srcu); +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; + static void early_boot_test_call_rcu(void) { static struct rcu_head head; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); if (IS_ENABLED(CONFIG_SRCU)) call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) -- cgit v1.2.1 From be7188c96444173a4ada001638c199d10e9c39cd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:29 -0400 Subject: rcuperf: Add kfree_rcu() performance Tests This test runs kfree_rcu() in a loop to measure performance of the new kfree_rcu() batching functionality. The following table shows results when booting with arguments: rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000 rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X rcuperf.kfree_no_batch=X # Grace Periods Test Duration (s) X=1 (old behavior) 9133 11.5 X=0 (new behavior) 1732 12.5 On a 16 CPU system with the above boot parameters, we see that the total number of grace periods that elapse during the test drops from 9133 when not batching to 1732 when batching (a 5X improvement). The kfree_rcu() flood itself slows down a bit when batching, though, as shown. Note that the active memory consumption during the kfree_rcu() flood does increase to around 200-250MB due to the batching (from around 50MB without batching). However, this memory consumption is relatively constant. In other words, the system is able to keep up with the kfree_rcu() load. The memory consumption comes down considerably if KFREE_DRAIN_JIFFIES is increased from HZ/50 to HZ/80. A later patch will reduce memory consumption further by using multiple lists. Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and CONFIG_PROVE_RCU for realistic comparisons with/without batching. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 17 +++ kernel/rcu/rcuperf.c | 181 ++++++++++++++++++++++-- 2 files changed, 190 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ade4e6ec23e0..3ce270b56f3a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3978,6 +3978,23 @@ test until boot completes in order to avoid interference. + rcuperf.kfree_rcu_test= [KNL] + Set to measure performance of kfree_rcu() flooding. + + rcuperf.kfree_nthreads= [KNL] + The number of threads running loops of kfree_rcu(). + + rcuperf.kfree_alloc_num= [KNL] + Number of allocations and frees done in an iteration. + + rcuperf.kfree_loops= [KNL] + Number of loops doing rcuperf.kfree_alloc_num number + of allocations and frees. + + rcuperf.kfree_no_batch= [KNL] + Use the non-batching (less efficient) version of kfree_rcu(). + This is useful for comparing with the batched version. + rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5f884d560384..c1e25fd10f2a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished; static wait_queue_head_t shutdown_wq; static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_perf_writer_started; -static unsigned long b_rcu_perf_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 @@ -378,10 +379,10 @@ rcu_perf_writer(void *arg) if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; if (gp_exp) { - b_rcu_perf_writer_started = + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = cur_ops->get_gp_seq(); + b_rcu_gp_test_started = cur_ops->get_gp_seq(); } } @@ -429,10 +430,10 @@ retry: PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); } if (shutdown) { @@ -515,8 +516,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - rcuperf_seq_diff(b_rcu_perf_writer_finished, - b_rcu_perf_writer_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -584,6 +585,167 @@ rcu_perf_shutdown(void *arg) return -EINVAL; } +/* + * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. 
+ */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_perf_thread_started; +static atomic_t n_kfree_perf_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_perf_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + if (!kfree_no_batch) { + kfree_rcu(alloc_ptr, rh); + } else { + rcu_callback_t cb; + + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); + kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); + } + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_perf_thread"); + return 0; +} + +static void +kfree_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_perf_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= + kfree_nrealthreads); + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_perf_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. 
*/ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_perf_cleanup(); + return firsterr; +} + static int __init rcu_perf_init(void) { @@ -616,6 +778,9 @@ rcu_perf_init(void) if (cur_ops->init) cur_ops->init(); + if (kfree_rcu_test) + return kfree_perf_init(); + nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); atomic_set(&n_rcu_perf_reader_started, 0); -- cgit v1.2.1 From 502b7d8b111b16a1d8eb090854712ee07b00fff3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 22 Sep 2019 10:49:57 -0700 Subject: rcu: Make kfree_rcu() use a non-atomic ->monitor_todo Because the ->monitor_todo field is always protected by krcp->lock, this commit downgrades from xchg() to non-atomic unmarked assignment statements. Signed-off-by: Joel Fernandes [ paulmck: Update to include early-boot kick code. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0af016fdbf19..6106b9e0b5fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2708,7 +2708,7 @@ struct kfree_rcu_cpu { struct rcu_head *head_free; spinlock_t lock; struct delayed_work monitor_work; - int monitor_todo; + bool monitor_todo; bool initialized; }; @@ -2765,6 +2765,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { // Attempt to start a new batch. + krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. spin_unlock_irqrestore(&krcp->lock, flags); @@ -2772,8 +2773,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, } // Previous RCU batch still in progress, try again later. - if (!xchg(&krcp->monitor_todo, true)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } @@ -2788,7 +2789,7 @@ static void kfree_rcu_monitor(struct work_struct *work) monitor_work.work); spin_lock_irqsave(&krcp->lock, flags); - if (xchg(&krcp->monitor_todo, false)) + if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else spin_unlock_irqrestore(&krcp->lock, flags); @@ -2837,8 +2838,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !xchg(&krcp->monitor_todo, true)) + !krcp->monitor_todo) { + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } if (krcp->initialized) spin_unlock(&krcp->lock); @@ -2855,10 +2858,11 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + if (!krcp->head || krcp->monitor_todo) { spin_unlock_irqrestore(&krcp->lock, flags); continue; } + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } -- cgit v1.2.1 From e38fa01b94c87dfa945afa603ed50b4f7955934b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 19 Sep 2019 14:58:26 -0700 Subject: rcu: Add multiple in-flight batches of kfree_rcu() work During testing, it was observed that amount of memory consumed due kfree_rcu() batching is 300-400MB. Previously we had only a single head_free pointer pointing to the list of rcu_head(s) that are to be freed after a grace period. Until this list is drained, we cannot queue any more objects on it since such objects may not be ready to be reclaimed when the worker thread eventually gets to drainin g the head_free list. We can do better by maintaining multiple lists as done by this patch. Testing shows that memory consumption came down by around 100-150MB with just adding another list. Adding more than 1 additional list did not show any improvement. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Code style and initialization handling. ] [ paulmck: Fix field name, reported by kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6106b9e0b5fb..a40fd58bd4b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2686,12 +2686,25 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 /** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending @@ -2703,9 +2716,8 @@ EXPORT_SYMBOL_GPL(call_rcu); * the interactions with the slab allocators. 
*/ struct kfree_rcu_cpu { - struct rcu_work rcu_work; struct rcu_head *head; - struct rcu_head *head_free; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; @@ -2723,11 +2735,14 @@ static void kfree_rcu_work(struct work_struct *work) unsigned long flags; struct rcu_head *head, *next; struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; - krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; spin_lock_irqsave(&krcp->lock, flags); - head = krcp->head_free; - krcp->head_free = NULL; + head = krwp->head_free; + krwp->head_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); // List "head" is now private, so traverse locklessly. @@ -2747,17 +2762,25 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } // If a previous RCU batch is in progress, we cannot immediately // queue another one, so return false to tell caller to retry. - if (krcp->head_free) + if (!krwp) return false; - krcp->head_free = krcp->head; + krwp->head_free = krcp->head; krcp->head = NULL; - INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krcp->rcu_work); + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); return true; } @@ -2863,7 +2886,8 @@ void __init kfree_rcu_scheduler_running(void) continue; } krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3732,11 +3756,14 @@ struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; + int i; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.1 From 08fcb027b2105991058b5554dffad97f9e3cd7c5 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Sep 2019 13:03:17 -0700 Subject: rcu: Add support for debug_objects debugging for kfree_rcu() This commit applies RCU's debug_objects debugging to the new batched kfree_rcu() implementations. The object is queued at the kfree_rcu() call and dequeued during reclaim. Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects double kfree_rcu() calls. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix IRQ per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a40fd58bd4b6..0512221cd84b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2749,6 +2749,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; head; head = next) { next = head->next; // Potentially optimize with kfree_bulk in future. 
+ debug_rcu_head_unqueue(head); __rcu_reclaim(rcu_state.name, head); cond_resched_tasks_rcu_qs(); } @@ -2855,6 +2856,12 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } head->func = func; head->next = krcp->head; krcp->head = head; @@ -2866,6 +2873,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); } +unlock_return: if (krcp->initialized) spin_unlock(&krcp->lock); local_irq_restore(flags); -- cgit v1.2.1 From 86f8154300411e35d5b723fa8165817598fd3870 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:32 -0400 Subject: rcu: Remove kfree_rcu() special casing and lazy-callback handling This commit removes kfree_rcu() special-casing and the lazy-callback handling from Tree RCU. It moves some of this special casing to Tiny RCU, the removal of which will be the subject of later commits. This results in a nice negative delta. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Add slab.h #include, thanks to kbuild test robot . ] Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 11 +++------- include/linux/rcu_segcblist.h | 2 -- include/trace/events/rcu.h | 32 +++++++++++---------------- kernel/rcu/rcu.h | 27 ----------------------- kernel/rcu/rcu_segcblist.c | 25 +++------------------ kernel/rcu/rcu_segcblist.h | 25 ++------------------- kernel/rcu/srcutree.c | 4 ++-- kernel/rcu/tiny.c | 28 +++++++++++++++++++++++- kernel/rcu/tree.c | 40 +++++++++++++++++++++++----------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 48 ++++++++++------------------------------- kernel/rcu/tree_stall.h | 6 ++---- 12 files changed, 90 insertions(+), 159 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f48f4621ccbc..a360a8796710 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed for each CPU: - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 The "last_accelerate:" prints the low-order 16 bits (in hex) of the jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback -status, so that an "l" indicates that all callbacks were lazy at the start -of the last idle period and an "L" indicates that there are currently -no non-lazy callbacks (in both cases, "." is printed otherwise, as -shown above) and "D" indicates that dyntick-idle processing is enabled -("." is printed otherwise, for example, if disabled via the "nohz=" -kernel boot parameter). +rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle +processing is enabled. 
If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 646759042333..b36afe7b22c9 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -22,7 +22,6 @@ struct rcu_cblist { struct rcu_head *head; struct rcu_head **tail; long len; - long len_lazy; }; #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } @@ -73,7 +72,6 @@ struct rcu_segcblist { #else long len; #endif - long len_lazy; u8 enabled; u8 offloaded; }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 66122602bd08..4ab16fcda895 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -481,16 +481,14 @@ TRACE_EVENT_RCU(rcu_dyntick, */ TRACE_EVENT_RCU(rcu_callback, - TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy, - long qlen), + TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen), - TP_ARGS(rcuname, rhp, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -498,13 +496,12 @@ TRACE_EVENT_RCU(rcu_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->func = rhp->func; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ps %ld/%ld", + TP_printk("%s rhp=%p func=%ps %ld", __entry->rcuname, __entry->rhp, __entry->func, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -518,15 +515,14 @@ TRACE_EVENT_RCU(rcu_callback, TRACE_EVENT_RCU(rcu_kfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen_lazy, long qlen), + long qlen), - TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, offset, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -534,13 +530,12 @@ TRACE_EVENT_RCU(rcu_kfree_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->offset = offset; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ld %ld/%ld", + TP_printk("%s rhp=%p func=%ld %ld", __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -552,27 +547,24 @@ TRACE_EVENT_RCU(rcu_kfree_callback, */ TRACE_EVENT_RCU(rcu_batch_start, - TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit), + TP_PROTO(const char *rcuname, long qlen, long blimit), - TP_ARGS(rcuname, qlen_lazy, qlen, blimit), + TP_ARGS(rcuname, qlen, blimit), TP_STRUCT__entry( __field(const char *, rcuname) - __field(long, qlen_lazy) __field(long, qlen) __field(long, blimit) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; __entry->blimit = blimit; ), - TP_printk("%s CBs=%ld/%ld bl=%ld", - __entry->rcuname, __entry->qlen_lazy, __entry->qlen, - __entry->blimit) + TP_printk("%s CBs=%ld bl=%ld", + __entry->rcuname, __entry->qlen, __entry->blimit) ); /* diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..c30a1f7dbd15 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); - -/* - * Reclaim the specified callback, either by invoking 
it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback(rn, head, offset); - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - trace_rcu_invoke_callback(rn, head); - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } -} - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index cbc87b804db9..5f4fd3b8777c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; } /* * Enqueue an rcu_head structure onto the specified callback list. - * This function assumes that the callback is non-lazy because it - * is intended for use by no-CBs CPUs, which do not distinguish - * between lazy and non-lazy RCU callbacks. */ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) { @@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, else drclp->tail = &drclp->head; drclp->len = srclp->len; - drclp->len_lazy = srclp->len_lazy; if (!rhp) { rcu_cblist_init(srclp); } else { @@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, srclp->head = rhp; srclp->tail = &rhp->next; WRITE_ONCE(srclp->len, 1); - srclp->len_lazy = 0; } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; rcu_segcblist_set_len(rsclp, 0); - rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->enabled = 0; } @@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); @@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. 
*/ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) @@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rclp->len_lazy += rsclp->len_lazy; - rsclp->len_lazy = 0; rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } @@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 815c2fdd3fcc..5c293afc07b8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; -} - /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? 
@@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..d0a9d5b69087 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -853,7 +853,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, local_irq_save(flags); sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -1052,7 +1052,7 @@ void srcu_barrier(struct srcu_struct *ssp) sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { + &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_barrier_cpu_cnt); } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 477b4eb44af5..dd572ce7c747 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "rcu.h" @@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user) } } +/* + * Reclaim the specified callback, either by invoking it for non-kfree cases or + * freeing it directly (for kfree). Return true if kfreeing, false otherwise. + */ +static inline bool rcu_reclaim_tiny(struct rcu_head *head) +{ + rcu_callback_t f; + unsigned long offset = (unsigned long)head->func; + + rcu_lock_acquire(&rcu_callback_map); + if (__is_kfree_rcu_offset(offset)) { + trace_rcu_invoke_kfree_callback("", head, offset); + kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); + return true; + } + + trace_rcu_invoke_callback("", head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); + rcu_lock_release(&rcu_callback_map); + return false; +} + /* Invoke the RCU callbacks whose grace period has elapsed. */ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { @@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim("", list); + rcu_reclaim_tiny(list); local_bh_enable(); list = next; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0512221cd84b..a8dd612098bf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include "../time/tick-internal.h" @@ -2146,7 +2147,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. 
*/ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2168,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2178,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2583,7 +2592,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2627,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,7 +2687,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2747,10 +2755,18 @@ static void kfree_rcu_work(struct work_struct *work) // List "head" is now private, so traverse locklessly. for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + next = head->next; // Potentially optimize with kfree_bulk in future. 
debug_rcu_head_unqueue(head); - __rcu_reclaim(rcu_state.name, head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + /* Could be possible to optimize with kfree_bulk in future */ + kfree((void *)head - offset); + + rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); } } @@ -2825,7 +2841,7 @@ static void kfree_rcu_monitor(struct work_struct *work) */ void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); @@ -3100,7 +3116,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..15405420b40c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -183,7 +183,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..d5334e49ccca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1262,10 +1262,9 @@ static void rcu_prepare_for_idle(void) /* * This code is invoked when a CPU goes idle, at which point we want * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. + * the energy-efficient dyntick-idle mode. * - * The following three proprocessor symbols control this state machine: + * The following preprocessor symbol controls this: * * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This @@ -1274,21 +1273,15 @@ static void rcu_prepare_for_idle(void) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. * - * The values below work well in practice. If future workloads require + * The value below works well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. 
*/ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); /* * Try to advance callbacks on the current CPU, but only if it has been @@ -1327,8 +1320,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. + * caller about what to set the timeout. * * The caller must have disabled interrupts. */ @@ -1354,25 +1346,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) } rdp->last_accelerate = jiffies; - /* Request timer delay depending on laziness, and round. */ - rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); - if (rdp->all_lazy) { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } else { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } + /* Request timer and round. */ + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; + *nextevt = basemono + dj * TICK_NSEC; return 0; } /* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. + * Prepare a CPU for idle from an RCU perspective. The first major task is to + * sense whether nohz mode has been enabled or disabled via sysfs. The second + * major task is to accelerate (that is, assign grace-period numbers to) any + * recently arrived callbacks. * * The caller must have disabled interrupts. */ @@ -1398,17 +1383,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { - rdp->all_lazy = false; - invoke_rcu_core(); - return; - } - /* * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..806f2ddc8f74 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!!rdp->tick_nohz_enabled_snap]); + !!rdp->tick_nohz_enabled_snap); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.1 From c1f680af911bf7e5774af8568ad2a66222130cee Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:33 -0400 Subject: rcu: Remove kfree_call_rcu_nobatch() Now that the kfree_rcu() special-casing has been removed from tree RCU, this commit removes kfree_call_rcu_nobatch() since it is no longer needed. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ---- include/linux/rcutiny.h | 5 ----- include/linux/rcutree.h | 1 - kernel/rcu/rcuperf.c | 10 +--------- kernel/rcu/tree.c | 18 ++++-------------- 5 files changed, 5 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3ce270b56f3a..ed83d6d90cc3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3991,10 +3991,6 @@ Number of loops doing rcuperf.kfree_alloc_num number of allocations and frees. - rcuperf.kfree_no_batch= [KNL] - Use the non-batching (less efficient) version of kfree_rcu(). - This is useful for comparing with the batched version. - rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1bd166aab6f3..b2b2dc990da9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -39,11 +39,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) call_rcu(head, func); } -static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - call_rcu(head, func); -} - void rcu_qs(void); static inline void rcu_softirq_qs(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 6a65d3a16dbd..2f787b9029d1 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,7 +34,6 @@ static inline void rcu_virt_note_context_switch(int cpu) void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c1e25fd10f2a..da94b89cd531 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); -torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -632,14 +631,7 @@ kfree_perf_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - if (!kfree_no_batch) { - kfree_rcu(alloc_ptr, rh); - } else { - rcu_callback_t cb; - - cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); - kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); - } + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a8dd612098bf..31d2d9255d95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2763,8 +2763,10 @@ static void kfree_rcu_work(struct work_struct *work) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - /* Could be possible to optimize with kfree_bulk in future */ - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. 
*/ + kfree((void *)head - offset); + } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2835,16 +2837,6 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } -/* - * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. - * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). - */ -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); - /* * Queue a request for lazy invocation of kfree() after a grace period. * @@ -2864,8 +2856,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unsigned long flags; struct kfree_rcu_cpu *krcp; - head->func = func; - local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) -- cgit v1.2.1 From f452ee096d95482892b101bde4fd037fa025d3cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Fri, 4 Oct 2019 23:54:02 +0200 Subject: rculist: Describe variadic macro argument in a Sphinx-compatible way MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this patch, Sphinx shows "variable arguments" as the description of the cond argument, rather than the intended description, and prints the following warnings: ./include/linux/rculist.h:374: warning: Excess function parameter 'cond' description in 'list_for_each_entry_rcu' ./include/linux/rculist.h:651: warning: Excess function parameter 'cond' description in 'hlist_for_each_entry_rcu' Signed-off-by: Jonathan Neuschäfer Acked-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4158b7212936..61c6728a71f7 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -361,7 +361,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond: optional lockdep expression if called from non-RCU protection. + * @cond...: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -636,7 +636,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond: optional lockdep expression if called from non-RCU protection. + * @cond...: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() -- cgit v1.2.1 From c54a2744497db4b6887b9c905ef7aa0b3620c956 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:37:37 -0800 Subject: list: Add hlist_unhashed_lockless() We would like to use hlist_unhashed() from timer_pending(), which runs without protection of a lock. Note that other callers might also want to use this variant. Instead of forcing a READ_ONCE() for all hlist_unhashed() callers, add a new helper with an explicit _lockless suffix in the name to better document what is going on. 
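
For example, a lockless caller would look something like the sketch below (illustrative only; the actual timer_pending() conversion is a separate patch, and my_timer_pending() is a made-up name):

	static inline int my_timer_pending(const struct timer_list *timer)
	{
		/*
		 * A lockless reader must use the _lockless variant so that
		 * the ->pprev load is done with READ_ONCE(), pairing with
		 * the WRITE_ONCE()s added to the hlist helpers below.
		 */
		return !hlist_unhashed_lockless(&timer->entry);
	}
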
Also add various WRITE_ONCE() in __hlist_del(), hlist_add_head() and hlist_add_before()/hlist_add_behind() to pair with the READ_ONCE(). Signed-off-by: Eric Dumazet Cc: Thomas Gleixner [ paulmck: Also add WRITE_ONCE() to rculist.h. ] Signed-off-by: Paul E. McKenney --- include/linux/list.h | 32 +++++++++++++++++++++----------- include/linux/rculist.h | 24 ++++++++++++------------ 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 85c92555e31f..61f5aaf96192 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -749,6 +749,16 @@ static inline int hlist_unhashed(const struct hlist_node *h) return !h->pprev; } +/* This variant of hlist_unhashed() must be used in lockless contexts + * to avoid potential load-tearing. + * The READ_ONCE() is paired with the various WRITE_ONCE() in hlist + * helpers that are defined below. + */ +static inline int hlist_unhashed_lockless(const struct hlist_node *h) +{ + return !READ_ONCE(h->pprev); +} + static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); @@ -761,7 +771,7 @@ static inline void __hlist_del(struct hlist_node *n) WRITE_ONCE(*pprev, next); if (next) - next->pprev = pprev; + WRITE_ONCE(next->pprev, pprev); } static inline void hlist_del(struct hlist_node *n) @@ -782,32 +792,32 @@ static inline void hlist_del_init(struct hlist_node *n) static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; - n->next = first; + WRITE_ONCE(n->next, first); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { - n->pprev = next->pprev; - n->next = next; - next->pprev = &n->next; + WRITE_ONCE(n->pprev, next->pprev); + WRITE_ONCE(n->next, next); + WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { - n->next = prev->next; - prev->next = n; - n->pprev = &prev->next; + WRITE_ONCE(n->next, prev->next); + WRITE_ONCE(prev->next, n); + WRITE_ONCE(n->pprev, &prev->next); if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); } /* after that we'll appear to be on some hlist and hlist_del will work */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 61c6728a71f7..4b7ae1bf50b3 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -173,7 +173,7 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); - n->pprev = NULL; + WRITE_ONCE(n->pprev, NULL); } } @@ -473,7 +473,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** @@ -489,11 +489,11 @@ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *next = old->next; new->next = next; - new->pprev = old->pprev; + WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) - new->next->pprev = &new->next; - old->pprev = LIST_POISON2; + WRITE_ONCE(new->next->pprev, &new->next); + WRITE_ONCE(old->pprev, LIST_POISON2); } /* @@ -528,10 +528,10 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, struct 
hlist_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } /** @@ -564,7 +564,7 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, if (last) { n->next = last->next; - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); @@ -592,10 +592,10 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { - n->pprev = next->pprev; + WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); - next->pprev = &n->next; + WRITE_ONCE(next->pprev, &n->next); } /** @@ -620,10 +620,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - n->pprev = &prev->next; + WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ -- cgit v1.2.1 From 610dea36d3083a977e4f156206cbe1eaa2a532f0 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Fri, 4 Oct 2019 19:49:10 +0000 Subject: rcu/nocb: Fix dump_tree hierarchy print always active Commit 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") added print statements to rcu_organize_nocb_kthreads for debugging, but incorrectly guarded them, causing the function to always spew out its message. This patch fixes it by guarding both pr_alert statements with dump_tree, while also changing the second pr_alert to a pr_cont, to print the hierarchy in a single line (assuming that's how it was supposed to work). Fixes: 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") Signed-off-by: Stefan Reiter [ paulmck: Make single-nocbs-CPU GP kthreads look less erroneous. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..758bfe1de536 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2321,6 +2321,8 @@ static void __init rcu_organize_nocb_kthreads(void) { int cpu; bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; @@ -2343,21 +2345,31 @@ static void __init rcu_organize_nocb_kthreads(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; - if (!firsttime && dump_tree) - pr_cont("\n"); - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } } else { /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; - pr_alert(" %d", cpu); + if (dump_tree) + pr_cont(" %d", cpu); } rdp_prev = rdp; } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? 
"" : " (self only)"); } /* -- cgit v1.2.1 From 6935c3983b246d5fbfebd3b891c825e65c118f2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:21:54 -0700 Subject: rcu: Avoid data-race in rcu_gp_fqs_check_wake() The rcu_gp_fqs_check_wake() function uses rcu_preempt_blocked_readers_cgp() to read ->gp_tasks while other cpus might overwrite this field. We need READ_ONCE()/WRITE_ONCE() pairs to avoid compiler tricks and KCSAN splats like the following : BUG: KCSAN: data-race in rcu_gp_fqs_check_wake / rcu_preempt_deferred_qs_irqrestore write to 0xffffffff85a7f190 of 8 bytes by task 7317 on cpu 0: rcu_preempt_deferred_qs_irqrestore+0x43d/0x580 kernel/rcu/tree_plugin.h:507 rcu_read_unlock_special+0xec/0x370 kernel/rcu/tree_plugin.h:659 __rcu_read_unlock+0xcf/0xe0 kernel/rcu/tree_plugin.h:394 rcu_read_unlock include/linux/rcupdate.h:645 [inline] __ip_queue_xmit+0x3b0/0xa40 net/ipv4/ip_output.c:533 ip_queue_xmit+0x45/0x60 include/net/ip.h:236 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575 tcp_recvmsg+0x633/0x1a30 net/ipv4/tcp.c:2179 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 read to 0xffffffff85a7f190 of 8 bytes by task 10 on cpu 1: rcu_gp_fqs_check_wake kernel/rcu/tree.c:1556 [inline] rcu_gp_fqs_check_wake+0x93/0xd0 kernel/rcu/tree.c:1546 rcu_gp_fqs_loop+0x36c/0x580 kernel/rcu/tree.c:1611 rcu_gp_kthread+0x143/0x220 kernel/rcu/tree.c:1768 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 10 Comm: rcu_preempt Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot [ paulmck: Added another READ_ONCE() for RCU CPU stall warnings. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 758bfe1de536..fe5f44811761 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * blocked tasks. */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { - rnp->gp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) @@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - return rnp->gp_tasks != NULL; + return READ_ONCE(rnp->gp_tasks) != NULL; } /* Bias and limit values for ->rcu_read_lock_nesting. 
*/ @@ -493,7 +493,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; + WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { @@ -663,7 +663,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { - rnp->gp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), @@ -757,7 +757,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { -- cgit v1.2.1 From 03bd2983d7a9f898fd89f8f7215c3e56732d8ecd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Oct 2019 09:05:27 -0700 Subject: rcu: Use lockdep rather than comment to enforce lock held The rcu_preempt_check_blocked_tasks() function has a comment that states that the rcu_node structure's ->lock must be held, which might be informative, but which carries little weight if not read. This commit therefore removes this comment in favor of raw_lockdep_assert_held_rcu_node(), which will complain quite visibly if the required lock is not held. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe5f44811761..ed54d36465e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -648,8 +648,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock - * must be held by the caller. + * invoked -before- updating this rnp's ->gp_seq. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. @@ -659,6 +658,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); + raw_lockdep_assert_held_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && -- cgit v1.2.1 From b3e627d3d5092a87fc9b9e37e341610cfecfbfdc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 02:55:57 +0000 Subject: rcu: Make PREEMPT_RCU be a modifier to TREE_RCU Currently PREEMPT_RCU and TREE_RCU are mutually exclusive Kconfig options. But PREEMPT_RCU actually specifies a kind of TREE_RCU, namely a preemptible TREE_RCU. 
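A minimal C sketch of the division of labor that falls out of treating PREEMPT_RCU as a modifier of TREE_RCU; the function names below are invented for illustration and do not appear in this patch. Code shared by all tree-RCU builds need only test CONFIG_TREE_RCU, while preemptible-only paths test CONFIG_PREEMPT_RCU:

#include <linux/kconfig.h>

#if defined(CONFIG_TREE_RCU)
void example_common_tree_helper(void);	/* built for preemptible and non-preemptible kernels alike */
#endif

static inline bool example_is_preemptible_rcu(void)
{
	/* True only when the PREEMPT_RCU modifier is selected. */
	return IS_ENABLED(CONFIG_PREEMPT_RCU);
}
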
This commit therefore makes PREEMPT_RCU be a modifer to the TREE_RCU Kconfig option. This has the benefit of simplifying several of the #if expressions that formerly needed to check both, but now need only check one or the other. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++-- include/trace/events/rcu.h | 4 ++-- kernel/rcu/Kconfig | 13 +++++++------ kernel/rcu/Makefile | 1 - kernel/rcu/rcu.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sysctl.c | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0b7506330c87..70a41cd8f58d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -167,7 +167,7 @@ do { \ * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) #include #elif defined(CONFIG_TINY_RCU) #include @@ -601,7 +601,7 @@ do { \ * read-side critical section that would block in a !PREEMPT kernel. * But if you want the full story, read on! * - * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), + * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 66122602bd08..85019cf4ed6c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -41,7 +41,7 @@ TRACE_EVENT(rcu_utilization, TP_printk("%s", __entry->s) ); -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) /* * Tracepoint for grace-period events. Takes a string identifying the @@ -432,7 +432,7 @@ TRACE_EVENT_RCU(rcu_fqs, __entry->cpu, __entry->qsevent) ); -#endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) */ +#endif /* #if defined(CONFIG_TREE_RCU) */ /* * Tracepoint for dyntick-idle entry/exit events. These take a string diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 7644eda17d62..0303934e6ef0 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPTION && SMP + default y if SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -17,6 +17,7 @@ config TREE_RCU config PREEMPT_RCU bool default y if PREEMPTION + select TREE_RCU help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -78,7 +79,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU ) + def_bool TREE_RCU help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow @@ -86,13 +87,13 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. 
config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 64 if 64BIT default 32 if !64BIT help @@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 16 help This option controls the leaf-level fanout of hierarchical @@ -187,7 +188,7 @@ config RCU_BOOST_DELAY config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU depends on RCU_EXPERT || NO_HZ_FULL default n help diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 020e8b6a644b..82d5fba48b2f 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..eabafde2349e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -454,7 +454,7 @@ enum rcutorture_type { INVALID_RCU_FLAVOR }; -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..34a7452b25fd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 70665934d53e..d396aaaf19a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_do_static_key, }, #endif -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, -- cgit v1.2.1 From 90326f0521a88004194f88f1b597b54347482b5c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:14 +0200 Subject: rcu: Use CONFIG_PREEMPTION where appropriate The config option `CONFIG_PREEMPT' is used for the preemption model "Low-Latency Desktop". The config option `CONFIG_PREEMPTION' is enabled when kernel preemption is enabled which is true for the preemption model `CONFIG_PREEMPT' and `CONFIG_PREEMPT_RT'. Use `CONFIG_PREEMPTION' if it applies to both preemption models and not just to `CONFIG_PREEMPT'. Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Davidlohr Bueso Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++-- kernel/rcu/Kconfig | 4 ++-- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcutiny.c | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 70a41cd8f58d..eb32fff81c30 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -154,7 +154,7 @@ static inline void exit_tasks_rcu_finish(void) { } * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() - * machinery were to be shut off, as some advocate for PREEMPT kernels. + * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ @@ -598,7 +598,7 @@ do { \ * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU - * read-side critical section that would block in a !PREEMPT kernel. + * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0303934e6ef0..1cc940fef17c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -201,8 +201,8 @@ config RCU_NOCB_CPU specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running + the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched + (!PREEMPTION kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..121a0507a7ce 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1730,7 +1730,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) // Give the scheduler a chance, even on nohz_full CPUs. static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { - if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 44d6606b8325..6208c1dae5c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation + * that become ready as a result. Single-CPU and !PREEMPTION operation * means that we get away with murder on synchronization. 
;-) */ void srcu_drive_gp(struct work_struct *wp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..c9dbb05e4c13 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2698,9 +2698,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..98d078cafa5a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -670,7 +670,7 @@ static void rcu_exp_handler(void *unused) } } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ static void sync_sched_exp_online_cleanup(int cpu) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed54d36465e2..8cdce111ea73 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -789,7 +789,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. @@ -839,7 +839,7 @@ void rcu_all_qs(void) EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { -- cgit v1.2.1 From 8356cdcfb5e40d8a7586ea7b157fc7f815abcdb9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:48 +0000 Subject: rcu: Rename some instance of CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU CONFIG_PREEMPTION and CONFIG_PREEMPT_RCU are always identical, but some code depends on CONFIG_PREEMPTION to access to rcu_preempt functionality. This patch changes CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU in these cases. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_stall.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c9dbb05e4c13..5445da2326a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1934,7 +1934,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2294,7 +2294,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..a6652efedd48 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RCU /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPTION */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPTION */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* * Dump stacks of all tasks running on stalled CPUs. First try using -- cgit v1.2.1 From 163b89bb565e26c90d01fec806ba39b2fe448f72 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 04:06:22 -0700 Subject: rcu: Clear .exp_hint only when deferred quiescent state has been reported Currently, the .exp_hint flag is cleared in rcu_read_unlock_special(), which works, but which can also prevent subsequent rcu_read_unlock() calls from helping expedite the quiescent state needed by an ongoing expedited RCU grace period. This commit therefore defers clearing of .exp_hint from rcu_read_unlock_special() to rcu_preempt_deferred_qs_irqrestore(), thus ensuring that intervening calls to rcu_read_unlock() have a chance to help end the expedited grace period. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8cdce111ea73..7487c7930a47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,6 +444,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.exp_hint = false; t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); @@ -610,7 +611,6 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - t->rcu_read_unlock_special.b.exp_hint = false; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); @@ -640,7 +640,6 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); return; } - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } -- cgit v1.2.1 From bed37c63c7e910bd8216f4be4b1eba3ec2eb4c19 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 05:06:21 -0700 Subject: rcu: Clear ->rcu_read_unlock_special only once In rcu_preempt_deferred_qs_irqrestore(), ->rcu_read_unlock_special is cleared one piece at a time. Given that the "if" statements in this function use the copy in "special", this commit removes the clearing of the individual pieces in favor of clearing ->rcu_read_unlock_special in one go just after it has been determined to be non-zero. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7487c7930a47..c3a32717c42d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,16 +444,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } - t->rcu_read_unlock_special.b.exp_hint = false; - t->rcu_read_unlock_special.b.deferred_qs = false; - if (special.b.need_qs) { + t->rcu_read_unlock_special.s = 0; + if (special.b.need_qs) rcu_qs(); - t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { - local_irq_restore(flags); - return; - } - } /* * Respond to a request by an expedited grace period for a @@ -461,17 +454,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) { + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!t->rcu_read_unlock_special.s) { - local_irq_restore(flags); - return; - } - } /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { - t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The task -- cgit v1.2.1 From 28b605c3f828b30a49d91e5c9c8388fd056c2059 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:22:45 -0800 Subject: rcu: Use READ_ONCE() for ->expmask in rcu_read_unlock_special() The rcu_node structure's ->expmask field is updated only when holding the ->lock, but is also accessed locklessly. This means that all ->expmask updates must use WRITE_ONCE() and all reads carried out without holding ->lock must use READ_ONCE(). 
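As a minimal sketch of that rule (the structure and function names below are invented for illustration and are not the actual rcu_node code), a locked updater and a lockless reader pair their accesses as follows:

#include <linux/compiler.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	unsigned long mask;	/* Updated under ->lock, also read locklessly. */
};

static void foo_set_bits(struct foo *fp, unsigned long bits)
{
	spin_lock(&fp->lock);
	/* WRITE_ONCE() pairs with the lockless READ_ONCE() below. */
	WRITE_ONCE(fp->mask, fp->mask | bits);
	spin_unlock(&fp->lock);
}

static bool foo_bits_set(struct foo *fp, unsigned long bits)
{
	/* No ->lock held here, so prevent load tearing and refetching. */
	return !!(READ_ONCE(fp->mask) & bits);
}
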
This commit therefore changes the lockless ->expmask read in rcu_read_unlock_special() to use READ_ONCE(). Reported-by: syzbot+99f4ddade3c22ab0cf23@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c3a32717c42d..698a7f195f88 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -599,7 +599,7 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_node *rnp = rdp->mynode; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & rnp->expmask) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. if (irqs_were_disabled && use_softirq && -- cgit v1.2.1 From c1d9101132d00ee9189ff6744dfd1257730a135c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 15 Nov 2019 14:08:53 -0800 Subject: rcu: Provide wrappers for uses of ->rcu_read_lock_nesting This commit provides wrapper functions for uses of ->rcu_read_lock_nesting to improve readability and to ease future changes to support inlining of __rcu_read_lock() and __rcu_read_unlock(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_plugin.h | 53 +++++++++++++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 98d078cafa5a..d8da6b1a3209 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -610,7 +610,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!t->rcu_read_lock_nesting) { + if (!rcu_preempt_depth()) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -634,7 +634,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (t->rcu_read_lock_nesting > 0) { + if (rcu_preempt_depth() > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 698a7f195f88..ebdbdec5911f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); - if (t->rcu_read_lock_nesting > 0 && + WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ @@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #define RCU_NEST_NMAX (-INT_MAX / 2) #define RCU_NEST_PMAX (INT_MAX / 2) +static void rcu_preempt_read_enter(void) +{ + current->rcu_read_lock_nesting++; +} + +static void rcu_preempt_read_exit(void) +{ + current->rcu_read_lock_nesting--; +} + +static void rcu_preempt_depth_set(int val) +{ + current->rcu_read_lock_nesting = val; +} + /* * Preemptible RCU implementation for rcu_read_lock(). 
* Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) */ void __rcu_read_lock(void) { - current->rcu_read_lock_nesting++; + rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); + WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -373,19 +388,19 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; + if (rcu_preempt_depth() != 1) { + rcu_preempt_read_exit(); } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = -RCU_NEST_BIAS; + rcu_preempt_depth_set(-RCU_NEST_BIAS); barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; + rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - int rrln = t->rcu_read_lock_nesting; + int rrln = rcu_preempt_depth(); WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } @@ -539,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - t->rcu_read_lock_nesting <= 0; + rcu_preempt_depth() <= 0; } /* @@ -552,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = t->rcu_read_lock_nesting >= 0; + bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -672,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user) if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } - if (t->rcu_read_lock_nesting > 0 || + if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ if (rcu_preempt_need_deferred_qs(t)) { @@ -682,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!t->rcu_read_lock_nesting) { + } else if (!rcu_preempt_depth()) { rcu_qs(); /* Report immediate QS. */ return; } /* If GP is oldish, ask for help from rcu_read_unlock_special(). 
*/ - if (t->rcu_read_lock_nesting > 0 && + if (rcu_preempt_depth() > 0 && __this_cpu_read(rcu_data.core_needs_qs) && __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && @@ -709,11 +724,11 @@ void exit_rcu(void) struct task_struct *t = current; if (unlikely(!list_empty(¤t->rcu_node_entry))) { - t->rcu_read_lock_nesting = 1; + rcu_preempt_depth_set(1); barrier(); WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); - } else if (unlikely(t->rcu_read_lock_nesting)) { - t->rcu_read_lock_nesting = 1; + } else if (unlikely(rcu_preempt_depth())) { + rcu_preempt_depth_set(1); } else { return; } -- cgit v1.2.1 From fc7113c3640f109c7dfb09fd1727edcd30cd7da2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Nov 2019 18:05:45 -0800 Subject: rcu: Avoid tick_dep_set_cpu() misordering In the current code, rcu_nmi_enter_common() might decide to turn on the tick using tick_dep_set_cpu(), but be delayed just before doing so. Then the grace-period kthread might notice that the CPU in question had in fact gone through a quiescent state, thus turning off the tick using tick_dep_clear_cpu(). The later invocation of tick_dep_set_cpu() would then incorrectly leave the tick on. This commit therefore enlists the aid of the leaf rcu_node structure's ->lock to ensure that decisions to enable or disable the tick are carried out before they can be reversed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5445da2326a0..b0e0612392a9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -800,8 +800,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,8 +828,13 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, @@ -898,6 +903,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { -- cgit v1.2.1 From c493f1c9c4094afae7f3a0693ae395f3859022b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 6 Oct 2019 14:33:22 -0700 Subject: torture: Use gawk instead of awk for systime() function In many environments, gawk provides systime(), but awk doesn't. This commit therefore changes awk scripts using systime() to instead be gawk scripts. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 4 ++-- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index dc49a3ba6111..86a217b41b9f 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -23,12 +23,12 @@ spinmax=${4-1000} n=1 -starttime=`awk 'BEGIN { print systime(); }' < /dev/null` +starttime=`gawk 'BEGIN { print systime(); }' < /dev/null` while : do # Check for done. - t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` + t=`gawk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` if test "$t" -gt "$duration" then exit 0; diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 33c669619736..1d98992d1c34 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -123,7 +123,7 @@ qemu_args=$5 boot_args=$6 cd $KVM -kstarttime=`awk 'BEGIN { print systime() }' < /dev/null` +kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` if test -z "$TORTURE_BUILDONLY" then echo ' ---' `date`: Starting kernel @@ -177,7 +177,7 @@ do then qemu_pid=`cat "$resdir/qemu_pid"` fi - kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds @@ -213,7 +213,7 @@ then oldline="`tail $resdir/console.log`" while : do - kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if kill -0 $qemu_pid > /dev/null 2>&1 then : -- cgit v1.2.1 From 9aa55ec206a6841e297c9df7e737b3d57f048a82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 Oct 2019 15:29:02 -0700 Subject: rcutorture: Dispense with Dracut for initrd creation The dracut scripting does not work on all platforms, and there are no known failures from the init binary based on the statically linked C program. This commit therefore removes the dracut scripting so that the statically linked C program is always used to create the init "script". Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 55 ++-------------------- 1 file changed, 3 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index 6fa9bd1ddc09..38e424d2392c 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -20,58 +20,9 @@ if [ -s "$D/initrd/init" ]; then exit 0 fi -T=${TMPDIR-/tmp}/mkinitrd.sh.$$ -trap 'rm -rf $T' 0 2 -mkdir $T - -cat > $T/init << '__EOF___' -#!/bin/sh -# Run in userspace a few milliseconds every second. This helps to -# exercise the NO_HZ_FULL portions of RCU. The 192 instances of "a" was -# empirically shown to give a nice multi-millisecond burst of user-mode -# execution on a 2GHz CPU, as desired. Modern CPUs will vary from a -# couple of milliseconds up to perhaps 100 milliseconds, which is an -# acceptable range. -# -# Why not calibrate an exact delay? 
Because within this initrd, we -# are restricted to Bourne-shell builtins, which as far as I know do not -# provide any means of obtaining a fine-grained timestamp. - -a4="a a a a" -a16="$a4 $a4 $a4 $a4" -a64="$a16 $a16 $a16 $a16" -a192="$a64 $a64 $a64" -while : -do - q= - for i in $a192 - do - q="$q $i" - done - sleep 1 -done -__EOF___ - -# Try using dracut to create initrd -if command -v dracut >/dev/null 2>&1 -then - echo Creating $D/initrd using dracut. - # Filesystem creation - dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img - cd $D - mkdir -p initrd - cd initrd - zcat $T/initramfs.img | cpio -id - cp $T/init init - chmod +x init - echo Done creating $D/initrd using dracut - exit 0 -fi - -# No dracut, so create a C-language initrd/init program and statically -# link it. This results in a very small initrd, but might be a bit less -# future-proof than dracut. -echo "Could not find dracut, attempting C initrd" +# Create a C-language initrd/init infinite-loop program and statically +# link it. This results in a very small initrd. +echo "Creating a statically linked C-language initrd" cd $D mkdir -p initrd cd initrd -- cgit v1.2.1 From 517f17aed0ce678dfa82d7dd4e2593fc1bac799c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Oct 2019 07:05:38 -0700 Subject: torture: Handle jitter for CPUs that cannot be offlined Currently, jitter.sh assumes that the underlying hypervisor will be configured with all CPUs hotpluggable, with the possible exception of CPU 0. However, there are installations where the hypervisor prohibits offlining, which breaks jitter.sh. This commit therefore lists the CPUs that cannot be offlined up front, and checks for the case where no CPU can be offlined in the loop. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 26 ++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 86a217b41b9f..30cb5b27d32e 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -25,6 +25,18 @@ n=1 starttime=`gawk 'BEGIN { print systime(); }' < /dev/null` +nohotplugcpus= +for i in /sys/devices/system/cpu/cpu[0-9]* +do + if test -f $i/online + then + : + else + curcpu=`echo $i | sed -e 's/^[^0-9]*//'` + nohotplugcpus="$nohotplugcpus $curcpu" + fi +done + while : do # Check for done. @@ -35,13 +47,15 @@ do fi # Set affinity to randomly selected online CPU - cpus=`grep 1 /sys/devices/system/cpu/*/online | - sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'` - - # Do not leave out poor old cpu0 which may not be hot-pluggable - if [ ! -f "/sys/devices/system/cpu/cpu0/online" ]; then - cpus="0 $cpus" + if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 | + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'` + then + : + else + cpus= fi + # Do not leave out non-hot-pluggable CPUs + cpus="$cpus $nohotplugcpus" cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { srand(n + me + systime()); -- cgit v1.2.1 From b8dfff975c370912c7ac633ca3e4a812dcd38f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Oct 2019 08:38:00 -0700 Subject: torture: Handle systems lacking the mpstat command The rcutorture scripting uses the mpstat command to determine how much the system is being used, and adjusts make's -j argument accordingly. 
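The fallback described in the rest of this commit message condenses to the following sketch, which paraphrases the cpus2use.sh hunk shown below rather than quoting it exactly:

ncpus=`grep '^processor' /proc/cpuinfo | wc -l`
if mpstat -V > /dev/null 2>&1
then
	# mpstat is available: estimate how many CPUs are currently idle.
	idlecpus=`mpstat | tail -1 | awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'`
else
	# No mpstat command, so greedily assume every CPU is available.
	idlecpus=$ncpus
fi
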
However, mpstat isn't installed by default, so it would be good if the scripting does something useful when mpstat isn't present. This commit therefore makes the scripts assumes that if mpstat is not present, they are free to use all the CPUs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/cpus2use.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index 4e9485590c10..1dbfb62567d2 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -15,8 +15,15 @@ then exit 0 fi ncpus=`grep '^processor' /proc/cpuinfo | wc -l` -idlecpus=`mpstat | tail -1 | \ - awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` +if mpstat -V > /dev/null 2>&1 +then + idlecpus=`mpstat | tail -1 | \ + awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` +else + # No mpstat command, so use all available CPUs. + echo The mpstat command is not available, so greedily using all CPUs. + idlecpus=$ncpus +fi awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' BEGIN { cpus2use = idlecpus; -- cgit v1.2.1 From ebfbaa8dcc84eff146928ed59a8bd4bff932318e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 12:02:12 -0800 Subject: rcutorture: Add worst-case call_rcu() forward-progress results This commit adds the worst-case results from any call_rcu() forward-progress tests to the rcutorture test-summary output. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 2a7f3f4756a7..9d9a41625dd9 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -25,6 +25,7 @@ stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null | tail -1 | sed -e 's/^\[[ 0-9.]*] //' | awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' | tr -d '\012\015'`" +fwdprog="`grep 'rcu_torture_fwd_prog_cr Duration' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k15nr | head -1 | awk '{ print $14 " " $15 }'`" if test -z "$ngps" then echo "$configfile ------- " $stopstate @@ -39,7 +40,7 @@ else BEGIN { print ngps / dur }' < /dev/null` title="$title ($ngpsps/s)" fi - echo $title $stopstate + echo $title $stopstate $fwdprog nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'` if test -z "$nclosecalls" then -- cgit v1.2.1 From a289e608b3e740c15f623148c26cdec2d6698ce0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 08:31:56 -0800 Subject: rcutorture: Pull callback forward-progress data into rcu_fwd struct Now that RCU behaves reasonably well with the current single-kthread call_rcu() forward-progress testing, it is time to add more kthreads. This commit takes a first step towards that goal by wrapping what will be the per-kthread data into a new rcu_fwd structure. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 103 +++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..22a75a4b6b40 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1663,23 +1663,34 @@ struct rcu_fwd_cb { struct rcu_fwd_cb *rfc_next; int rfc_gps; }; -static DEFINE_SPINLOCK(rcu_fwd_lock); -static struct rcu_fwd_cb *rcu_fwd_cb_head; -static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; -static long n_launders_cb; -static unsigned long rcu_fwd_startat; -static bool rcu_fwd_emergency_stop; + #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) + struct rcu_launder_hist { long n_launders; unsigned long launder_gp_seq; }; -#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) -static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; -static unsigned long rcu_launder_gp_seq_start; + +struct rcu_fwd { + spinlock_t rcu_fwd_lock; + struct rcu_fwd_cb *rcu_fwd_cb_head; + struct rcu_fwd_cb **rcu_fwd_cb_tail; + long n_launders_cb; + unsigned long rcu_fwd_startat; + struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; + unsigned long rcu_launder_gp_seq_start; +}; + +struct rcu_fwd rcu_fwds = { + .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), + .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, +}; + +bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(void) { @@ -1688,16 +1699,17 @@ static void rcu_torture_fwd_cb_hist(void) int i; int j; - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) + if (rcu_fwds.n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwd_startat); - gps_old = rcu_launder_gp_seq_start; + __func__, jiffies - rcu_fwds.rcu_fwd_startat); + gps_old = rcu_fwds.rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = n_launders_hist[j].launder_gp_seq; + gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + j + 1, FWD_CBS_HIST_DIV, + rcu_fwds.n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1714,17 +1726,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcpp = rcu_fwd_cb_tail; - rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcpp = rcu_fwds.rcu_fwd_cb_tail; + rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(n_launders_hist)) - i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i].n_launders++; - n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); + i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) + i = 
ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; + rcu_fwds.n_launders_hist[i].n_launders++; + rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1751,16 +1763,16 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcp = rcu_fwd_cb_head; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcp = rcu_fwds.rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); break; } - rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwd_cb_head) - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwds.rcu_fwd_cb_head) + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1804,8 +1816,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + dur; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1864,23 +1876,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - n_launders_cb = 0; + rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) + rcu_fwds.n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_launder_gp_seq_start = gps; + rcu_fwds.rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1888,7 +1900,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwd_cb_head = rfcpn; + rcu_fwds.rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1910,7 +1922,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); + n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. 
*/ @@ -1921,7 +1933,8 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwd_startat, jiffies - stoppedat, + stoppedat - rcu_fwds.rcu_fwd_startat, + jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); @@ -1943,7 +1956,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", -- cgit v1.2.1 From 6b1b832546067caac8c5833abf88fa082d253b2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 09:08:58 -0800 Subject: rcutorture: Thread rcu_fwd pointer through forward-progress functions In order to add multiple kthreads, it will be necessary to allow the various functions to operate on a pointer to their kthread's rcu_fwd structure. This commit therefore starts the process of adding the needed "struct rcu_fwd" parameters and arguments to the various callback forward-progress functions. Note that rcutorture_oom_notify() and rcu_torture_fwd_cb_hist() will eventually need to iterate over all kthreads' rcu_fwd structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 78 ++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22a75a4b6b40..cc88ce910a6d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1661,6 +1661,7 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) struct rcu_fwd_cb { struct rcu_head rh; struct rcu_fwd_cb *rfc_next; + struct rcu_fwd *rfc_rfp; int rfc_gps; }; @@ -1692,24 +1693,24 @@ struct rcu_fwd rcu_fwds = { bool rcu_fwd_emergency_stop; -static void rcu_torture_fwd_cb_hist(void) +static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { unsigned long gps; unsigned long gps_old; int i; int j; - for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) - if (rcu_fwds.n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) + if (rfp->n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwds.rcu_fwd_startat); - gps_old = rcu_fwds.rcu_launder_gp_seq_start; + __func__, jiffies - rfp->rcu_fwd_startat); + gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; + gps = rfp->n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", j + 1, FWD_CBS_HIST_DIV, - rcu_fwds.n_launders_hist[j].n_launders, + rfp->n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1723,20 +1724,21 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; + struct rcu_fwd *rfp = rfcp->rfc_rfp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcpp = rcu_fwds.rcu_fwd_cb_tail; - 
rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcpp = rfp->rcu_fwd_cb_tail; + rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); - i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) - i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; - rcu_fwds.n_launders_hist[i].n_launders++; - rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); + i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rfp->n_launders_hist)) + i = ARRAY_SIZE(rfp->n_launders_hist) - 1; + rfp->n_launders_hist[i].n_launders++; + rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1786,7 +1788,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) } /* Carry out need_resched()/cond_resched() forward-progress testing. */ -static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) +static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, + int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; @@ -1816,8 +1819,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + dur; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1852,7 +1855,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Carry out call_rcu() forward-progress testing. */ -static void rcu_torture_fwd_prog_cr(void) +static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) { unsigned long cver; unsigned long flags; @@ -1876,23 +1879,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. 
*/ - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread + rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) - rcu_fwds.n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++) + rfp->n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_fwds.rcu_launder_gp_seq_start = gps; + rfp->rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); + rfcp = READ_ONCE(rfp->rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1900,7 +1903,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwds.rcu_fwd_cb_head = rfcpn; + rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1912,6 +1915,7 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs++; n_launders_sa = 0; rfcp->rfc_gps = 0; + rfcp->rfc_rfp = rfp; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); @@ -1922,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); + n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ @@ -1933,12 +1937,11 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwds.rcu_fwd_startat, - jiffies - stoppedat, + stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); @@ -1955,7 +1958,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, { WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(&rcu_fwds); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ @@ -1980,6 +1983,7 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. 
*/ static int rcu_torture_fwd_prog(void *args) { + struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -1991,8 +1995,8 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(&tested, &tested_tries); - rcu_torture_fwd_prog_cr(); + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2027,7 +2031,7 @@ static int __init rcu_torture_fwd_prog_init(void) if (fwd_progress_div <= 0) fwd_progress_div = 4; return torture_create_kthread(rcu_torture_fwd_prog, - NULL, fwd_prog_task); + &rcu_fwds, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit v1.2.1 From 7beba0c06b588c725962bcc0273a489d46e81ccf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 07:49:31 -0800 Subject: rcutorture: Move to dynamic initialization of rcu_fwds In order to add multiple call_rcu() forward-progress kthreads, it will be necessary to dynamically allocate and initialize. This commit therefore moves the initialization from compile time to instead immediately precede thread-creation time. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cc88ce910a6d..6f540fed942c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,11 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds = { - .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), - .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, -}; - +struct rcu_fwd rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2026,6 +2022,8 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } + spin_lock_init(&rcu_fwds.rcu_fwd_lock); + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) -- cgit v1.2.1 From 6764100bd2927060aae91b40fce015f39fc4fd87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:20:20 -0800 Subject: rcutorture: Complete threading rcu_fwd pointers through functions This commit threads pointers to rcu_fwd structures through the remaining functions using rcu_fwds directly, namely rcu_torture_fwd_prog_cbfree(), rcutorture_oom_notify() and rcu_torture_fwd_prog_init(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6f540fed942c..394baac98ae0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1754,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. 
*/ -static unsigned long rcu_torture_fwd_prog_cbfree(void) +static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) { unsigned long flags; unsigned long freed = 0; struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcp = rcu_fwds.rcu_fwd_cb_head; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcp = rfp->rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); break; } - rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwds.rcu_fwd_cb_head) - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + rfp->rcu_fwd_cb_head = rfcp->rfc_next; + if (!rfp->rcu_fwd_cb_head) + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1926,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - (void)rcu_torture_fwd_prog_cbfree(); + (void)rcu_torture_fwd_prog_cbfree(rfp); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { @@ -1952,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + struct rcu_fwd *rfp = &rcu_fwds; + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(&rcu_fwds); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); + rcu_torture_fwd_cb_hist(rfp); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@ -2008,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + struct rcu_fwd *rfp = &rcu_fwds; + if (!fwd_progress) return 0; /* Not requested, so don't do it. */ if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || @@ -2022,14 +2026,13 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. 
*/ return 0; } - spin_lock_init(&rcu_fwds.rcu_fwd_lock); - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - return torture_create_kthread(rcu_torture_fwd_prog, - &rcu_fwds, fwd_prog_task); + return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit v1.2.1 From 5155be9994e557618a8312389fb4e52dfbf28a3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:35:08 -0800 Subject: rcutorture: Dynamically allocate rcu_fwds structure This commit switches from static structure to dynamic allocation for rcu_fwds as another step towards providing multiple call_rcu() forward-progress kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 394baac98ae0..f77f4d886cc1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,7 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds; +struct rcu_fwd *rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -1952,7 +1952,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp = rcu_fwds; WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); @@ -2010,7 +2010,7 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ @@ -2026,12 +2026,15 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; + rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); + if (!rfp) + return -ENOMEM; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } -- cgit v1.2.1 From 25b4da74a955bf956428ab29e54aadf4fffab0a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Nov 2019 06:14:21 -0800 Subject: torture: Allow "CFLIST" to specify default list of scenarios On a large system, it can be convenient to tell rcutorture to run several instances of the default scenarios. Currently, this requires explicitly listing them, for example, "--configs '2*SRCU-N 2*SRCU-P...'". Although this works, it is rather inconvenient. This commit therefore allows "CFLIST" to be specified to indicate the default list of scenarios called out in the relevant CFLIST file, for example, for RCU, tools/testing/selftests/rcutorture/configs/rcu/CFLIST. In addition, multipliers may be used to run multiple instances of all the scenarios. For example, on a 256-CPU system, "--configs '3*CFLIST'" would run three instances of each scenario concurrently with one CPU left over. 
Thus "--configs '3*CFLIST TINY01'" would exactly consume all 256 CPUs, which makes rcutorture's jitter feature more effective. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 72518580df23..e19151c6e5e5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -198,9 +198,10 @@ fi CONFIGFRAG=${KVM}/configs/${TORTURE_SUITE}; export CONFIGFRAG +defaultconfigs="`tr '\012' ' ' < $CONFIGFRAG/CFLIST`" if test -z "$configs" then - configs="`cat $CONFIGFRAG/CFLIST`" + configs=$defaultconfigs fi if test -z "$resdir" @@ -209,7 +210,7 @@ then fi # Create a file of test-name/#cpus pairs, sorted by decreasing #cpus. -touch $T/cfgcpu +configs_derep= for CF in $configs do case $CF in @@ -222,15 +223,21 @@ do CF1=$CF ;; esac + for ((cur_rep=0;cur_rep<$config_reps;cur_rep++)) + do + configs_derep="$configs_derep $CF1" + done +done +touch $T/cfgcpu +configs_derep="`echo $configs_derep | sed -e "s/\<CFLIST\>/$defaultconfigs/g"`" +for CF1 in $configs_derep +do if test -f "$CONFIGFRAG/$CF1" then cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1` cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"` cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"` - for ((cur_rep=0;cur_rep<$config_reps;cur_rep++)) - do - echo $CF1 $cpu_count >> $T/cfgcpu - done + echo $CF1 $cpu_count >> $T/cfgcpu else echo "The --configs file $CF1 does not exist, terminating." exit 1 -- cgit v1.2.1 From b22eb7cefb9d31cf862542f9cef90f97c0294842 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Nov 2019 14:33:28 -0800 Subject: torture: Hoist calls to lscpu to higher-level kvm.sh script On some kernels, concurrent calls to the lscpu command result in severe slowdowns. For example, on v4.16, a single lscpu invocation takes about two milliseconds, four concurrent invocations more than two seconds, and 16 concurrent invocations more than 20 seconds. Given that the only goal is to learn the number of CPUs, invoking lscpu but once suffices. This commit therefore invokes lscpu early in kvm.sh execution, setting the initial value of the TORTURE_ALLOTED_CPUS environment variable. Signed-off-by: Paul E.
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 7 +++---- tools/testing/selftests/rcutorture/bin/kvm.sh | 11 ++++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 1d98992d1c34..e0352304b98b 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -133,11 +133,10 @@ fi qemu_args="-enable-kvm -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment` cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` -vcpus=`identify_qemu_vcpus` -if test $cpu_count -gt $vcpus +if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS" then - echo CPU count limited from $cpu_count to $vcpus | tee -a $resdir/Warnings - cpu_count=$vcpus + echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings + cpu_count=$TORTURE_ALLOTED_CPUS fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index e19151c6e5e5..78d18ab8e954 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -24,7 +24,9 @@ dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH -TORTURE_ALLOTED_CPUS="" +. functions.sh + +TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD @@ -40,8 +42,6 @@ cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` jitter="-1" -. functions.sh - usage () { echo "Usage: $scriptname optional arguments:" echo " --bootargs kernel-boot-arguments" @@ -93,6 +93,11 @@ do checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--' cpus=$2 TORTURE_ALLOTED_CPUS="$2" + max_cpus="`identify_qemu_vcpus`" + if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus" + then + TORTURE_ALLOTED_CPUS=$max_cpus + fi shift ;; --datestamp) -- cgit v1.2.1 From 9ffdd7982417e2e227e295c4dea9cec652a71983 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 29 Oct 2019 01:54:17 +0530 Subject: doc: Convert arrayRCU.txt to arrayRCU.rst This patch converts arrayRCU from .txt to .rst format, and also adds it to the index.rst file. Signed-off-by: Madhuparna Bhowmik [ paulmck: Trimmed trailing whitespace. ] Tested-by: Phong Tran Tested-by: Amol Grover Signed-off-by: Paul E. McKenney --- Documentation/RCU/arrayRCU.rst | 165 +++++++++++++++++++++++++++++++++++++++++ Documentation/RCU/arrayRCU.txt | 153 -------------------------------------- Documentation/RCU/index.rst | 1 + 3 files changed, 166 insertions(+), 153 deletions(-) create mode 100644 Documentation/RCU/arrayRCU.rst delete mode 100644 Documentation/RCU/arrayRCU.txt diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst new file mode 100644 index 000000000000..4051ea3871ef --- /dev/null +++ b/Documentation/RCU/arrayRCU.rst @@ -0,0 +1,165 @@ +.. _array_rcu_doc: + +Using RCU to Protect Read-Mostly Arrays +======================================= + +Although RCU is more commonly used to protect linked lists, it can +also be used to protect arrays. Three situations are as follows: + +1. :ref:`Hash Tables ` + +2. :ref:`Static Arrays ` + +3. :ref:`Resizable Arrays ` + +Each of these three situations involves an RCU-protected pointer to an +array that is separately indexed. 
It might be tempting to consider use +of RCU to instead protect the index into an array, however, this use +case is **not** supported. The problem with RCU-protected indexes into +arrays is that compilers can play way too many optimization games with +integers, which means that the rules governing handling of these indexes +are far more trouble than they are worth. If RCU-protected indexes into +arrays prove to be particularly valuable (which they have not thus far), +explicit cooperation from the compiler will be required to permit them +to be safely used. + +That aside, each of the three RCU-protected pointer situations are +described in the following sections. + +.. _hash_tables: + +Situation 1: Hash Tables +------------------------ + +Hash tables are often implemented as an array, where each array entry +has a linked-list hash chain. Each hash chain can be protected by RCU +as described in the listRCU.txt document. This approach also applies +to other array-of-list situations, such as radix trees. + +.. _static_arrays: + +Situation 2: Static Arrays +-------------------------- + +Static arrays, where the data (rather than a pointer to the data) is +located in each array element, and where the array is never resized, +have not been used with RCU. Rik van Riel recommends using seqlock in +this situation, which would also have minimal read-side overhead as long +as updates are rare. + +Quick Quiz: + Why is it so important that updates be rare when using seqlock? + +:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>` + +.. _resizable_arrays: + +Situation 3: Resizable Arrays +------------------------------ + +Use of RCU for resizable arrays is demonstrated by the grow_ary() +function formerly used by the System V IPC code. The array is used +to map from semaphore, message-queue, and shared-memory IDs to the data +structure that represents the corresponding IPC construct. The grow_ary() +function does not acquire any locks; instead its caller must hold the +ids->sem semaphore. + +The grow_ary() function, shown below, does some limit checks, allocates a +new ipc_id_ary, copies the old to the new portion of the new, initializes +the remainder of the new, updates the ids->entries pointer to point to +the new array, and invokes ipc_rcu_putref() to free up the old array. +Note that rcu_assign_pointer() is used to update the ids->entries pointer, +which includes any memory barriers required on whatever architecture +you are running on:: + + static int grow_ary(struct ipc_ids* ids, int newsize) + { + struct ipc_id_ary* new; + struct ipc_id_ary* old; + int i; + int size = ids->entries->size; + + if(newsize > IPCMNI) + newsize = IPCMNI; + if(newsize <= size) + return newsize; + + new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize + + sizeof(struct ipc_id_ary)); + if(new == NULL) + return size; + new->size = newsize; + memcpy(new->p, ids->entries->p, + sizeof(struct kern_ipc_perm *)*size + + sizeof(struct ipc_id_ary)); + for(i=size;i<newsize;i++) { + new->p[i] = NULL; + } + old = ids->entries; + + /* + * Use rcu_assign_pointer() to make sure the memcpyed + * contents of the new array are visible before the new + * array becomes visible. + */ + rcu_assign_pointer(ids->entries, new); + + ipc_rcu_putref(old); + return newsize; + } + +The ipc_rcu_putref() function decrements the array's reference count +and then, if the reference count has dropped to zero, uses call_rcu() +to free the array after a grace period has elapsed. + +The array is traversed by the ipc_lock() function.
This function +indexes into the array under the protection of rcu_read_lock(), +using rcu_dereference() to pick up the pointer to the array so +that it may later safely be dereferenced -- memory barriers are +required on the Alpha CPU. Since the size of the array is stored +with the array itself, there can be no array-size mismatches, so +a simple check suffices. The pointer to the structure corresponding +to the desired IPC object is placed in "out", with NULL indicating +a non-existent entry. After acquiring "out->lock", the "out->deleted" +flag indicates whether the IPC object is in the process of being +deleted, and, if not, the pointer is returned:: + + struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) + { + struct kern_ipc_perm* out; + int lid = id % SEQ_MULTIPLIER; + struct ipc_id_ary* entries; + + rcu_read_lock(); + entries = rcu_dereference(ids->entries); + if(lid >= entries->size) { + rcu_read_unlock(); + return NULL; + } + out = entries->p[lid]; + if(out == NULL) { + rcu_read_unlock(); + return NULL; + } + spin_lock(&out->lock); + + /* ipc_rmid() may have already freed the ID while ipc_lock + * was spinning: here verify that the structure is still valid + */ + if (out->deleted) { + spin_unlock(&out->lock); + rcu_read_unlock(); + return NULL; + } + return out; + } + +.. _answer_quick_quiz_seqlock: + +Answer to Quick Quiz: + Why is it so important that updates be rare when using seqlock? + + The reason that it is important that updates be rare when + using seqlock is that frequent updates can livelock readers. + One way to avoid this problem is to assign a seqlock for + each array entry rather than to the entire array. diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt deleted file mode 100644 index f05a9afb2c39..000000000000 --- a/Documentation/RCU/arrayRCU.txt +++ /dev/null @@ -1,153 +0,0 @@ -Using RCU to Protect Read-Mostly Arrays - - -Although RCU is more commonly used to protect linked lists, it can -also be used to protect arrays. Three situations are as follows: - -1. Hash Tables - -2. Static Arrays - -3. Resizeable Arrays - -Each of these three situations involves an RCU-protected pointer to an -array that is separately indexed. It might be tempting to consider use -of RCU to instead protect the index into an array, however, this use -case is -not- supported. The problem with RCU-protected indexes into -arrays is that compilers can play way too many optimization games with -integers, which means that the rules governing handling of these indexes -are far more trouble than they are worth. If RCU-protected indexes into -arrays prove to be particularly valuable (which they have not thus far), -explicit cooperation from the compiler will be required to permit them -to be safely used. - -That aside, each of the three RCU-protected pointer situations are -described in the following sections. - - -Situation 1: Hash Tables - -Hash tables are often implemented as an array, where each array entry -has a linked-list hash chain. Each hash chain can be protected by RCU -as described in the listRCU.txt document. This approach also applies -to other array-of-list situations, such as radix trees. - - -Situation 2: Static Arrays - -Static arrays, where the data (rather than a pointer to the data) is -located in each array element, and where the array is never resized, -have not been used with RCU. Rik van Riel recommends using seqlock in -this situation, which would also have minimal read-side overhead as long -as updates are rare. 
- -Quick Quiz: Why is it so important that updates be rare when - using seqlock? - - -Situation 3: Resizeable Arrays - -Use of RCU for resizeable arrays is demonstrated by the grow_ary() -function formerly used by the System V IPC code. The array is used -to map from semaphore, message-queue, and shared-memory IDs to the data -structure that represents the corresponding IPC construct. The grow_ary() -function does not acquire any locks; instead its caller must hold the -ids->sem semaphore. - -The grow_ary() function, shown below, does some limit checks, allocates a -new ipc_id_ary, copies the old to the new portion of the new, initializes -the remainder of the new, updates the ids->entries pointer to point to -the new array, and invokes ipc_rcu_putref() to free up the old array. -Note that rcu_assign_pointer() is used to update the ids->entries pointer, -which includes any memory barriers required on whatever architecture -you are running on. - - static int grow_ary(struct ipc_ids* ids, int newsize) - { - struct ipc_id_ary* new; - struct ipc_id_ary* old; - int i; - int size = ids->entries->size; - - if(newsize > IPCMNI) - newsize = IPCMNI; - if(newsize <= size) - return newsize; - - new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize + - sizeof(struct ipc_id_ary)); - if(new == NULL) - return size; - new->size = newsize; - memcpy(new->p, ids->entries->p, - sizeof(struct kern_ipc_perm *)*size + - sizeof(struct ipc_id_ary)); - for(i=size;ip[i] = NULL; - } - old = ids->entries; - - /* - * Use rcu_assign_pointer() to make sure the memcpyed - * contents of the new array are visible before the new - * array becomes visible. - */ - rcu_assign_pointer(ids->entries, new); - - ipc_rcu_putref(old); - return newsize; - } - -The ipc_rcu_putref() function decrements the array's reference count -and then, if the reference count has dropped to zero, uses call_rcu() -to free the array after a grace period has elapsed. - -The array is traversed by the ipc_lock() function. This function -indexes into the array under the protection of rcu_read_lock(), -using rcu_dereference() to pick up the pointer to the array so -that it may later safely be dereferenced -- memory barriers are -required on the Alpha CPU. Since the size of the array is stored -with the array itself, there can be no array-size mismatches, so -a simple check suffices. The pointer to the structure corresponding -to the desired IPC object is placed in "out", with NULL indicating -a non-existent entry. After acquiring "out->lock", the "out->deleted" -flag indicates whether the IPC object is in the process of being -deleted, and, if not, the pointer is returned. - - struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) - { - struct kern_ipc_perm* out; - int lid = id % SEQ_MULTIPLIER; - struct ipc_id_ary* entries; - - rcu_read_lock(); - entries = rcu_dereference(ids->entries); - if(lid >= entries->size) { - rcu_read_unlock(); - return NULL; - } - out = entries->p[lid]; - if(out == NULL) { - rcu_read_unlock(); - return NULL; - } - spin_lock(&out->lock); - - /* ipc_rmid() may have already freed the ID while ipc_lock - * was spinning: here verify that the structure is still valid - */ - if (out->deleted) { - spin_unlock(&out->lock); - rcu_read_unlock(); - return NULL; - } - return out; - } - - -Answer to Quick Quiz: - - The reason that it is important that updates be rare when - using seqlock is that frequent updates can livelock readers. 
- One way to avoid this problem is to assign a seqlock for - each array entry rather than to the entire array. diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 5c99185710fa..8d20d44f8fd4 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -7,6 +7,7 @@ RCU concepts .. toctree:: :maxdepth: 3 + arrayRCU rcu listRCU UP -- cgit v1.2.1 From 6705cae433cffc37b183ded6ca9fe5c6d8ae8a9d Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 29 Oct 2019 03:12:52 +0530 Subject: doc: Converted NMI-RCU.txt to NMI-RCU.rst. This patch converts NMI-RCU from txt to rst format. Also adds NMI-RCU in the index.rst file. Signed-off-by: Madhuparna Bhowmik [ paulmck: Apply feedback from Phong Tran. ] Tested-by: Phong Tran Signed-off-by: Paul E. McKenney --- Documentation/RCU/NMI-RCU.rst | 124 ++++++++++++++++++++++++++++++++++++++++++ Documentation/RCU/NMI-RCU.txt | 121 ----------------------------------------- Documentation/RCU/index.rst | 1 + 3 files changed, 125 insertions(+), 121 deletions(-) create mode 100644 Documentation/RCU/NMI-RCU.rst delete mode 100644 Documentation/RCU/NMI-RCU.txt diff --git a/Documentation/RCU/NMI-RCU.rst b/Documentation/RCU/NMI-RCU.rst new file mode 100644 index 000000000000..180958388ff9 --- /dev/null +++ b/Documentation/RCU/NMI-RCU.rst @@ -0,0 +1,124 @@ +.. _NMI_rcu_doc: + +Using RCU to Protect Dynamic NMI Handlers +========================================= + + +Although RCU is usually used to protect read-mostly data structures, +it is possible to use RCU to provide dynamic non-maskable interrupt +handlers, as well as dynamic irq handlers. This document describes +how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer +work in "arch/x86/oprofile/nmi_timer_int.c" and in +"arch/x86/kernel/traps.c". + +The relevant pieces of code are listed below, each followed by a +brief explanation:: + + static int dummy_nmi_callback(struct pt_regs *regs, int cpu) + { + return 0; + } + +The dummy_nmi_callback() function is a "dummy" NMI handler that does +nothing, but returns zero, thus saying that it did nothing, allowing +the NMI handler to take the default machine-specific action:: + + static nmi_callback_t nmi_callback = dummy_nmi_callback; + +This nmi_callback variable is a global function pointer to the current +NMI handler:: + + void do_nmi(struct pt_regs * regs, long error_code) + { + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + ++nmi_count(cpu); + + if (!rcu_dereference_sched(nmi_callback)(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); + } + +The do_nmi() function processes each NMI. It first disables preemption +in the same way that a hardware irq would, then increments the per-CPU +count of NMIs. It then invokes the NMI handler stored in the nmi_callback +function pointer. If this handler returns zero, do_nmi() invokes the +default_do_nmi() function to handle a machine-specific NMI. Finally, +preemption is restored. + +In theory, rcu_dereference_sched() is not needed, since this code runs +only on i386, which in theory does not need rcu_dereference_sched() +anyway. However, in practice it is a good documentation aid, particularly +for anyone attempting to do something similar on Alpha or on systems +with aggressive optimizing compilers. + +Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? 
+ +:ref:`Answer to Quick Quiz ` + +Back to the discussion of NMI and RCU:: + + void set_nmi_callback(nmi_callback_t callback) + { + rcu_assign_pointer(nmi_callback, callback); + } + +The set_nmi_callback() function registers an NMI handler. Note that any +data that is to be used by the callback must be initialized up -before- +the call to set_nmi_callback(). On architectures that do not order +writes, the rcu_assign_pointer() ensures that the NMI handler sees the +initialized values:: + + void unset_nmi_callback(void) + { + rcu_assign_pointer(nmi_callback, dummy_nmi_callback); + } + +This function unregisters an NMI handler, restoring the original +dummy_nmi_handler(). However, there may well be an NMI handler +currently executing on some other CPU. We therefore cannot free +up any data structures used by the old NMI handler until execution +of it completes on all other CPUs. + +One way to accomplish this is via synchronize_rcu(), perhaps as +follows:: + + unset_nmi_callback(); + synchronize_rcu(); + kfree(my_nmi_data); + +This works because (as of v4.20) synchronize_rcu() blocks until all +CPUs complete any preemption-disabled segments of code that they were +executing. +Since NMI handlers disable preemption, synchronize_rcu() is guaranteed +not to return until all ongoing NMI handlers exit. It is therefore safe +to free up the handler's data as soon as synchronize_rcu() returns. + +Important note: for this to work, the architecture in question must +invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. + +.. _answer_quick_quiz_NMI: + +Answer to Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? + + The caller to set_nmi_callback() might well have + initialized some data that is to be used by the new NMI + handler. In this case, the rcu_dereference_sched() would + be needed, because otherwise a CPU that received an NMI + just after the new handler was set might see the pointer + to the new NMI handler, but the old pre-initialized + version of the handler's data. + + This same sad story can happen on other CPUs when using + a compiler with aggressive pointer-value speculation + optimizations. + + More important, the rcu_dereference_sched() makes it + clear to someone reading the code that the pointer is + being protected by RCU-sched. diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt deleted file mode 100644 index 881353fd5bff..000000000000 --- a/Documentation/RCU/NMI-RCU.txt +++ /dev/null @@ -1,121 +0,0 @@ -Using RCU to Protect Dynamic NMI Handlers - - -Although RCU is usually used to protect read-mostly data structures, -it is possible to use RCU to provide dynamic non-maskable interrupt -handlers, as well as dynamic irq handlers. This document describes -how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer -work in "arch/x86/oprofile/nmi_timer_int.c" and in -"arch/x86/kernel/traps.c". - -The relevant pieces of code are listed below, each followed by a -brief explanation. - - static int dummy_nmi_callback(struct pt_regs *regs, int cpu) - { - return 0; - } - -The dummy_nmi_callback() function is a "dummy" NMI handler that does -nothing, but returns zero, thus saying that it did nothing, allowing -the NMI handler to take the default machine-specific action. - - static nmi_callback_t nmi_callback = dummy_nmi_callback; - -This nmi_callback variable is a global function pointer to the current -NMI handler. 
- - void do_nmi(struct pt_regs * regs, long error_code) - { - int cpu; - - nmi_enter(); - - cpu = smp_processor_id(); - ++nmi_count(cpu); - - if (!rcu_dereference_sched(nmi_callback)(regs, cpu)) - default_do_nmi(regs); - - nmi_exit(); - } - -The do_nmi() function processes each NMI. It first disables preemption -in the same way that a hardware irq would, then increments the per-CPU -count of NMIs. It then invokes the NMI handler stored in the nmi_callback -function pointer. If this handler returns zero, do_nmi() invokes the -default_do_nmi() function to handle a machine-specific NMI. Finally, -preemption is restored. - -In theory, rcu_dereference_sched() is not needed, since this code runs -only on i386, which in theory does not need rcu_dereference_sched() -anyway. However, in practice it is a good documentation aid, particularly -for anyone attempting to do something similar on Alpha or on systems -with aggressive optimizing compilers. - -Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha, - given that the code referenced by the pointer is read-only? - - -Back to the discussion of NMI and RCU... - - void set_nmi_callback(nmi_callback_t callback) - { - rcu_assign_pointer(nmi_callback, callback); - } - -The set_nmi_callback() function registers an NMI handler. Note that any -data that is to be used by the callback must be initialized up -before- -the call to set_nmi_callback(). On architectures that do not order -writes, the rcu_assign_pointer() ensures that the NMI handler sees the -initialized values. - - void unset_nmi_callback(void) - { - rcu_assign_pointer(nmi_callback, dummy_nmi_callback); - } - -This function unregisters an NMI handler, restoring the original -dummy_nmi_handler(). However, there may well be an NMI handler -currently executing on some other CPU. We therefore cannot free -up any data structures used by the old NMI handler until execution -of it completes on all other CPUs. - -One way to accomplish this is via synchronize_rcu(), perhaps as -follows: - - unset_nmi_callback(); - synchronize_rcu(); - kfree(my_nmi_data); - -This works because (as of v4.20) synchronize_rcu() blocks until all -CPUs complete any preemption-disabled segments of code that they were -executing. -Since NMI handlers disable preemption, synchronize_rcu() is guaranteed -not to return until all ongoing NMI handlers exit. It is therefore safe -to free up the handler's data as soon as synchronize_rcu() returns. - -Important note: for this to work, the architecture in question must -invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. - - -Answer to Quick Quiz - - Why might the rcu_dereference_sched() be necessary on Alpha, given - that the code referenced by the pointer is read-only? - - Answer: The caller to set_nmi_callback() might well have - initialized some data that is to be used by the new NMI - handler. In this case, the rcu_dereference_sched() would - be needed, because otherwise a CPU that received an NMI - just after the new handler was set might see the pointer - to the new NMI handler, but the old pre-initialized - version of the handler's data. - - This same sad story can happen on other CPUs when using - a compiler with aggressive pointer-value speculation - optimizations. - - More important, the rcu_dereference_sched() makes it - clear to someone reading the code that the pointer is - being protected by RCU-sched. 
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 8d20d44f8fd4..627128c230dc 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -10,6 +10,7 @@ RCU concepts arrayRCU rcu listRCU + NMI-RCU UP Design/Memory-Ordering/Tree-RCU-Memory-Ordering -- cgit v1.2.1 From 5e1bc932818f74082e8331b59aa550101ead08e0 Mon Sep 17 00:00:00 2001 From: Phong Tran Date: Wed, 6 Nov 2019 20:09:50 +0700 Subject: doc: Convert whatisRCU.txt to .rst This commit updates whatisRCU.txt to the new .rst format. This change includes: - Formatting bullet lists - Adding literal blocks - Links from table of contents to corresponding sections - Links to external documents - Reformat quick quizzes Signed-off-by: Phong Tran Tested-by: Madhuparna Bhowmik [ tranmanphong: Apply Amol Grover feedback. ] Reviewed-by: Amol Grover Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/whatisRCU.rst | 1149 +++++++++++++++++++++++++++++++++++++++ Documentation/RCU/whatisRCU.txt | 1079 ------------------------------------ 3 files changed, 1150 insertions(+), 1079 deletions(-) create mode 100644 Documentation/RCU/whatisRCU.rst delete mode 100644 Documentation/RCU/whatisRCU.txt diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 627128c230dc..b9b11481c727 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -8,6 +8,7 @@ RCU concepts :maxdepth: 3 arrayRCU + whatisRCU rcu listRCU NMI-RCU diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst new file mode 100644 index 000000000000..2f6f6ebbc8b0 --- /dev/null +++ b/Documentation/RCU/whatisRCU.rst @@ -0,0 +1,1149 @@ +.. _whatisrcu_doc: + +What is RCU? -- "Read, Copy, Update" +====================================== + +Please note that the "What is RCU?" LWN series is an excellent place +to start learning about RCU: + +| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ +| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ +| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ +| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ +| 2010 Big API Table http://lwn.net/Articles/419086/ +| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ +| 2014 Big API Table http://lwn.net/Articles/609973/ + + +What is RCU? + +RCU is a synchronization mechanism that was added to the Linux kernel +during the 2.5 development effort that is optimized for read-mostly +situations. Although RCU is actually quite simple once you understand it, +getting there can sometimes be a challenge. Part of the problem is that +most of the past descriptions of RCU have been written with the mistaken +assumption that there is "one true way" to describe RCU. Instead, +the experience has been that different people must take different paths +to arrive at an understanding of RCU. This document provides several +different paths, as follows: + +:ref:`1. RCU OVERVIEW <1_whatisRCU>` + +:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>` + +:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>` + +:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>` + +:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>` + +:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>` + +:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>` + +:ref:`8. 
ANSWERS TO QUICK QUIZZES <8_whatisRCU>` + +People who prefer starting with a conceptual overview should focus on +Section 1, though most readers will profit by reading this section at +some point. People who prefer to start with an API that they can then +experiment with should focus on Section 2. People who prefer to start +with example uses should focus on Sections 3 and 4. People who need to +understand the RCU implementation should focus on Section 5, then dive +into the kernel source code. People who reason best by analogy should +focus on Section 6. Section 7 serves as an index to the docbook API +documentation, and Section 8 is the traditional answer key. + +So, start with the section that makes the most sense to you and your +preferred method of learning. If you need to know everything about +everything, feel free to read the whole thing -- but if you are really +that type of person, you have perused the source code and will therefore +never need this document anyway. ;-) + +.. _1_whatisRCU: + +1. RCU OVERVIEW +---------------- + +The basic idea behind RCU is to split updates into "removal" and +"reclamation" phases. The removal phase removes references to data items +within a data structure (possibly by replacing them with references to +new versions of these data items), and can run concurrently with readers. +The reason that it is safe to run the removal phase concurrently with +readers is the semantics of modern CPUs guarantee that readers will see +either the old or the new version of the data structure rather than a +partially updated reference. The reclamation phase does the work of reclaiming +(e.g., freeing) the data items removed from the data structure during the +removal phase. Because reclaiming data items can disrupt any readers +concurrently referencing those data items, the reclamation phase must +not start until readers no longer hold references to those data items. + +Splitting the update into removal and reclamation phases permits the +updater to perform the removal phase immediately, and to defer the +reclamation phase until all readers active during the removal phase have +completed, either by blocking until they finish or by registering a +callback that is invoked after they finish. Only readers that are active +during the removal phase need be considered, because any reader starting +after the removal phase will be unable to gain a reference to the removed +data items, and therefore cannot be disrupted by the reclamation phase. + +So the typical RCU update sequence goes something like the following: + +a. Remove pointers to a data structure, so that subsequent + readers cannot gain a reference to it. + +b. Wait for all previous readers to complete their RCU read-side + critical sections. + +c. At this point, there cannot be any readers who hold references + to the data structure, so it now may safely be reclaimed + (e.g., kfree()d). + +Step (b) above is the key idea underlying RCU's deferred destruction. +The ability to wait until all readers are done allows RCU readers to +use much lighter-weight synchronization, in some cases, absolutely no +synchronization at all. In contrast, in more conventional lock-based +schemes, readers must use heavy-weight synchronization in order to +prevent an updater from deleting the data structure out from under them. +This is because lock-based updaters typically update data items in place, +and must therefore exclude readers. 
In contrast, RCU-based updaters +typically take advantage of the fact that writes to single aligned +pointers are atomic on modern CPUs, allowing atomic insertion, removal, +and replacement of data items in a linked structure without disrupting +readers. Concurrent RCU readers can then continue accessing the old +versions, and can dispense with the atomic operations, memory barriers, +and communications cache misses that are so expensive on present-day +SMP computer systems, even in absence of lock contention. + +In the three-step procedure shown above, the updater is performing both +the removal and the reclamation step, but it is often helpful for an +entirely different thread to do the reclamation, as is in fact the case +in the Linux kernel's directory-entry cache (dcache). Even if the same +thread performs both the update step (step (a) above) and the reclamation +step (step (c) above), it is often helpful to think of them separately. +For example, RCU readers and updaters need not communicate at all, +but RCU provides implicit low-overhead communication between readers +and reclaimers, namely, in step (b) above. + +So how the heck can a reclaimer tell when a reader is done, given +that readers are not doing any sort of synchronization operations??? +Read on to learn about how RCU's API makes this easy. + +.. _2_whatisRCU: + +2. WHAT IS RCU'S CORE API? +--------------------------- + +The core RCU API is quite small: + +a. rcu_read_lock() +b. rcu_read_unlock() +c. synchronize_rcu() / call_rcu() +d. rcu_assign_pointer() +e. rcu_dereference() + +There are many other members of the RCU API, but the rest can be +expressed in terms of these five, though most implementations instead +express synchronize_rcu() in terms of the call_rcu() callback API. + +The five core RCU APIs are described below, the other 18 will be enumerated +later. See the kernel docbook documentation for more info, or look directly +at the function header comments. + +rcu_read_lock() +^^^^^^^^^^^^^^^ + void rcu_read_lock(void); + + Used by a reader to inform the reclaimer that the reader is + entering an RCU read-side critical section. It is illegal + to block while in an RCU read-side critical section, though + kernels built with CONFIG_PREEMPT_RCU can preempt RCU + read-side critical sections. Any RCU-protected data structure + accessed during an RCU read-side critical section is guaranteed to + remain unreclaimed for the full duration of that critical section. + Reference counts may be used in conjunction with RCU to maintain + longer-term references to data structures. + +rcu_read_unlock() +^^^^^^^^^^^^^^^^^ + void rcu_read_unlock(void); + + Used by a reader to inform the reclaimer that the reader is + exiting an RCU read-side critical section. Note that RCU + read-side critical sections may be nested and/or overlapping. + +synchronize_rcu() +^^^^^^^^^^^^^^^^^ + void synchronize_rcu(void); + + Marks the end of updater code and the beginning of reclaimer + code. It does this by blocking until all pre-existing RCU + read-side critical sections on all CPUs have completed. + Note that synchronize_rcu() will **not** necessarily wait for + any subsequent RCU read-side critical sections to complete. + For example, consider the following sequence of events:: + + CPU 0 CPU 1 CPU 2 + ----------------- ------------------------- --------------- + 1. rcu_read_lock() + 2. enters synchronize_rcu() + 3. rcu_read_lock() + 4. rcu_read_unlock() + 5. exits synchronize_rcu() + 6. 
rcu_read_unlock() + + To reiterate, synchronize_rcu() waits only for ongoing RCU + read-side critical sections to complete, not necessarily for + any that begin after synchronize_rcu() is invoked. + + Of course, synchronize_rcu() does not necessarily return + **immediately** after the last pre-existing RCU read-side critical + section completes. For one thing, there might well be scheduling + delays. For another thing, many RCU implementations process + requests in batches in order to improve efficiencies, which can + further delay synchronize_rcu(). + + Since synchronize_rcu() is the API that must figure out when + readers are done, its implementation is key to RCU. For RCU + to be useful in all but the most read-intensive situations, + synchronize_rcu()'s overhead must also be quite small. + + The call_rcu() API is a callback form of synchronize_rcu(), + and is described in more detail in a later section. Instead of + blocking, it registers a function and argument which are invoked + after all ongoing RCU read-side critical sections have completed. + This callback variant is particularly useful in situations where + it is illegal to block or where update-side performance is + critically important. + + However, the call_rcu() API should not be used lightly, as use + of the synchronize_rcu() API generally results in simpler code. + In addition, the synchronize_rcu() API has the nice property + of automatically limiting update rate should grace periods + be delayed. This property results in system resilience in face + of denial-of-service attacks. Code using call_rcu() should limit + update rate in order to gain this same sort of resilience. See + checklist.txt for some approaches to limiting the update rate. + +rcu_assign_pointer() +^^^^^^^^^^^^^^^^^^^^ + void rcu_assign_pointer(p, typeof(p) v); + + Yes, rcu_assign_pointer() **is** implemented as a macro, though it + would be cool to be able to declare a function in this manner. + (Compiler experts will no doubt disagree.) + + The updater uses this function to assign a new value to an + RCU-protected pointer, in order to safely communicate the change + in value from the updater to the reader. This macro does not + evaluate to an rvalue, but it does execute any memory-barrier + instructions required for a given CPU architecture. + + Perhaps just as important, it serves to document (1) which + pointers are protected by RCU and (2) the point at which a + given structure becomes accessible to other CPUs. That said, + rcu_assign_pointer() is most frequently used indirectly, via + the _rcu list-manipulation primitives such as list_add_rcu(). + +rcu_dereference() +^^^^^^^^^^^^^^^^^ + typeof(p) rcu_dereference(p); + + Like rcu_assign_pointer(), rcu_dereference() must be implemented + as a macro. + + The reader uses rcu_dereference() to fetch an RCU-protected + pointer, which returns a value that may then be safely + dereferenced. Note that rcu_dereference() does not actually + dereference the pointer, instead, it protects the pointer for + later dereferencing. It also executes any needed memory-barrier + instructions for a given CPU architecture. Currently, only Alpha + needs memory barriers within rcu_dereference() -- on other CPUs, + it compiles to nothing, not even a compiler directive. 
+ + Common coding practice uses rcu_dereference() to copy an + RCU-protected pointer to a local variable, then dereferences + this local variable, for example as follows:: + + p = rcu_dereference(head.next); + return p->data; + + However, in this case, one could just as easily combine these + into one statement:: + + return rcu_dereference(head.next)->data; + + If you are going to be fetching multiple fields from the + RCU-protected structure, using the local variable is of + course preferred. Repeated rcu_dereference() calls look + ugly, do not guarantee that the same pointer will be returned + if an update happened while in the critical section, and incur + unnecessary overhead on Alpha CPUs. + + Note that the value returned by rcu_dereference() is valid + only within the enclosing RCU read-side critical section [1]_. + For example, the following is **not** legal:: + + rcu_read_lock(); + p = rcu_dereference(head.next); + rcu_read_unlock(); + x = p->address; /* BUG!!! */ + rcu_read_lock(); + y = p->data; /* BUG!!! */ + rcu_read_unlock(); + + Holding a reference from one RCU read-side critical section + to another is just as illegal as holding a reference from + one lock-based critical section to another! Similarly, + using a reference outside of the critical section in which + it was acquired is just as illegal as doing so with normal + locking. + + As with rcu_assign_pointer(), an important function of + rcu_dereference() is to document which pointers are protected by + RCU, in particular, flagging a pointer that is subject to changing + at any time, including immediately after the rcu_dereference(). + And, again like rcu_assign_pointer(), rcu_dereference() is + typically used indirectly, via the _rcu list-manipulation + primitives, such as list_for_each_entry_rcu() [2]_. + +.. [1] The variant rcu_dereference_protected() can be used outside + of an RCU read-side critical section as long as the usage is + protected by locks acquired by the update-side code. This variant + avoids the lockdep warning that would happen when using (for + example) rcu_dereference() without rcu_read_lock() protection. + Using rcu_dereference_protected() also has the advantage + of permitting compiler optimizations that rcu_dereference() + must prohibit. The rcu_dereference_protected() variant takes + a lockdep expression to indicate which locks must be acquired + by the caller. If the indicated protection is not provided, + a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst + and the API's code comments for more details and example usage. + +.. [2] If the list_for_each_entry_rcu() instance might be used by + update-side code as well as by RCU readers, then an additional + lockdep expression can be added to its list of arguments. + For example, given an additional "lock_is_held(&mylock)" argument, + the RCU lockdep code would complain only if this instance was + invoked outside of an RCU read-side critical section and without + the protection of mylock. + +The following diagram shows how each API communicates among the +reader, updater, and reclaimer. 
+:: + + + rcu_assign_pointer() + +--------+ + +---------------------->| reader |---------+ + | +--------+ | + | | | + | | | Protect: + | | | rcu_read_lock() + | | | rcu_read_unlock() + | rcu_dereference() | | + +---------+ | | + | updater |<----------------+ | + +---------+ V + | +-----------+ + +----------------------------------->| reclaimer | + +-----------+ + Defer: + synchronize_rcu() & call_rcu() + + +The RCU infrastructure observes the time sequence of rcu_read_lock(), +rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in +order to determine when (1) synchronize_rcu() invocations may return +to their callers and (2) call_rcu() callbacks may be invoked. Efficient +implementations of the RCU infrastructure make heavy use of batching in +order to amortize their overhead over many uses of the corresponding APIs. + +There are at least three flavors of RCU usage in the Linux kernel. The diagram +above shows the most common one. On the updater side, the rcu_assign_pointer(), +sychronize_rcu() and call_rcu() primitives used are the same for all three +flavors. However for protection (on the reader side), the primitives used vary +depending on the flavor: + +a. rcu_read_lock() / rcu_read_unlock() + rcu_dereference() + +b. rcu_read_lock_bh() / rcu_read_unlock_bh() + local_bh_disable() / local_bh_enable() + rcu_dereference_bh() + +c. rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() + local_irq_save() / local_irq_restore() + hardirq enter / hardirq exit + NMI enter / NMI exit + rcu_dereference_sched() + +These three flavors are used as follows: + +a. RCU applied to normal data structures. + +b. RCU applied to networking data structures that may be subjected + to remote denial-of-service attacks. + +c. RCU applied to scheduler and interrupt/NMI-handler tasks. + +Again, most uses will be of (a). The (b) and (c) cases are important +for specialized uses, but are relatively uncommon. + +.. _3_whatisRCU: + +3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? +----------------------------------------------- + +This section shows a simple use of the core RCU API to protect a +global pointer to a dynamically allocated structure. More-typical +uses of RCU may be found in :ref:`listRCU.rst `, +:ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst `. +:: + + struct foo { + int a; + char b; + long c; + }; + DEFINE_SPINLOCK(foo_mutex); + + struct foo __rcu *gbl_foo; + + /* + * Create a new struct foo that is the same as the one currently + * pointed to by gbl_foo, except that field "a" is replaced + * with "new_a". Points gbl_foo to the new structure, and + * frees up the old structure after a grace period. + * + * Uses rcu_assign_pointer() to ensure that concurrent readers + * see the initialized version of the new structure. + * + * Uses synchronize_rcu() to ensure that any readers that might + * have references to the old structure complete before freeing + * the old structure. + */ + void foo_update_a(int new_a) + { + struct foo *new_fp; + struct foo *old_fp; + + new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); + spin_lock(&foo_mutex); + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); + *new_fp = *old_fp; + new_fp->a = new_a; + rcu_assign_pointer(gbl_foo, new_fp); + spin_unlock(&foo_mutex); + synchronize_rcu(); + kfree(old_fp); + } + + /* + * Return the value of field "a" of the current gbl_foo + * structure. 
Use rcu_read_lock() and rcu_read_unlock() + * to ensure that the structure does not get deleted out + * from under us, and use rcu_dereference() to ensure that + * we see the initialized version of the structure (important + * for DEC Alpha and for people reading the code). + */ + int foo_get_a(void) + { + int retval; + + rcu_read_lock(); + retval = rcu_dereference(gbl_foo)->a; + rcu_read_unlock(); + return retval; + } + +So, to sum up: + +- Use rcu_read_lock() and rcu_read_unlock() to guard RCU + read-side critical sections. + +- Within an RCU read-side critical section, use rcu_dereference() + to dereference RCU-protected pointers. + +- Use some solid scheme (such as locks or semaphores) to + keep concurrent updates from interfering with each other. + +- Use rcu_assign_pointer() to update an RCU-protected pointer. + This primitive protects concurrent readers from the updater, + **not** concurrent updates from each other! You therefore still + need to use locking (or something similar) to keep concurrent + rcu_assign_pointer() primitives from interfering with each other. + +- Use synchronize_rcu() **after** removing a data element from an + RCU-protected data structure, but **before** reclaiming/freeing + the data element, in order to wait for the completion of all + RCU read-side critical sections that might be referencing that + data item. + +See checklist.txt for additional rules to follow when using RCU. +And again, more-typical uses of RCU may be found in :ref:`listRCU.rst +`, :ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst +`. + +.. _4_whatisRCU: + +4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? +-------------------------------------------- + +In the example above, foo_update_a() blocks until a grace period elapses. +This is quite simple, but in some cases one cannot afford to wait so +long -- there might be other high-priority work to be done. + +In such cases, one uses call_rcu() rather than synchronize_rcu(). +The call_rcu() API is as follows:: + + void call_rcu(struct rcu_head * head, + void (*func)(struct rcu_head *head)); + +This function invokes func(head) after a grace period has elapsed. +This invocation might happen from either softirq or process context, +so the function is not permitted to block. The foo struct needs to +have an rcu_head structure added, perhaps as follows:: + + struct foo { + int a; + char b; + long c; + struct rcu_head rcu; + }; + +The foo_update_a() function might then be written as follows:: + + /* + * Create a new struct foo that is the same as the one currently + * pointed to by gbl_foo, except that field "a" is replaced + * with "new_a". Points gbl_foo to the new structure, and + * frees up the old structure after a grace period. + * + * Uses rcu_assign_pointer() to ensure that concurrent readers + * see the initialized version of the new structure. + * + * Uses call_rcu() to ensure that any readers that might have + * references to the old structure complete before freeing the + * old structure. 
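+ *
+ * Note that foo_reclaim() might be invoked from softirq context,
+ * so it must not block.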
+ */ + void foo_update_a(int new_a) + { + struct foo *new_fp; + struct foo *old_fp; + + new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); + spin_lock(&foo_mutex); + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); + *new_fp = *old_fp; + new_fp->a = new_a; + rcu_assign_pointer(gbl_foo, new_fp); + spin_unlock(&foo_mutex); + call_rcu(&old_fp->rcu, foo_reclaim); + } + +The foo_reclaim() function might appear as follows:: + + void foo_reclaim(struct rcu_head *rp) + { + struct foo *fp = container_of(rp, struct foo, rcu); + + foo_cleanup(fp->a); + + kfree(fp); + } + +The container_of() primitive is a macro that, given a pointer into a +struct, the type of the struct, and the pointed-to field within the +struct, returns a pointer to the beginning of the struct. + +The use of call_rcu() permits the caller of foo_update_a() to +immediately regain control, without needing to worry further about the +old version of the newly updated element. It also clearly shows the +RCU distinction between updater, namely foo_update_a(), and reclaimer, +namely foo_reclaim(). + +The summary of advice is the same as for the previous section, except +that we are now using call_rcu() rather than synchronize_rcu(): + +- Use call_rcu() **after** removing a data element from an + RCU-protected data structure in order to register a callback + function that will be invoked after the completion of all RCU + read-side critical sections that might be referencing that + data item. + +If the callback for call_rcu() is not doing anything more than calling +kfree() on the structure, you can use kfree_rcu() instead of call_rcu() +to avoid having to write your own callback:: + + kfree_rcu(old_fp, rcu); + +Again, see checklist.txt for additional rules governing the use of RCU. + +.. _5_whatisRCU: + +5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? +------------------------------------------------ + +One of the nice things about RCU is that it has extremely simple "toy" +implementations that are a good first step towards understanding the +production-quality implementations in the Linux kernel. This section +presents two such "toy" implementations of RCU, one that is implemented +in terms of familiar locking primitives, and another that more closely +resembles "classic" RCU. Both are way too simple for real-world use, +lacking both functionality and performance. However, they are useful +in getting a feel for how RCU works. See kernel/rcu/update.c for a +production-quality implementation, and see: + + http://www.rdrop.com/users/paulmck/RCU + +for papers describing the Linux kernel RCU implementation. The OLS'01 +and OLS'02 papers are a good introduction, and the dissertation provides +more details on the current implementation as of early 2004. + + +5A. "TOY" IMPLEMENTATION #1: LOCKING +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section presents a "toy" RCU implementation that is based on +familiar locking primitives. Its overhead makes it a non-starter for +real-life use, as does its lack of scalability. It is also unsuitable +for realtime use, since it allows scheduling latency to "bleed" from +one read-side critical section to another. It also assumes recursive +reader-writer locks: If you try this with non-recursive locks, and +you allow nested rcu_read_lock() calls, you can deadlock. + +However, it is probably the easiest implementation to relate to, so is +a good starting point. 
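+
+[To see why recursive reader-writer locks matter, consider a purely
+hypothetical nested reader along the following lines (the
+nested_reader(), do_something(), and do_something_else() names are
+made up for illustration).  Because each rcu_read_lock() in this toy
+implementation read-acquires the same global reader-writer lock, the
+nested call below is harmless with a recursive lock, but would
+self-deadlock with a non-recursive one.]::
+
+        void nested_reader(void)
+        {
+                rcu_read_lock();        /* read-acquires the global lock */
+                do_something();
+                rcu_read_lock();        /* nested read-acquisition */
+                do_something_else();
+                rcu_read_unlock();
+                rcu_read_unlock();
+        }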
+ +It is extremely simple:: + + static DEFINE_RWLOCK(rcu_gp_mutex); + + void rcu_read_lock(void) + { + read_lock(&rcu_gp_mutex); + } + + void rcu_read_unlock(void) + { + read_unlock(&rcu_gp_mutex); + } + + void synchronize_rcu(void) + { + write_lock(&rcu_gp_mutex); + smp_mb__after_spinlock(); + write_unlock(&rcu_gp_mutex); + } + +[You can ignore rcu_assign_pointer() and rcu_dereference() without missing +much. But here are simplified versions anyway. And whatever you do, +don't forget about them when submitting patches making use of RCU!]:: + + #define rcu_assign_pointer(p, v) \ + ({ \ + smp_store_release(&(p), (v)); \ + }) + + #define rcu_dereference(p) \ + ({ \ + typeof(p) _________p1 = READ_ONCE(p); \ + (_________p1); \ + }) + + +The rcu_read_lock() and rcu_read_unlock() primitive read-acquire +and release a global reader-writer lock. The synchronize_rcu() +primitive write-acquires this same lock, then releases it. This means +that once synchronize_rcu() exits, all RCU read-side critical sections +that were in progress before synchronize_rcu() was called are guaranteed +to have completed -- there is no way that synchronize_rcu() would have +been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() +promotes synchronize_rcu() to a full memory barrier in compliance with +the "Memory-Barrier Guarantees" listed in: + + Documentation/RCU/Design/Requirements/Requirements.rst + +It is possible to nest rcu_read_lock(), since reader-writer locks may +be recursively acquired. Note also that rcu_read_lock() is immune +from deadlock (an important property of RCU). The reason for this is +that the only thing that can block rcu_read_lock() is a synchronize_rcu(). +But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, +so there can be no deadlock cycle. + +.. _quiz_1: + +Quick Quiz #1: + Why is this argument naive? How could a deadlock + occur when using this algorithm in a real-world Linux + kernel? How could this deadlock be avoided? + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +5B. "TOY" EXAMPLE #2: CLASSIC RCU +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section presents a "toy" RCU implementation that is based on +"classic RCU". It is also short on performance (but only for updates) and +on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT +kernels. The definitions of rcu_dereference() and rcu_assign_pointer() +are the same as those shown in the preceding section, so they are omitted. +:: + + void rcu_read_lock(void) { } + + void rcu_read_unlock(void) { } + + void synchronize_rcu(void) + { + int cpu; + + for_each_possible_cpu(cpu) + run_on(cpu); + } + +Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing. +This is the great strength of classic RCU in a non-preemptive kernel: +read-side overhead is precisely zero, at least on non-Alpha CPUs. +And there is absolutely no way that rcu_read_lock() can possibly +participate in a deadlock cycle! + +The implementation of synchronize_rcu() simply schedules itself on each +CPU in turn. The run_on() primitive can be implemented straightforwardly +in terms of the sched_setaffinity() primitive. Of course, a somewhat less +"toy" implementation would restore the affinity upon completion rather +than just leaving all tasks running on the last CPU, but when I said +"toy", I meant **toy**! + +So how the heck is this supposed to work??? + +Remember that it is illegal to block while in an RCU read-side critical +section. 
Therefore, if a given CPU executes a context switch, we know +that it must have completed all preceding RCU read-side critical sections. +Once **all** CPUs have executed a context switch, then **all** preceding +RCU read-side critical sections will have completed. + +So, suppose that we remove a data item from its structure and then invoke +synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed +that there are no RCU read-side critical sections holding a reference +to that data item, so we can safely reclaim it. + +.. _quiz_2: + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +.. _quiz_3: + +Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in + PREEMPT_RT, where normal spinlocks can block??? + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +.. _6_whatisRCU: + +6. ANALOGY WITH READER-WRITER LOCKING +-------------------------------------- + +Although RCU can be used in many different ways, a very common use of +RCU is analogous to reader-writer locking. The following unified +diff shows how closely related RCU and reader-writer locking can be. +:: + + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + + @@ -13,15 +14,15 @@ + struct list_head *lp; + struct el *p; + + - read_lock(&listmutex); + - list_for_each_entry(p, head, lp) { + + rcu_read_lock(); + + list_for_each_entry_rcu(p, head, lp) { + if (p->key == key) { + *result = p->data; + - read_unlock(&listmutex); + + rcu_read_unlock(); + return 1; + } + } + - read_unlock(&listmutex); + + rcu_read_unlock(); + return 0; + } + + @@ -29,15 +30,16 @@ + { + struct el *p; + + - write_lock(&listmutex); + + spin_lock(&listmutex); + list_for_each_entry(p, head, lp) { + if (p->key == key) { + - list_del(&p->list); + - write_unlock(&listmutex); + + list_del_rcu(&p->list); + + spin_unlock(&listmutex); + + synchronize_rcu(); + kfree(p); + return 1; + } + } + - write_unlock(&listmutex); + + spin_unlock(&listmutex); + return 0; + } + +Or, for those who prefer a side-by-side listing:: + + 1 struct el { 1 struct el { + 2 struct list_head list; 2 struct list_head list; + 3 long key; 3 long key; + 4 spinlock_t mutex; 4 spinlock_t mutex; + 5 int data; 5 int data; + 6 /* Other data fields */ 6 /* Other data fields */ + 7 }; 7 }; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; + 9 struct el head; 9 struct el head; + +:: + + 1 int search(long key, int *result) 1 int search(long key, int *result) + 2 { 2 { + 3 struct list_head *lp; 3 struct list_head *lp; + 4 struct el *p; 4 struct el *p; + 5 5 + 6 read_lock(&listmutex); 6 rcu_read_lock(); + 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { + 8 if (p->key == key) { 8 if (p->key == key) { + 9 *result = p->data; 9 *result = p->data; + 10 read_unlock(&listmutex); 10 rcu_read_unlock(); + 11 return 1; 11 return 1; + 12 } 12 } + 13 } 13 } + 14 read_unlock(&listmutex); 14 rcu_read_unlock(); + 15 return 0; 15 return 0; + 16 } 16 } + +:: + + 1 int delete(long key) 1 int delete(long key) + 2 { 2 { + 3 struct el *p; 3 struct el *p; + 4 4 + 5 write_lock(&listmutex); 5 spin_lock(&listmutex); + 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { + 7 if (p->key == key) { 7 if (p->key == key) { + 8 list_del(&p->list); 8 list_del_rcu(&p->list); + 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); + 10 synchronize_rcu(); + 10 
kfree(p); 11 kfree(p); + 11 return 1; 12 return 1; + 12 } 13 } + 13 } 14 } + 14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); + 15 return 0; 16 return 0; + 16 } 17 } + +Either way, the differences are quite small. Read-side locking moves +to rcu_read_lock() and rcu_read_unlock, update-side locking moves from +a reader-writer lock to a simple spinlock, and a synchronize_rcu() +precedes the kfree(). + +However, there is one potential catch: the read-side and update-side +critical sections can now run concurrently. In many cases, this will +not be a problem, but it is necessary to check carefully regardless. +For example, if multiple independent list updates must be seen as +a single atomic update, converting to RCU will require special care. + +Also, the presence of synchronize_rcu() means that the RCU version of +delete() can now block. If this is a problem, there is a callback-based +mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can +be used in place of synchronize_rcu(). + +.. _7_whatisRCU: + +7. FULL LIST OF RCU APIs +------------------------- + +The RCU APIs are documented in docbook-format header comments in the +Linux-kernel source code, but it helps to have a full list of the +APIs, since there does not appear to be a way to categorize them +in docbook. Here is the list, by category. + +RCU list traversal:: + + list_entry_rcu + list_first_entry_rcu + list_next_rcu + list_for_each_entry_rcu + list_for_each_entry_continue_rcu + list_for_each_entry_from_rcu + hlist_first_rcu + hlist_next_rcu + hlist_pprev_rcu + hlist_for_each_entry_rcu + hlist_for_each_entry_rcu_bh + hlist_for_each_entry_from_rcu + hlist_for_each_entry_continue_rcu + hlist_for_each_entry_continue_rcu_bh + hlist_nulls_first_rcu + hlist_nulls_for_each_entry_rcu + hlist_bl_first_rcu + hlist_bl_for_each_entry_rcu + +RCU pointer/list udate:: + + rcu_assign_pointer + list_add_rcu + list_add_tail_rcu + list_del_rcu + list_replace_rcu + hlist_add_behind_rcu + hlist_add_before_rcu + hlist_add_head_rcu + hlist_del_rcu + hlist_del_init_rcu + hlist_replace_rcu + list_splice_init_rcu() + hlist_nulls_del_init_rcu + hlist_nulls_del_rcu + hlist_nulls_add_head_rcu + hlist_bl_add_head_rcu + hlist_bl_del_init_rcu + hlist_bl_del_rcu + hlist_bl_set_first_rcu + +RCU:: + + Critical sections Grace period Barrier + + rcu_read_lock synchronize_net rcu_barrier + rcu_read_unlock synchronize_rcu + rcu_dereference synchronize_rcu_expedited + rcu_read_lock_held call_rcu + rcu_dereference_check kfree_rcu + rcu_dereference_protected + +bh:: + + Critical sections Grace period Barrier + + rcu_read_lock_bh call_rcu rcu_barrier + rcu_read_unlock_bh synchronize_rcu + [local_bh_disable] synchronize_rcu_expedited + [and friends] + rcu_dereference_bh + rcu_dereference_bh_check + rcu_dereference_bh_protected + rcu_read_lock_bh_held + +sched:: + + Critical sections Grace period Barrier + + rcu_read_lock_sched call_rcu rcu_barrier + rcu_read_unlock_sched synchronize_rcu + [preempt_disable] synchronize_rcu_expedited + [and friends] + rcu_read_lock_sched_notrace + rcu_read_unlock_sched_notrace + rcu_dereference_sched + rcu_dereference_sched_check + rcu_dereference_sched_protected + rcu_read_lock_sched_held + + +SRCU:: + + Critical sections Grace period Barrier + + srcu_read_lock call_srcu srcu_barrier + srcu_read_unlock synchronize_srcu + srcu_dereference synchronize_srcu_expedited + srcu_dereference_check + srcu_read_lock_held + +SRCU: Initialization/cleanup:: + + DEFINE_SRCU + DEFINE_STATIC_SRCU + init_srcu_struct + 
cleanup_srcu_struct + +All: lockdep-checked RCU-protected pointer access:: + + rcu_access_pointer + rcu_dereference_raw + RCU_LOCKDEP_WARN + rcu_sleep_check + RCU_NONIDLE + +See the comment headers in the source code (or the docbook generated +from them) for more information. + +However, given that there are no fewer than four families of RCU APIs +in the Linux kernel, how do you choose which one to use? The following +list can be helpful: + +a. Will readers need to block? If so, you need SRCU. + +b. What about the -rt patchset? If readers would need to block + in an non-rt kernel, you need SRCU. If readers would block + in a -rt kernel, but not in a non-rt kernel, SRCU is not + necessary. (The -rt patchset turns spinlocks into sleeplocks, + hence this distinction.) + +c. Do you need to treat NMI handlers, hardirq handlers, + and code segments with preemption disabled (whether + via preempt_disable(), local_irq_save(), local_bh_disable(), + or some other mechanism) as if they were explicit RCU readers? + If so, RCU-sched is the only choice that will work for you. + +d. Do you need RCU grace periods to complete even in the face + of softirq monopolization of one or more of the CPUs? For + example, is your code subject to network-based denial-of-service + attacks? If so, you should disable softirq across your readers, + for example, by using rcu_read_lock_bh(). + +e. Is your workload too update-intensive for normal use of + RCU, but inappropriate for other synchronization mechanisms? + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! + +f. Do you need read-side critical sections that are respected + even though they are in the middle of the idle loop, during + user-mode execution, or on an offlined CPU? If so, SRCU is the + only choice that will work for you. + +g. Otherwise, use RCU. + +Of course, this all assumes that you have determined that RCU is in fact +the right tool for your job. + +.. _8_whatisRCU: + +8. ANSWERS TO QUICK QUIZZES +---------------------------- + +Quick Quiz #1: + Why is this argument naive? How could a deadlock + occur when using this algorithm in a real-world Linux + kernel? [Referring to the lock-based "toy" RCU + algorithm.] + +Answer: + Consider the following sequence of events: + + 1. CPU 0 acquires some unrelated lock, call it + "problematic_lock", disabling irq via + spin_lock_irqsave(). + + 2. CPU 1 enters synchronize_rcu(), write-acquiring + rcu_gp_mutex. + + 3. CPU 0 enters rcu_read_lock(), but must wait + because CPU 1 holds rcu_gp_mutex. + + 4. CPU 1 is interrupted, and the irq handler + attempts to acquire problematic_lock. + + The system is now deadlocked. + + One way to avoid this deadlock is to use an approach like + that of CONFIG_PREEMPT_RT, where all normal spinlocks + become blocking locks, and all irq handlers execute in + the context of special tasks. In this case, in step 4 + above, the irq handler would block, allowing CPU 1 to + release rcu_gp_mutex, avoiding the deadlock. + + Even in the absence of deadlock, this RCU implementation + allows latency to "bleed" from readers to other + readers through synchronize_rcu(). To see this, + consider task A in an RCU read-side critical section + (thus read-holding rcu_gp_mutex), task B blocked + attempting to write-acquire rcu_gp_mutex, and + task C blocked in rcu_read_lock() attempting to + read_acquire rcu_gp_mutex. Task A's RCU read-side + latency is holding up task C, albeit indirectly via + task B. 
+ + Realtime RCU implementations therefore use a counter-based + approach where tasks in RCU read-side critical sections + cannot be blocked by tasks executing synchronize_rcu(). + +:ref:`Back to Quick Quiz #1 ` + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. + +Answer: + Imagine a single-CPU system with a non-CONFIG_PREEMPT + kernel where a routing table is used by process-context + code, but can be updated by irq-context code (for example, + by an "ICMP REDIRECT" packet). The usual way of handling + this would be to have the process-context code disable + interrupts while searching the routing table. Use of + RCU allows such interrupt-disabling to be dispensed with. + Thus, without RCU, you pay the cost of disabling interrupts, + and with RCU you don't. + + One can argue that the overhead of RCU in this + case is negative with respect to the single-CPU + interrupt-disabling approach. Others might argue that + the overhead of RCU is merely zero, and that replacing + the positive overhead of the interrupt-disabling scheme + with the zero-overhead RCU scheme does not constitute + negative overhead. + + In real life, of course, things are more complex. But + even the theoretical possibility of negative overhead for + a synchronization primitive is a bit unexpected. ;-) + +:ref:`Back to Quick Quiz #2 ` + +Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in + PREEMPT_RT, where normal spinlocks can block??? + +Answer: + Just as PREEMPT_RT permits preemption of spinlock + critical sections, it permits preemption of RCU + read-side critical sections. It also permits + spinlocks blocking while in RCU read-side critical + sections. + + Why the apparent inconsistency? Because it is + possible to use priority boosting to keep the RCU + grace periods short if need be (for example, if running + short of memory). In contrast, if blocking waiting + for (say) network reception, there is no way to know + what should be boosted. Especially given that the + process we need to boost might well be a human being + who just went out for a pizza or something. And although + a computer-operated cattle prod might arouse serious + interest, it might also provoke serious objections. + Besides, how does the computer know what pizza parlor + the human being went to??? + +:ref:`Back to Quick Quiz #3 ` + +ACKNOWLEDGEMENTS + +My thanks to the people who helped make this human-readable, including +Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern. + + +For more information, see http://www.rdrop.com/users/paulmck/RCU. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt deleted file mode 100644 index 58ba05c4d97f..000000000000 --- a/Documentation/RCU/whatisRCU.txt +++ /dev/null @@ -1,1079 +0,0 @@ -What is RCU? -- "Read, Copy, Update" - -Please note that the "What is RCU?" LWN series is an excellent place -to start learning about RCU: - -1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ -2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ -3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ -4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ - 2010 Big API Table http://lwn.net/Articles/419086/ -5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ - 2014 Big API Table http://lwn.net/Articles/609973/ - - -What is RCU? 
- -RCU is a synchronization mechanism that was added to the Linux kernel -during the 2.5 development effort that is optimized for read-mostly -situations. Although RCU is actually quite simple once you understand it, -getting there can sometimes be a challenge. Part of the problem is that -most of the past descriptions of RCU have been written with the mistaken -assumption that there is "one true way" to describe RCU. Instead, -the experience has been that different people must take different paths -to arrive at an understanding of RCU. This document provides several -different paths, as follows: - -1. RCU OVERVIEW -2. WHAT IS RCU'S CORE API? -3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? -4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? -5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? -6. ANALOGY WITH READER-WRITER LOCKING -7. FULL LIST OF RCU APIs -8. ANSWERS TO QUICK QUIZZES - -People who prefer starting with a conceptual overview should focus on -Section 1, though most readers will profit by reading this section at -some point. People who prefer to start with an API that they can then -experiment with should focus on Section 2. People who prefer to start -with example uses should focus on Sections 3 and 4. People who need to -understand the RCU implementation should focus on Section 5, then dive -into the kernel source code. People who reason best by analogy should -focus on Section 6. Section 7 serves as an index to the docbook API -documentation, and Section 8 is the traditional answer key. - -So, start with the section that makes the most sense to you and your -preferred method of learning. If you need to know everything about -everything, feel free to read the whole thing -- but if you are really -that type of person, you have perused the source code and will therefore -never need this document anyway. ;-) - - -1. RCU OVERVIEW - -The basic idea behind RCU is to split updates into "removal" and -"reclamation" phases. The removal phase removes references to data items -within a data structure (possibly by replacing them with references to -new versions of these data items), and can run concurrently with readers. -The reason that it is safe to run the removal phase concurrently with -readers is the semantics of modern CPUs guarantee that readers will see -either the old or the new version of the data structure rather than a -partially updated reference. The reclamation phase does the work of reclaiming -(e.g., freeing) the data items removed from the data structure during the -removal phase. Because reclaiming data items can disrupt any readers -concurrently referencing those data items, the reclamation phase must -not start until readers no longer hold references to those data items. - -Splitting the update into removal and reclamation phases permits the -updater to perform the removal phase immediately, and to defer the -reclamation phase until all readers active during the removal phase have -completed, either by blocking until they finish or by registering a -callback that is invoked after they finish. Only readers that are active -during the removal phase need be considered, because any reader starting -after the removal phase will be unable to gain a reference to the removed -data items, and therefore cannot be disrupted by the reclamation phase. - -So the typical RCU update sequence goes something like the following: - -a. Remove pointers to a data structure, so that subsequent - readers cannot gain a reference to it. - -b. 
Wait for all previous readers to complete their RCU read-side - critical sections. - -c. At this point, there cannot be any readers who hold references - to the data structure, so it now may safely be reclaimed - (e.g., kfree()d). - -Step (b) above is the key idea underlying RCU's deferred destruction. -The ability to wait until all readers are done allows RCU readers to -use much lighter-weight synchronization, in some cases, absolutely no -synchronization at all. In contrast, in more conventional lock-based -schemes, readers must use heavy-weight synchronization in order to -prevent an updater from deleting the data structure out from under them. -This is because lock-based updaters typically update data items in place, -and must therefore exclude readers. In contrast, RCU-based updaters -typically take advantage of the fact that writes to single aligned -pointers are atomic on modern CPUs, allowing atomic insertion, removal, -and replacement of data items in a linked structure without disrupting -readers. Concurrent RCU readers can then continue accessing the old -versions, and can dispense with the atomic operations, memory barriers, -and communications cache misses that are so expensive on present-day -SMP computer systems, even in absence of lock contention. - -In the three-step procedure shown above, the updater is performing both -the removal and the reclamation step, but it is often helpful for an -entirely different thread to do the reclamation, as is in fact the case -in the Linux kernel's directory-entry cache (dcache). Even if the same -thread performs both the update step (step (a) above) and the reclamation -step (step (c) above), it is often helpful to think of them separately. -For example, RCU readers and updaters need not communicate at all, -but RCU provides implicit low-overhead communication between readers -and reclaimers, namely, in step (b) above. - -So how the heck can a reclaimer tell when a reader is done, given -that readers are not doing any sort of synchronization operations??? -Read on to learn about how RCU's API makes this easy. - - -2. WHAT IS RCU'S CORE API? - -The core RCU API is quite small: - -a. rcu_read_lock() -b. rcu_read_unlock() -c. synchronize_rcu() / call_rcu() -d. rcu_assign_pointer() -e. rcu_dereference() - -There are many other members of the RCU API, but the rest can be -expressed in terms of these five, though most implementations instead -express synchronize_rcu() in terms of the call_rcu() callback API. - -The five core RCU APIs are described below, the other 18 will be enumerated -later. See the kernel docbook documentation for more info, or look directly -at the function header comments. - -rcu_read_lock() - - void rcu_read_lock(void); - - Used by a reader to inform the reclaimer that the reader is - entering an RCU read-side critical section. It is illegal - to block while in an RCU read-side critical section, though - kernels built with CONFIG_PREEMPT_RCU can preempt RCU - read-side critical sections. Any RCU-protected data structure - accessed during an RCU read-side critical section is guaranteed to - remain unreclaimed for the full duration of that critical section. - Reference counts may be used in conjunction with RCU to maintain - longer-term references to data structures. - -rcu_read_unlock() - - void rcu_read_unlock(void); - - Used by a reader to inform the reclaimer that the reader is - exiting an RCU read-side critical section. Note that RCU - read-side critical sections may be nested and/or overlapping. 
- -synchronize_rcu() - - void synchronize_rcu(void); - - Marks the end of updater code and the beginning of reclaimer - code. It does this by blocking until all pre-existing RCU - read-side critical sections on all CPUs have completed. - Note that synchronize_rcu() will -not- necessarily wait for - any subsequent RCU read-side critical sections to complete. - For example, consider the following sequence of events: - - CPU 0 CPU 1 CPU 2 - ----------------- ------------------------- --------------- - 1. rcu_read_lock() - 2. enters synchronize_rcu() - 3. rcu_read_lock() - 4. rcu_read_unlock() - 5. exits synchronize_rcu() - 6. rcu_read_unlock() - - To reiterate, synchronize_rcu() waits only for ongoing RCU - read-side critical sections to complete, not necessarily for - any that begin after synchronize_rcu() is invoked. - - Of course, synchronize_rcu() does not necessarily return - -immediately- after the last pre-existing RCU read-side critical - section completes. For one thing, there might well be scheduling - delays. For another thing, many RCU implementations process - requests in batches in order to improve efficiencies, which can - further delay synchronize_rcu(). - - Since synchronize_rcu() is the API that must figure out when - readers are done, its implementation is key to RCU. For RCU - to be useful in all but the most read-intensive situations, - synchronize_rcu()'s overhead must also be quite small. - - The call_rcu() API is a callback form of synchronize_rcu(), - and is described in more detail in a later section. Instead of - blocking, it registers a function and argument which are invoked - after all ongoing RCU read-side critical sections have completed. - This callback variant is particularly useful in situations where - it is illegal to block or where update-side performance is - critically important. - - However, the call_rcu() API should not be used lightly, as use - of the synchronize_rcu() API generally results in simpler code. - In addition, the synchronize_rcu() API has the nice property - of automatically limiting update rate should grace periods - be delayed. This property results in system resilience in face - of denial-of-service attacks. Code using call_rcu() should limit - update rate in order to gain this same sort of resilience. See - checklist.txt for some approaches to limiting the update rate. - -rcu_assign_pointer() - - void rcu_assign_pointer(p, typeof(p) v); - - Yes, rcu_assign_pointer() -is- implemented as a macro, though it - would be cool to be able to declare a function in this manner. - (Compiler experts will no doubt disagree.) - - The updater uses this function to assign a new value to an - RCU-protected pointer, in order to safely communicate the change - in value from the updater to the reader. This macro does not - evaluate to an rvalue, but it does execute any memory-barrier - instructions required for a given CPU architecture. - - Perhaps just as important, it serves to document (1) which - pointers are protected by RCU and (2) the point at which a - given structure becomes accessible to other CPUs. That said, - rcu_assign_pointer() is most frequently used indirectly, via - the _rcu list-manipulation primitives such as list_add_rcu(). - -rcu_dereference() - - typeof(p) rcu_dereference(p); - - Like rcu_assign_pointer(), rcu_dereference() must be implemented - as a macro. - - The reader uses rcu_dereference() to fetch an RCU-protected - pointer, which returns a value that may then be safely - dereferenced. 
Note that rcu_dereference() does not actually - dereference the pointer, instead, it protects the pointer for - later dereferencing. It also executes any needed memory-barrier - instructions for a given CPU architecture. Currently, only Alpha - needs memory barriers within rcu_dereference() -- on other CPUs, - it compiles to nothing, not even a compiler directive. - - Common coding practice uses rcu_dereference() to copy an - RCU-protected pointer to a local variable, then dereferences - this local variable, for example as follows: - - p = rcu_dereference(head.next); - return p->data; - - However, in this case, one could just as easily combine these - into one statement: - - return rcu_dereference(head.next)->data; - - If you are going to be fetching multiple fields from the - RCU-protected structure, using the local variable is of - course preferred. Repeated rcu_dereference() calls look - ugly, do not guarantee that the same pointer will be returned - if an update happened while in the critical section, and incur - unnecessary overhead on Alpha CPUs. - - Note that the value returned by rcu_dereference() is valid - only within the enclosing RCU read-side critical section [1]. - For example, the following is -not- legal: - - rcu_read_lock(); - p = rcu_dereference(head.next); - rcu_read_unlock(); - x = p->address; /* BUG!!! */ - rcu_read_lock(); - y = p->data; /* BUG!!! */ - rcu_read_unlock(); - - Holding a reference from one RCU read-side critical section - to another is just as illegal as holding a reference from - one lock-based critical section to another! Similarly, - using a reference outside of the critical section in which - it was acquired is just as illegal as doing so with normal - locking. - - As with rcu_assign_pointer(), an important function of - rcu_dereference() is to document which pointers are protected by - RCU, in particular, flagging a pointer that is subject to changing - at any time, including immediately after the rcu_dereference(). - And, again like rcu_assign_pointer(), rcu_dereference() is - typically used indirectly, via the _rcu list-manipulation - primitives, such as list_for_each_entry_rcu() [2]. - - [1] The variant rcu_dereference_protected() can be used outside - of an RCU read-side critical section as long as the usage is - protected by locks acquired by the update-side code. This variant - avoids the lockdep warning that would happen when using (for - example) rcu_dereference() without rcu_read_lock() protection. - Using rcu_dereference_protected() also has the advantage - of permitting compiler optimizations that rcu_dereference() - must prohibit. The rcu_dereference_protected() variant takes - a lockdep expression to indicate which locks must be acquired - by the caller. If the indicated protection is not provided, - a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst - and the API's code comments for more details and example usage. - - [2] If the list_for_each_entry_rcu() instance might be used by - update-side code as well as by RCU readers, then an additional - lockdep expression can be added to its list of arguments. - For example, given an additional "lock_is_held(&mylock)" argument, - the RCU lockdep code would complain only if this instance was - invoked outside of an RCU read-side critical section and without - the protection of mylock. - -The following diagram shows how each API communicates among the -reader, updater, and reclaimer. 
- - - rcu_assign_pointer() - +--------+ - +---------------------->| reader |---------+ - | +--------+ | - | | | - | | | Protect: - | | | rcu_read_lock() - | | | rcu_read_unlock() - | rcu_dereference() | | - +---------+ | | - | updater |<----------------+ | - +---------+ V - | +-----------+ - +----------------------------------->| reclaimer | - +-----------+ - Defer: - synchronize_rcu() & call_rcu() - - -The RCU infrastructure observes the time sequence of rcu_read_lock(), -rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in -order to determine when (1) synchronize_rcu() invocations may return -to their callers and (2) call_rcu() callbacks may be invoked. Efficient -implementations of the RCU infrastructure make heavy use of batching in -order to amortize their overhead over many uses of the corresponding APIs. - -There are at least three flavors of RCU usage in the Linux kernel. The diagram -above shows the most common one. On the updater side, the rcu_assign_pointer(), -sychronize_rcu() and call_rcu() primitives used are the same for all three -flavors. However for protection (on the reader side), the primitives used vary -depending on the flavor: - -a. rcu_read_lock() / rcu_read_unlock() - rcu_dereference() - -b. rcu_read_lock_bh() / rcu_read_unlock_bh() - local_bh_disable() / local_bh_enable() - rcu_dereference_bh() - -c. rcu_read_lock_sched() / rcu_read_unlock_sched() - preempt_disable() / preempt_enable() - local_irq_save() / local_irq_restore() - hardirq enter / hardirq exit - NMI enter / NMI exit - rcu_dereference_sched() - -These three flavors are used as follows: - -a. RCU applied to normal data structures. - -b. RCU applied to networking data structures that may be subjected - to remote denial-of-service attacks. - -c. RCU applied to scheduler and interrupt/NMI-handler tasks. - -Again, most uses will be of (a). The (b) and (c) cases are important -for specialized uses, but are relatively uncommon. - - -3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? - -This section shows a simple use of the core RCU API to protect a -global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. - - struct foo { - int a; - char b; - long c; - }; - DEFINE_SPINLOCK(foo_mutex); - - struct foo __rcu *gbl_foo; - - /* - * Create a new struct foo that is the same as the one currently - * pointed to by gbl_foo, except that field "a" is replaced - * with "new_a". Points gbl_foo to the new structure, and - * frees up the old structure after a grace period. - * - * Uses rcu_assign_pointer() to ensure that concurrent readers - * see the initialized version of the new structure. - * - * Uses synchronize_rcu() to ensure that any readers that might - * have references to the old structure complete before freeing - * the old structure. - */ - void foo_update_a(int new_a) - { - struct foo *new_fp; - struct foo *old_fp; - - new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); - spin_lock(&foo_mutex); - old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); - *new_fp = *old_fp; - new_fp->a = new_a; - rcu_assign_pointer(gbl_foo, new_fp); - spin_unlock(&foo_mutex); - synchronize_rcu(); - kfree(old_fp); - } - - /* - * Return the value of field "a" of the current gbl_foo - * structure. 
Use rcu_read_lock() and rcu_read_unlock() - * to ensure that the structure does not get deleted out - * from under us, and use rcu_dereference() to ensure that - * we see the initialized version of the structure (important - * for DEC Alpha and for people reading the code). - */ - int foo_get_a(void) - { - int retval; - - rcu_read_lock(); - retval = rcu_dereference(gbl_foo)->a; - rcu_read_unlock(); - return retval; - } - -So, to sum up: - -o Use rcu_read_lock() and rcu_read_unlock() to guard RCU - read-side critical sections. - -o Within an RCU read-side critical section, use rcu_dereference() - to dereference RCU-protected pointers. - -o Use some solid scheme (such as locks or semaphores) to - keep concurrent updates from interfering with each other. - -o Use rcu_assign_pointer() to update an RCU-protected pointer. - This primitive protects concurrent readers from the updater, - -not- concurrent updates from each other! You therefore still - need to use locking (or something similar) to keep concurrent - rcu_assign_pointer() primitives from interfering with each other. - -o Use synchronize_rcu() -after- removing a data element from an - RCU-protected data structure, but -before- reclaiming/freeing - the data element, in order to wait for the completion of all - RCU read-side critical sections that might be referencing that - data item. - -See checklist.txt for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in listRCU.txt, -arrayRCU.txt, and NMI-RCU.txt. - - -4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? - -In the example above, foo_update_a() blocks until a grace period elapses. -This is quite simple, but in some cases one cannot afford to wait so -long -- there might be other high-priority work to be done. - -In such cases, one uses call_rcu() rather than synchronize_rcu(). -The call_rcu() API is as follows: - - void call_rcu(struct rcu_head * head, - void (*func)(struct rcu_head *head)); - -This function invokes func(head) after a grace period has elapsed. -This invocation might happen from either softirq or process context, -so the function is not permitted to block. The foo struct needs to -have an rcu_head structure added, perhaps as follows: - - struct foo { - int a; - char b; - long c; - struct rcu_head rcu; - }; - -The foo_update_a() function might then be written as follows: - - /* - * Create a new struct foo that is the same as the one currently - * pointed to by gbl_foo, except that field "a" is replaced - * with "new_a". Points gbl_foo to the new structure, and - * frees up the old structure after a grace period. - * - * Uses rcu_assign_pointer() to ensure that concurrent readers - * see the initialized version of the new structure. - * - * Uses call_rcu() to ensure that any readers that might have - * references to the old structure complete before freeing the - * old structure. 
- */ - void foo_update_a(int new_a) - { - struct foo *new_fp; - struct foo *old_fp; - - new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); - spin_lock(&foo_mutex); - old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); - *new_fp = *old_fp; - new_fp->a = new_a; - rcu_assign_pointer(gbl_foo, new_fp); - spin_unlock(&foo_mutex); - call_rcu(&old_fp->rcu, foo_reclaim); - } - -The foo_reclaim() function might appear as follows: - - void foo_reclaim(struct rcu_head *rp) - { - struct foo *fp = container_of(rp, struct foo, rcu); - - foo_cleanup(fp->a); - - kfree(fp); - } - -The container_of() primitive is a macro that, given a pointer into a -struct, the type of the struct, and the pointed-to field within the -struct, returns a pointer to the beginning of the struct. - -The use of call_rcu() permits the caller of foo_update_a() to -immediately regain control, without needing to worry further about the -old version of the newly updated element. It also clearly shows the -RCU distinction between updater, namely foo_update_a(), and reclaimer, -namely foo_reclaim(). - -The summary of advice is the same as for the previous section, except -that we are now using call_rcu() rather than synchronize_rcu(): - -o Use call_rcu() -after- removing a data element from an - RCU-protected data structure in order to register a callback - function that will be invoked after the completion of all RCU - read-side critical sections that might be referencing that - data item. - -If the callback for call_rcu() is not doing anything more than calling -kfree() on the structure, you can use kfree_rcu() instead of call_rcu() -to avoid having to write your own callback: - - kfree_rcu(old_fp, rcu); - -Again, see checklist.txt for additional rules governing the use of RCU. - - -5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? - -One of the nice things about RCU is that it has extremely simple "toy" -implementations that are a good first step towards understanding the -production-quality implementations in the Linux kernel. This section -presents two such "toy" implementations of RCU, one that is implemented -in terms of familiar locking primitives, and another that more closely -resembles "classic" RCU. Both are way too simple for real-world use, -lacking both functionality and performance. However, they are useful -in getting a feel for how RCU works. See kernel/rcu/update.c for a -production-quality implementation, and see: - - http://www.rdrop.com/users/paulmck/RCU - -for papers describing the Linux kernel RCU implementation. The OLS'01 -and OLS'02 papers are a good introduction, and the dissertation provides -more details on the current implementation as of early 2004. - - -5A. "TOY" IMPLEMENTATION #1: LOCKING - -This section presents a "toy" RCU implementation that is based on -familiar locking primitives. Its overhead makes it a non-starter for -real-life use, as does its lack of scalability. It is also unsuitable -for realtime use, since it allows scheduling latency to "bleed" from -one read-side critical section to another. It also assumes recursive -reader-writer locks: If you try this with non-recursive locks, and -you allow nested rcu_read_lock() calls, you can deadlock. - -However, it is probably the easiest implementation to relate to, so is -a good starting point. 
- -It is extremely simple: - - static DEFINE_RWLOCK(rcu_gp_mutex); - - void rcu_read_lock(void) - { - read_lock(&rcu_gp_mutex); - } - - void rcu_read_unlock(void) - { - read_unlock(&rcu_gp_mutex); - } - - void synchronize_rcu(void) - { - write_lock(&rcu_gp_mutex); - smp_mb__after_spinlock(); - write_unlock(&rcu_gp_mutex); - } - -[You can ignore rcu_assign_pointer() and rcu_dereference() without missing -much. But here are simplified versions anyway. And whatever you do, -don't forget about them when submitting patches making use of RCU!] - - #define rcu_assign_pointer(p, v) \ - ({ \ - smp_store_release(&(p), (v)); \ - }) - - #define rcu_dereference(p) \ - ({ \ - typeof(p) _________p1 = READ_ONCE(p); \ - (_________p1); \ - }) - - -The rcu_read_lock() and rcu_read_unlock() primitive read-acquire -and release a global reader-writer lock. The synchronize_rcu() -primitive write-acquires this same lock, then releases it. This means -that once synchronize_rcu() exits, all RCU read-side critical sections -that were in progress before synchronize_rcu() was called are guaranteed -to have completed -- there is no way that synchronize_rcu() would have -been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() -promotes synchronize_rcu() to a full memory barrier in compliance with -the "Memory-Barrier Guarantees" listed in: - - Documentation/RCU/Design/Requirements/Requirements.rst - -It is possible to nest rcu_read_lock(), since reader-writer locks may -be recursively acquired. Note also that rcu_read_lock() is immune -from deadlock (an important property of RCU). The reason for this is -that the only thing that can block rcu_read_lock() is a synchronize_rcu(). -But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, -so there can be no deadlock cycle. - -Quick Quiz #1: Why is this argument naive? How could a deadlock - occur when using this algorithm in a real-world Linux - kernel? How could this deadlock be avoided? - - -5B. "TOY" EXAMPLE #2: CLASSIC RCU - -This section presents a "toy" RCU implementation that is based on -"classic RCU". It is also short on performance (but only for updates) and -on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT -kernels. The definitions of rcu_dereference() and rcu_assign_pointer() -are the same as those shown in the preceding section, so they are omitted. - - void rcu_read_lock(void) { } - - void rcu_read_unlock(void) { } - - void synchronize_rcu(void) - { - int cpu; - - for_each_possible_cpu(cpu) - run_on(cpu); - } - -Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing. -This is the great strength of classic RCU in a non-preemptive kernel: -read-side overhead is precisely zero, at least on non-Alpha CPUs. -And there is absolutely no way that rcu_read_lock() can possibly -participate in a deadlock cycle! - -The implementation of synchronize_rcu() simply schedules itself on each -CPU in turn. The run_on() primitive can be implemented straightforwardly -in terms of the sched_setaffinity() primitive. Of course, a somewhat less -"toy" implementation would restore the affinity upon completion rather -than just leaving all tasks running on the last CPU, but when I said -"toy", I meant -toy-! - -So how the heck is this supposed to work??? - -Remember that it is illegal to block while in an RCU read-side critical -section. Therefore, if a given CPU executes a context switch, we know -that it must have completed all preceding RCU read-side critical sections. 
-Once -all- CPUs have executed a context switch, then -all- preceding -RCU read-side critical sections will have completed. - -So, suppose that we remove a data item from its structure and then invoke -synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed -that there are no RCU read-side critical sections holding a reference -to that data item, so we can safely reclaim it. - -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. - -Quick Quiz #3: If it is illegal to block in an RCU read-side - critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? - - -6. ANALOGY WITH READER-WRITER LOCKING - -Although RCU can be used in many different ways, a very common use of -RCU is analogous to reader-writer locking. The following unified -diff shows how closely related RCU and reader-writer locking can be. - - @@ -5,5 +5,5 @@ struct el { - int data; - /* Other data fields */ - }; - -rwlock_t listmutex; - +spinlock_t listmutex; - struct el head; - - @@ -13,15 +14,15 @@ - struct list_head *lp; - struct el *p; - - - read_lock(&listmutex); - - list_for_each_entry(p, head, lp) { - + rcu_read_lock(); - + list_for_each_entry_rcu(p, head, lp) { - if (p->key == key) { - *result = p->data; - - read_unlock(&listmutex); - + rcu_read_unlock(); - return 1; - } - } - - read_unlock(&listmutex); - + rcu_read_unlock(); - return 0; - } - - @@ -29,15 +30,16 @@ - { - struct el *p; - - - write_lock(&listmutex); - + spin_lock(&listmutex); - list_for_each_entry(p, head, lp) { - if (p->key == key) { - - list_del(&p->list); - - write_unlock(&listmutex); - + list_del_rcu(&p->list); - + spin_unlock(&listmutex); - + synchronize_rcu(); - kfree(p); - return 1; - } - } - - write_unlock(&listmutex); - + spin_unlock(&listmutex); - return 0; - } - -Or, for those who prefer a side-by-side listing: - - 1 struct el { 1 struct el { - 2 struct list_head list; 2 struct list_head list; - 3 long key; 3 long key; - 4 spinlock_t mutex; 4 spinlock_t mutex; - 5 int data; 5 int data; - 6 /* Other data fields */ 6 /* Other data fields */ - 7 }; 7 }; - 8 rwlock_t listmutex; 8 spinlock_t listmutex; - 9 struct el head; 9 struct el head; - - 1 int search(long key, int *result) 1 int search(long key, int *result) - 2 { 2 { - 3 struct list_head *lp; 3 struct list_head *lp; - 4 struct el *p; 4 struct el *p; - 5 5 - 6 read_lock(&listmutex); 6 rcu_read_lock(); - 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { - 8 if (p->key == key) { 8 if (p->key == key) { - 9 *result = p->data; 9 *result = p->data; -10 read_unlock(&listmutex); 10 rcu_read_unlock(); -11 return 1; 11 return 1; -12 } 12 } -13 } 13 } -14 read_unlock(&listmutex); 14 rcu_read_unlock(); -15 return 0; 15 return 0; -16 } 16 } - - 1 int delete(long key) 1 int delete(long key) - 2 { 2 { - 3 struct el *p; 3 struct el *p; - 4 4 - 5 write_lock(&listmutex); 5 spin_lock(&listmutex); - 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { - 7 if (p->key == key) { 7 if (p->key == key) { - 8 list_del(&p->list); 8 list_del_rcu(&p->list); - 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); - 10 synchronize_rcu(); -10 kfree(p); 11 kfree(p); -11 return 1; 12 return 1; -12 } 13 } -13 } 14 } -14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); -15 return 0; 16 return 0; -16 } 17 } - -Either way, the differences are quite small. 
Read-side locking moves -to rcu_read_lock() and rcu_read_unlock, update-side locking moves from -a reader-writer lock to a simple spinlock, and a synchronize_rcu() -precedes the kfree(). - -However, there is one potential catch: the read-side and update-side -critical sections can now run concurrently. In many cases, this will -not be a problem, but it is necessary to check carefully regardless. -For example, if multiple independent list updates must be seen as -a single atomic update, converting to RCU will require special care. - -Also, the presence of synchronize_rcu() means that the RCU version of -delete() can now block. If this is a problem, there is a callback-based -mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can -be used in place of synchronize_rcu(). - - -7. FULL LIST OF RCU APIs - -The RCU APIs are documented in docbook-format header comments in the -Linux-kernel source code, but it helps to have a full list of the -APIs, since there does not appear to be a way to categorize them -in docbook. Here is the list, by category. - -RCU list traversal: - - list_entry_rcu - list_first_entry_rcu - list_next_rcu - list_for_each_entry_rcu - list_for_each_entry_continue_rcu - list_for_each_entry_from_rcu - hlist_first_rcu - hlist_next_rcu - hlist_pprev_rcu - hlist_for_each_entry_rcu - hlist_for_each_entry_rcu_bh - hlist_for_each_entry_from_rcu - hlist_for_each_entry_continue_rcu - hlist_for_each_entry_continue_rcu_bh - hlist_nulls_first_rcu - hlist_nulls_for_each_entry_rcu - hlist_bl_first_rcu - hlist_bl_for_each_entry_rcu - -RCU pointer/list update: - - rcu_assign_pointer - list_add_rcu - list_add_tail_rcu - list_del_rcu - list_replace_rcu - hlist_add_behind_rcu - hlist_add_before_rcu - hlist_add_head_rcu - hlist_del_rcu - hlist_del_init_rcu - hlist_replace_rcu - list_splice_init_rcu() - hlist_nulls_del_init_rcu - hlist_nulls_del_rcu - hlist_nulls_add_head_rcu - hlist_bl_add_head_rcu - hlist_bl_del_init_rcu - hlist_bl_del_rcu - hlist_bl_set_first_rcu - -RCU: Critical sections Grace period Barrier - - rcu_read_lock synchronize_net rcu_barrier - rcu_read_unlock synchronize_rcu - rcu_dereference synchronize_rcu_expedited - rcu_read_lock_held call_rcu - rcu_dereference_check kfree_rcu - rcu_dereference_protected - -bh: Critical sections Grace period Barrier - - rcu_read_lock_bh call_rcu rcu_barrier - rcu_read_unlock_bh synchronize_rcu - [local_bh_disable] synchronize_rcu_expedited - [and friends] - rcu_dereference_bh - rcu_dereference_bh_check - rcu_dereference_bh_protected - rcu_read_lock_bh_held - -sched: Critical sections Grace period Barrier - - rcu_read_lock_sched call_rcu rcu_barrier - rcu_read_unlock_sched synchronize_rcu - [preempt_disable] synchronize_rcu_expedited - [and friends] - rcu_read_lock_sched_notrace - rcu_read_unlock_sched_notrace - rcu_dereference_sched - rcu_dereference_sched_check - rcu_dereference_sched_protected - rcu_read_lock_sched_held - - -SRCU: Critical sections Grace period Barrier - - srcu_read_lock call_srcu srcu_barrier - srcu_read_unlock synchronize_srcu - srcu_dereference synchronize_srcu_expedited - srcu_dereference_check - srcu_read_lock_held - -SRCU: Initialization/cleanup - DEFINE_SRCU - DEFINE_STATIC_SRCU - init_srcu_struct - cleanup_srcu_struct - -All: lockdep-checked RCU-protected pointer access - - rcu_access_pointer - rcu_dereference_raw - RCU_LOCKDEP_WARN - rcu_sleep_check - RCU_NONIDLE - -See the comment headers in the source code (or the docbook generated -from them) for more information. 
- -However, given that there are no fewer than four families of RCU APIs -in the Linux kernel, how do you choose which one to use? The following -list can be helpful: - -a. Will readers need to block? If so, you need SRCU. - -b. What about the -rt patchset? If readers would need to block - in an non-rt kernel, you need SRCU. If readers would block - in a -rt kernel, but not in a non-rt kernel, SRCU is not - necessary. (The -rt patchset turns spinlocks into sleeplocks, - hence this distinction.) - -c. Do you need to treat NMI handlers, hardirq handlers, - and code segments with preemption disabled (whether - via preempt_disable(), local_irq_save(), local_bh_disable(), - or some other mechanism) as if they were explicit RCU readers? - If so, RCU-sched is the only choice that will work for you. - -d. Do you need RCU grace periods to complete even in the face - of softirq monopolization of one or more of the CPUs? For - example, is your code subject to network-based denial-of-service - attacks? If so, you should disable softirq across your readers, - for example, by using rcu_read_lock_bh(). - -e. Is your workload too update-intensive for normal use of - RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_TYPESAFE_BY_RCU (which was originally - named SLAB_DESTROY_BY_RCU). But please be careful! - -f. Do you need read-side critical sections that are respected - even though they are in the middle of the idle loop, during - user-mode execution, or on an offlined CPU? If so, SRCU is the - only choice that will work for you. - -g. Otherwise, use RCU. - -Of course, this all assumes that you have determined that RCU is in fact -the right tool for your job. - - -8. ANSWERS TO QUICK QUIZZES - -Quick Quiz #1: Why is this argument naive? How could a deadlock - occur when using this algorithm in a real-world Linux - kernel? [Referring to the lock-based "toy" RCU - algorithm.] - -Answer: Consider the following sequence of events: - - 1. CPU 0 acquires some unrelated lock, call it - "problematic_lock", disabling irq via - spin_lock_irqsave(). - - 2. CPU 1 enters synchronize_rcu(), write-acquiring - rcu_gp_mutex. - - 3. CPU 0 enters rcu_read_lock(), but must wait - because CPU 1 holds rcu_gp_mutex. - - 4. CPU 1 is interrupted, and the irq handler - attempts to acquire problematic_lock. - - The system is now deadlocked. - - One way to avoid this deadlock is to use an approach like - that of CONFIG_PREEMPT_RT, where all normal spinlocks - become blocking locks, and all irq handlers execute in - the context of special tasks. In this case, in step 4 - above, the irq handler would block, allowing CPU 1 to - release rcu_gp_mutex, avoiding the deadlock. - - Even in the absence of deadlock, this RCU implementation - allows latency to "bleed" from readers to other - readers through synchronize_rcu(). To see this, - consider task A in an RCU read-side critical section - (thus read-holding rcu_gp_mutex), task B blocked - attempting to write-acquire rcu_gp_mutex, and - task C blocked in rcu_read_lock() attempting to - read_acquire rcu_gp_mutex. Task A's RCU read-side - latency is holding up task C, albeit indirectly via - task B. - - Realtime RCU implementations therefore use a counter-based - approach where tasks in RCU read-side critical sections - cannot be blocked by tasks executing synchronize_rcu(). - -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. 
- -Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT - kernel where a routing table is used by process-context - code, but can be updated by irq-context code (for example, - by an "ICMP REDIRECT" packet). The usual way of handling - this would be to have the process-context code disable - interrupts while searching the routing table. Use of - RCU allows such interrupt-disabling to be dispensed with. - Thus, without RCU, you pay the cost of disabling interrupts, - and with RCU you don't. - - One can argue that the overhead of RCU in this - case is negative with respect to the single-CPU - interrupt-disabling approach. Others might argue that - the overhead of RCU is merely zero, and that replacing - the positive overhead of the interrupt-disabling scheme - with the zero-overhead RCU scheme does not constitute - negative overhead. - - In real life, of course, things are more complex. But - even the theoretical possibility of negative overhead for - a synchronization primitive is a bit unexpected. ;-) - -Quick Quiz #3: If it is illegal to block in an RCU read-side - critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? - -Answer: Just as PREEMPT_RT permits preemption of spinlock - critical sections, it permits preemption of RCU - read-side critical sections. It also permits - spinlocks blocking while in RCU read-side critical - sections. - - Why the apparent inconsistency? Because it is - possible to use priority boosting to keep the RCU - grace periods short if need be (for example, if running - short of memory). In contrast, if blocking waiting - for (say) network reception, there is no way to know - what should be boosted. Especially given that the - process we need to boost might well be a human being - who just went out for a pizza or something. And although - a computer-operated cattle prod might arouse serious - interest, it might also provoke serious objections. - Besides, how does the computer know what pizza parlor - the human being went to??? - - -ACKNOWLEDGEMENTS - -My thanks to the people who helped make this human-readable, including -Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern. - - -For more information, see http://www.rdrop.com/users/paulmck/RCU. -- cgit v1.2.1 From b00aedf978aa5c9a3c2d734fda5e51acfbceb5d6 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Sat, 2 Nov 2019 13:31:07 +0530 Subject: doc: Convert to rcu_dereference.txt to rcu_dereference.rst This patch converts rcu_dereference.txt to rcu_dereference.rst and adds it to index.rst Signed-off-by: Amol Grover Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/rcu_dereference.rst | 463 ++++++++++++++++++++++++++++++++++ Documentation/RCU/rcu_dereference.txt | 456 --------------------------------- 3 files changed, 464 insertions(+), 456 deletions(-) create mode 100644 Documentation/RCU/rcu_dereference.rst delete mode 100644 Documentation/RCU/rcu_dereference.txt diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index b9b11481c727..c81d0e4fd999 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -8,6 +8,7 @@ RCU concepts :maxdepth: 3 arrayRCU + rcu_dereference whatisRCU rcu listRCU diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst new file mode 100644 index 000000000000..c9667eb0d444 --- /dev/null +++ b/Documentation/RCU/rcu_dereference.rst @@ -0,0 +1,463 @@ +.. 
_rcu_dereference_doc: + +PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() +=============================================================== + +Most of the time, you can use values from rcu_dereference() or one of +the similar primitives without worries. Dereferencing (prefix "*"), +field selection ("->"), assignment ("="), address-of ("&"), addition and +subtraction of constants, and casts all work quite naturally and safely. + +It is nevertheless possible to get into trouble with other operations. +Follow these rules to keep your RCU code working properly: + +- You must use one of the rcu_dereference() family of primitives + to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU + will complain. Worse yet, your code can see random memory-corruption + bugs due to games that compilers and DEC Alpha can play. + Without one of the rcu_dereference() primitives, compilers + can reload the value, and won't your code have fun with two + different values for a single pointer! Without rcu_dereference(), + DEC Alpha can load a pointer, dereference that pointer, and + return data preceding initialization that preceded the store of + the pointer. + + In addition, the volatile cast in rcu_dereference() prevents the + compiler from deducing the resulting pointer value. Please see + the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH" + for an example where the compiler can in fact deduce the exact + value of the pointer, and thus cause misordering. + +- You are only permitted to use rcu_dereference on pointer values. + The compiler simply knows too much about integral values to + trust it to carry dependencies through integer operations. + There are a very few exceptions, namely that you can temporarily + cast the pointer to uintptr_t in order to: + + - Set bits and clear bits down in the must-be-zero low-order + bits of that pointer. This clearly means that the pointer + must have alignment constraints, for example, this does + -not- work in general for char* pointers. + + - XOR bits to translate pointers, as is done in some + classic buddy-allocator algorithms. + + It is important to cast the value back to pointer before + doing much of anything else with it. + +- Avoid cancellation when using the "+" and "-" infix arithmetic + operators. For example, for a given variable "x", avoid + "(x-(uintptr_t)x)" for char* pointers. The compiler is within its + rights to substitute zero for this sort of expression, so that + subsequent accesses no longer depend on the rcu_dereference(), + again possibly resulting in bugs due to misordering. + + Of course, if "p" is a pointer from rcu_dereference(), and "a" + and "b" are integers that happen to be equal, the expression + "p+a-b" is safe because its value still necessarily depends on + the rcu_dereference(), thus maintaining proper ordering. + +- If you are using RCU to protect JITed functions, so that the + "()" function-invocation operator is applied to a value obtained + (directly or indirectly) from rcu_dereference(), you may need to + interact directly with the hardware to flush instruction caches. + This issue arises on some systems when a newly JITed function is + using the same memory that was used by an earlier JITed function. + +- Do not use the results from relational operators ("==", "!=", + ">", ">=", "<", or "<=") when dereferencing. For example, + the following (quite strange) code is buggy:: + + int *p; + int *q; + + ... + + p = rcu_dereference(gp) + q = &global_q; + q += p > &oom_p; + r1 = *q; /* BUGGY!!! 
*/ + + As before, the reason this is buggy is that relational operators + are often compiled using branches. And as before, although + weak-memory machines such as ARM or PowerPC do order stores + after such branches, but can speculate loads, which can again + result in misordering bugs. + +- Be very careful about comparing pointers obtained from + rcu_dereference() against non-NULL values. As Linus Torvalds + explained, if the two pointers are equal, the compiler could + substitute the pointer you are comparing against for the pointer + obtained from rcu_dereference(). For example:: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(p->a); + + Because the compiler now knows that the value of "p" is exactly + the address of the variable "default_struct", it is free to + transform this code into the following:: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(default_struct.a); + + On ARM and Power hardware, the load from "default_struct.a" + can now be speculated, such that it might happen before the + rcu_dereference(). This could result in bugs due to misordering. + + However, comparisons are OK in the following cases: + + - The comparison was against the NULL pointer. If the + compiler knows that the pointer is NULL, you had better + not be dereferencing it anyway. If the comparison is + non-equal, the compiler is none the wiser. Therefore, + it is safe to compare pointers from rcu_dereference() + against NULL pointers. + + - The pointer is never dereferenced after being compared. + Since there are no subsequent dereferences, the compiler + cannot use anything it learned from the comparison + to reorder the non-existent subsequent dereferences. + This sort of comparison occurs frequently when scanning + RCU-protected circular linked lists. + + Note that if checks for being within an RCU read-side + critical section are not required and the pointer is never + dereferenced, rcu_access_pointer() should be used in place + of rcu_dereference(). + + - The comparison is against a pointer that references memory + that was initialized "a long time ago." The reason + this is safe is that even if misordering occurs, the + misordering will not affect the accesses that follow + the comparison. So exactly how long ago is "a long + time ago"? Here are some possibilities: + + - Compile time. + + - Boot time. + + - Module-init time for module code. + + - Prior to kthread creation for kthread code. + + - During some prior acquisition of the lock that + we now hold. + + - Before mod_timer() time for a timer handler. + + There are many other possibilities involving the Linux + kernel's wide array of primitives that cause code to + be invoked at a later time. + + - The pointer being compared against also came from + rcu_dereference(). In this case, both pointers depend + on one rcu_dereference() or another, so you get proper + ordering either way. + + That said, this situation can make certain RCU usage + bugs more likely to happen. Which can be a good thing, + at least if they happen during testing. An example + of such an RCU usage bug is shown in the section titled + "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". + + - All of the accesses following the comparison are stores, + so that a control dependency preserves the needed ordering. + That said, it is easy to get control dependencies wrong. + Please see the "CONTROL DEPENDENCIES" section of + Documentation/memory-barriers.txt for more details. 
+ + - The pointers are not equal -and- the compiler does + not have enough information to deduce the value of the + pointer. Note that the volatile cast in rcu_dereference() + will normally prevent the compiler from knowing too much. + + However, please note that if the compiler knows that the + pointer takes on only one of two values, a not-equal + comparison will provide exactly the information that the + compiler needs to deduce the value of the pointer. + +- Disable any value-speculation optimizations that your compiler + might provide, especially if you are making use of feedback-based + optimizations that take data collected from prior runs. Such + value-speculation optimizations reorder operations by design. + + There is one exception to this rule: Value-speculation + optimizations that leverage the branch-prediction hardware are + safe on strongly ordered systems (such as x86), but not on weakly + ordered systems (such as ARM or Power). Choose your compiler + command-line options wisely! + + +EXAMPLE OF AMPLIFIED RCU-USAGE BUG +---------------------------------- + +Because updaters can run concurrently with RCU readers, RCU readers can +see stale and/or inconsistent values. If RCU readers need fresh or +consistent values, which they sometimes do, they need to take proper +precautions. To see this, consider the following code fragment:: + + struct foo { + int a; + int b; + int c; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + rcu_assign_pointer(gp1, p); + p->b = 143; + p->c = 144; + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Could get 44 on weakly order system. */ + } + do_something_with(r1, r2); + } + +You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible, +but you should not be. After all, the updater might have been invoked +a second time between the time reader() loaded into "r1" and the time +that it loaded into "r2". The fact that this same result can occur due +to some reordering from the compiler and CPUs is beside the point. + +But suppose that the reader needs a consistent view? + +Then one approach is to use locking, for example, as follows:: + + struct foo { + int a; + int b; + int c; + spinlock_t lock; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + spin_lock(&p->lock); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + spin_unlock(&p->lock); + rcu_assign_pointer(gp1, p); + spin_lock(&p->lock); + p->b = 143; + p->c = 144; + spin_unlock(&p->lock); + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + spin_lock(&p->lock); + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Locking guarantees r2 == 144. 
*/ + } + spin_unlock(&p->lock); + do_something_with(r1, r2); + } + +As always, use the right tool for the job! + + +EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH +----------------------------------------- + +If a pointer obtained from rcu_dereference() compares not-equal to some +other pointer, the compiler normally has no clue what the value of the +first pointer might be. This lack of knowledge prevents the compiler +from carrying out optimizations that otherwise might destroy the ordering +guarantees that RCU depends on. And the volatile cast in rcu_dereference() +should prevent the compiler from guessing the value. + +But without rcu_dereference(), the compiler knows more than you might +expect. Consider the following code fragment:: + + struct foo { + int a; + int b; + }; + static struct foo variable1; + static struct foo variable2; + static struct foo *gp = &variable1; + + void updater(void) + { + initialize_foo(&variable2); + rcu_assign_pointer(gp, &variable2); + /* + * The above is the only store to gp in this translation unit, + * and the address of gp is not exported in any way. + */ + } + + int reader(void) + { + struct foo *p; + + p = gp; + barrier(); + if (p == &variable1) + return p->a; /* Must be variable1.a. */ + else + return p->b; /* Must be variable2.b. */ + } + +Because the compiler can see all stores to "gp", it knows that the only +possible values of "gp" are "variable1" on the one hand and "variable2" +on the other. The comparison in reader() therefore tells the compiler +the exact value of "p" even in the not-equals case. This allows the +compiler to make the return values independent of the load from "gp", +in turn destroying the ordering between this load and the loads of the +return values. This can result in "p->b" returning pre-initialization +garbage values. + +In short, rcu_dereference() is -not- optional when you are going to +dereference the resulting pointer. + + +WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? +------------------------------------------------------------ + +First, please avoid using rcu_dereference_raw() and also please avoid +using rcu_dereference_check() and rcu_dereference_protected() with a +second argument with a constant value of 1 (or true, for that matter). +With that caution out of the way, here is some guidance for which +member of the rcu_dereference() to use in various situations: + +1. If the access needs to be within an RCU read-side critical + section, use rcu_dereference(). With the new consolidated + RCU flavors, an RCU read-side critical section is entered + using rcu_read_lock(), anything that disables bottom halves, + anything that disables interrupts, or anything that disables + preemption. + +2. If the access might be within an RCU read-side critical section + on the one hand, or protected by (say) my_lock on the other, + use rcu_dereference_check(), for example:: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + +3. If the access might be within an RCU read-side critical section + on the one hand, or protected by either my_lock or your_lock on + the other, again use rcu_dereference_check(), for example:: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock) || + lockdep_is_held(&your_lock)); + +4. 
If the access is on the update side, so that it is always protected + by my_lock, use rcu_dereference_protected():: + + p1 = rcu_dereference_protected(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + This can be extended to handle multiple locks as in #3 above, + and both can be extended to check other conditions as well. + +5. If the protection is supplied by the caller, and is thus unknown + to this code, that is the rare case when rcu_dereference_raw() + is appropriate. In addition, rcu_dereference_raw() might be + appropriate when the lockdep expression would be excessively + complex, except that a better approach in that case might be to + take a long hard look at your synchronization design. Still, + there are data-locking cases where any one of a very large number + of locks or reference counters suffices to protect the pointer, + so rcu_dereference_raw() does have its place. + + However, its place is probably quite a bit smaller than one + might expect given the number of uses in the current kernel. + Ditto for its synonym, rcu_dereference_check( ... , 1), and + its close relative, rcu_dereference_protected(... , 1). + + +SPARSE CHECKING OF RCU-PROTECTED POINTERS +----------------------------------------- + +The sparse static-analysis tool checks for direct access to RCU-protected +pointers, which can result in "interesting" bugs due to compiler +optimizations involving invented loads and perhaps also load tearing. +For example, suppose someone mistakenly does something like this:: + + p = q->rcu_protected_pointer; + do_something_with(p->a); + do_something_else_with(p->b); + +If register pressure is high, the compiler might optimize "p" out +of existence, transforming the code to something like this:: + + do_something_with(q->rcu_protected_pointer->a); + do_something_else_with(q->rcu_protected_pointer->b); + +This could fatally disappoint your code if q->rcu_protected_pointer +changed in the meantime. Nor is this a theoretical problem: Exactly +this sort of bug cost Paul E. McKenney (and several of his innocent +colleagues) a three-day weekend back in the early 1990s. + +Load tearing could of course result in dereferencing a mashup of a pair +of pointers, which also might fatally disappoint your code. + +These problems could have been avoided simply by making the code instead +read as follows:: + + p = rcu_dereference(q->rcu_protected_pointer); + do_something_with(p->a); + do_something_else_with(p->b); + +Unfortunately, these sorts of bugs can be extremely hard to spot during +review. This is where the sparse tool comes into play, along with the +"__rcu" marker. If you mark a pointer declaration, whether in a structure +or as a formal parameter, with "__rcu", which tells sparse to complain if +this pointer is accessed directly. It will also cause sparse to complain +if a pointer not marked with "__rcu" is accessed using rcu_dereference() +and friends. For example, ->rcu_protected_pointer might be declared as +follows:: + + struct foo __rcu *rcu_protected_pointer; + +Use of "__rcu" is opt-in. If you choose not to use it, then you should +ignore the sparse warnings. diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt deleted file mode 100644 index bf699e8cfc75..000000000000 --- a/Documentation/RCU/rcu_dereference.txt +++ /dev/null @@ -1,456 +0,0 @@ -PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() - -Most of the time, you can use values from rcu_dereference() or one of -the similar primitives without worries. 
Dereferencing (prefix "*"), -field selection ("->"), assignment ("="), address-of ("&"), addition and -subtraction of constants, and casts all work quite naturally and safely. - -It is nevertheless possible to get into trouble with other operations. -Follow these rules to keep your RCU code working properly: - -o You must use one of the rcu_dereference() family of primitives - to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU - will complain. Worse yet, your code can see random memory-corruption - bugs due to games that compilers and DEC Alpha can play. - Without one of the rcu_dereference() primitives, compilers - can reload the value, and won't your code have fun with two - different values for a single pointer! Without rcu_dereference(), - DEC Alpha can load a pointer, dereference that pointer, and - return data preceding initialization that preceded the store of - the pointer. - - In addition, the volatile cast in rcu_dereference() prevents the - compiler from deducing the resulting pointer value. Please see - the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH" - for an example where the compiler can in fact deduce the exact - value of the pointer, and thus cause misordering. - -o You are only permitted to use rcu_dereference on pointer values. - The compiler simply knows too much about integral values to - trust it to carry dependencies through integer operations. - There are a very few exceptions, namely that you can temporarily - cast the pointer to uintptr_t in order to: - - o Set bits and clear bits down in the must-be-zero low-order - bits of that pointer. This clearly means that the pointer - must have alignment constraints, for example, this does - -not- work in general for char* pointers. - - o XOR bits to translate pointers, as is done in some - classic buddy-allocator algorithms. - - It is important to cast the value back to pointer before - doing much of anything else with it. - -o Avoid cancellation when using the "+" and "-" infix arithmetic - operators. For example, for a given variable "x", avoid - "(x-(uintptr_t)x)" for char* pointers. The compiler is within its - rights to substitute zero for this sort of expression, so that - subsequent accesses no longer depend on the rcu_dereference(), - again possibly resulting in bugs due to misordering. - - Of course, if "p" is a pointer from rcu_dereference(), and "a" - and "b" are integers that happen to be equal, the expression - "p+a-b" is safe because its value still necessarily depends on - the rcu_dereference(), thus maintaining proper ordering. - -o If you are using RCU to protect JITed functions, so that the - "()" function-invocation operator is applied to a value obtained - (directly or indirectly) from rcu_dereference(), you may need to - interact directly with the hardware to flush instruction caches. - This issue arises on some systems when a newly JITed function is - using the same memory that was used by an earlier JITed function. - -o Do not use the results from relational operators ("==", "!=", - ">", ">=", "<", or "<=") when dereferencing. For example, - the following (quite strange) code is buggy: - - int *p; - int *q; - - ... - - p = rcu_dereference(gp) - q = &global_q; - q += p > &oom_p; - r1 = *q; /* BUGGY!!! */ - - As before, the reason this is buggy is that relational operators - are often compiled using branches. 
And as before, although - weak-memory machines such as ARM or PowerPC do order stores - after such branches, but can speculate loads, which can again - result in misordering bugs. - -o Be very careful about comparing pointers obtained from - rcu_dereference() against non-NULL values. As Linus Torvalds - explained, if the two pointers are equal, the compiler could - substitute the pointer you are comparing against for the pointer - obtained from rcu_dereference(). For example: - - p = rcu_dereference(gp); - if (p == &default_struct) - do_default(p->a); - - Because the compiler now knows that the value of "p" is exactly - the address of the variable "default_struct", it is free to - transform this code into the following: - - p = rcu_dereference(gp); - if (p == &default_struct) - do_default(default_struct.a); - - On ARM and Power hardware, the load from "default_struct.a" - can now be speculated, such that it might happen before the - rcu_dereference(). This could result in bugs due to misordering. - - However, comparisons are OK in the following cases: - - o The comparison was against the NULL pointer. If the - compiler knows that the pointer is NULL, you had better - not be dereferencing it anyway. If the comparison is - non-equal, the compiler is none the wiser. Therefore, - it is safe to compare pointers from rcu_dereference() - against NULL pointers. - - o The pointer is never dereferenced after being compared. - Since there are no subsequent dereferences, the compiler - cannot use anything it learned from the comparison - to reorder the non-existent subsequent dereferences. - This sort of comparison occurs frequently when scanning - RCU-protected circular linked lists. - - Note that if checks for being within an RCU read-side - critical section are not required and the pointer is never - dereferenced, rcu_access_pointer() should be used in place - of rcu_dereference(). - - o The comparison is against a pointer that references memory - that was initialized "a long time ago." The reason - this is safe is that even if misordering occurs, the - misordering will not affect the accesses that follow - the comparison. So exactly how long ago is "a long - time ago"? Here are some possibilities: - - o Compile time. - - o Boot time. - - o Module-init time for module code. - - o Prior to kthread creation for kthread code. - - o During some prior acquisition of the lock that - we now hold. - - o Before mod_timer() time for a timer handler. - - There are many other possibilities involving the Linux - kernel's wide array of primitives that cause code to - be invoked at a later time. - - o The pointer being compared against also came from - rcu_dereference(). In this case, both pointers depend - on one rcu_dereference() or another, so you get proper - ordering either way. - - That said, this situation can make certain RCU usage - bugs more likely to happen. Which can be a good thing, - at least if they happen during testing. An example - of such an RCU usage bug is shown in the section titled - "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". - - o All of the accesses following the comparison are stores, - so that a control dependency preserves the needed ordering. - That said, it is easy to get control dependencies wrong. - Please see the "CONTROL DEPENDENCIES" section of - Documentation/memory-barriers.txt for more details. - - o The pointers are not equal -and- the compiler does - not have enough information to deduce the value of the - pointer. 
Note that the volatile cast in rcu_dereference() - will normally prevent the compiler from knowing too much. - - However, please note that if the compiler knows that the - pointer takes on only one of two values, a not-equal - comparison will provide exactly the information that the - compiler needs to deduce the value of the pointer. - -o Disable any value-speculation optimizations that your compiler - might provide, especially if you are making use of feedback-based - optimizations that take data collected from prior runs. Such - value-speculation optimizations reorder operations by design. - - There is one exception to this rule: Value-speculation - optimizations that leverage the branch-prediction hardware are - safe on strongly ordered systems (such as x86), but not on weakly - ordered systems (such as ARM or Power). Choose your compiler - command-line options wisely! - - -EXAMPLE OF AMPLIFIED RCU-USAGE BUG - -Because updaters can run concurrently with RCU readers, RCU readers can -see stale and/or inconsistent values. If RCU readers need fresh or -consistent values, which they sometimes do, they need to take proper -precautions. To see this, consider the following code fragment: - - struct foo { - int a; - int b; - int c; - }; - struct foo *gp1; - struct foo *gp2; - - void updater(void) - { - struct foo *p; - - p = kmalloc(...); - if (p == NULL) - deal_with_it(); - p->a = 42; /* Each field in its own cache line. */ - p->b = 43; - p->c = 44; - rcu_assign_pointer(gp1, p); - p->b = 143; - p->c = 144; - rcu_assign_pointer(gp2, p); - } - - void reader(void) - { - struct foo *p; - struct foo *q; - int r1, r2; - - p = rcu_dereference(gp2); - if (p == NULL) - return; - r1 = p->b; /* Guaranteed to get 143. */ - q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ - if (p == q) { - /* The compiler decides that q->c is same as p->c. */ - r2 = p->c; /* Could get 44 on weakly order system. */ - } - do_something_with(r1, r2); - } - -You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible, -but you should not be. After all, the updater might have been invoked -a second time between the time reader() loaded into "r1" and the time -that it loaded into "r2". The fact that this same result can occur due -to some reordering from the compiler and CPUs is beside the point. - -But suppose that the reader needs a consistent view? - -Then one approach is to use locking, for example, as follows: - - struct foo { - int a; - int b; - int c; - spinlock_t lock; - }; - struct foo *gp1; - struct foo *gp2; - - void updater(void) - { - struct foo *p; - - p = kmalloc(...); - if (p == NULL) - deal_with_it(); - spin_lock(&p->lock); - p->a = 42; /* Each field in its own cache line. */ - p->b = 43; - p->c = 44; - spin_unlock(&p->lock); - rcu_assign_pointer(gp1, p); - spin_lock(&p->lock); - p->b = 143; - p->c = 144; - spin_unlock(&p->lock); - rcu_assign_pointer(gp2, p); - } - - void reader(void) - { - struct foo *p; - struct foo *q; - int r1, r2; - - p = rcu_dereference(gp2); - if (p == NULL) - return; - spin_lock(&p->lock); - r1 = p->b; /* Guaranteed to get 143. */ - q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ - if (p == q) { - /* The compiler decides that q->c is same as p->c. */ - r2 = p->c; /* Locking guarantees r2 == 144. */ - } - spin_unlock(&p->lock); - do_something_with(r1, r2); - } - -As always, use the right tool for the job! 
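
A different tool that sometimes fits: when readers only ever need a consistent
snapshot reachable through a single RCU-protected pointer, the updater can
build a complete new copy and publish it with a single rcu_assign_pointer(),
instead of updating fields of a structure that readers can already see. The
sketch below is illustrative only: the update_lock, the rcu_head field "rh",
and the two-argument do_something_with() are assumptions layered on top of the
fragment above, not part of it.

	struct foo {
		int a;
		int b;
		int c;
		struct rcu_head rh;	/* Added so kfree_rcu() can be used. */
	};
	static struct foo __rcu *gp;
	static DEFINE_SPINLOCK(update_lock);	/* Hypothetical; serializes updaters. */

	void updater(void)
	{
		struct foo *newp;
		struct foo *oldp;

		newp = kmalloc(sizeof(*newp), GFP_KERNEL);
		if (newp == NULL)
			return;
		newp->a = 42;	/* Fully initialize the new copy... */
		newp->b = 143;
		newp->c = 144;
		spin_lock(&update_lock);
		oldp = rcu_dereference_protected(gp,
						 lockdep_is_held(&update_lock));
		rcu_assign_pointer(gp, newp);	/* ...then publish it in one step. */
		spin_unlock(&update_lock);
		if (oldp)
			kfree_rcu(oldp, rh);	/* Free old copy after a grace period. */
	}

	void reader(void)
	{
		struct foo *p;

		rcu_read_lock();
		p = rcu_dereference(gp);
		if (p)
			do_something_with(p->b, p->c);	/* Both fields from one copy. */
		rcu_read_unlock();
	}

Because each reader dereferences gp exactly once, it sees either the old copy
or the new one in its entirety, with no read-side locking required.
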
- - -EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH - -If a pointer obtained from rcu_dereference() compares not-equal to some -other pointer, the compiler normally has no clue what the value of the -first pointer might be. This lack of knowledge prevents the compiler -from carrying out optimizations that otherwise might destroy the ordering -guarantees that RCU depends on. And the volatile cast in rcu_dereference() -should prevent the compiler from guessing the value. - -But without rcu_dereference(), the compiler knows more than you might -expect. Consider the following code fragment: - - struct foo { - int a; - int b; - }; - static struct foo variable1; - static struct foo variable2; - static struct foo *gp = &variable1; - - void updater(void) - { - initialize_foo(&variable2); - rcu_assign_pointer(gp, &variable2); - /* - * The above is the only store to gp in this translation unit, - * and the address of gp is not exported in any way. - */ - } - - int reader(void) - { - struct foo *p; - - p = gp; - barrier(); - if (p == &variable1) - return p->a; /* Must be variable1.a. */ - else - return p->b; /* Must be variable2.b. */ - } - -Because the compiler can see all stores to "gp", it knows that the only -possible values of "gp" are "variable1" on the one hand and "variable2" -on the other. The comparison in reader() therefore tells the compiler -the exact value of "p" even in the not-equals case. This allows the -compiler to make the return values independent of the load from "gp", -in turn destroying the ordering between this load and the loads of the -return values. This can result in "p->b" returning pre-initialization -garbage values. - -In short, rcu_dereference() is -not- optional when you are going to -dereference the resulting pointer. - - -WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? - -First, please avoid using rcu_dereference_raw() and also please avoid -using rcu_dereference_check() and rcu_dereference_protected() with a -second argument with a constant value of 1 (or true, for that matter). -With that caution out of the way, here is some guidance for which -member of the rcu_dereference() to use in various situations: - -1. If the access needs to be within an RCU read-side critical - section, use rcu_dereference(). With the new consolidated - RCU flavors, an RCU read-side critical section is entered - using rcu_read_lock(), anything that disables bottom halves, - anything that disables interrupts, or anything that disables - preemption. - -2. If the access might be within an RCU read-side critical section - on the one hand, or protected by (say) my_lock on the other, - use rcu_dereference_check(), for example: - - p1 = rcu_dereference_check(p->rcu_protected_pointer, - lockdep_is_held(&my_lock)); - - -3. If the access might be within an RCU read-side critical section - on the one hand, or protected by either my_lock or your_lock on - the other, again use rcu_dereference_check(), for example: - - p1 = rcu_dereference_check(p->rcu_protected_pointer, - lockdep_is_held(&my_lock) || - lockdep_is_held(&your_lock)); - -4. If the access is on the update side, so that it is always protected - by my_lock, use rcu_dereference_protected(): - - p1 = rcu_dereference_protected(p->rcu_protected_pointer, - lockdep_is_held(&my_lock)); - - This can be extended to handle multiple locks as in #3 above, - and both can be extended to check other conditions as well. - -5. 
If the protection is supplied by the caller, and is thus unknown - to this code, that is the rare case when rcu_dereference_raw() - is appropriate. In addition, rcu_dereference_raw() might be - appropriate when the lockdep expression would be excessively - complex, except that a better approach in that case might be to - take a long hard look at your synchronization design. Still, - there are data-locking cases where any one of a very large number - of locks or reference counters suffices to protect the pointer, - so rcu_dereference_raw() does have its place. - - However, its place is probably quite a bit smaller than one - might expect given the number of uses in the current kernel. - Ditto for its synonym, rcu_dereference_check( ... , 1), and - its close relative, rcu_dereference_protected(... , 1). - - -SPARSE CHECKING OF RCU-PROTECTED POINTERS - -The sparse static-analysis tool checks for direct access to RCU-protected -pointers, which can result in "interesting" bugs due to compiler -optimizations involving invented loads and perhaps also load tearing. -For example, suppose someone mistakenly does something like this: - - p = q->rcu_protected_pointer; - do_something_with(p->a); - do_something_else_with(p->b); - -If register pressure is high, the compiler might optimize "p" out -of existence, transforming the code to something like this: - - do_something_with(q->rcu_protected_pointer->a); - do_something_else_with(q->rcu_protected_pointer->b); - -This could fatally disappoint your code if q->rcu_protected_pointer -changed in the meantime. Nor is this a theoretical problem: Exactly -this sort of bug cost Paul E. McKenney (and several of his innocent -colleagues) a three-day weekend back in the early 1990s. - -Load tearing could of course result in dereferencing a mashup of a pair -of pointers, which also might fatally disappoint your code. - -These problems could have been avoided simply by making the code instead -read as follows: - - p = rcu_dereference(q->rcu_protected_pointer); - do_something_with(p->a); - do_something_else_with(p->b); - -Unfortunately, these sorts of bugs can be extremely hard to spot during -review. This is where the sparse tool comes into play, along with the -"__rcu" marker. If you mark a pointer declaration, whether in a structure -or as a formal parameter, with "__rcu", which tells sparse to complain if -this pointer is accessed directly. It will also cause sparse to complain -if a pointer not marked with "__rcu" is accessed using rcu_dereference() -and friends. For example, ->rcu_protected_pointer might be declared as -follows: - - struct foo __rcu *rcu_protected_pointer; - -Use of "__rcu" is opt-in. If you choose not to use it, then you should -ignore the sparse warnings. -- cgit v1.2.1 From 4af498306ffd8e29ed5c1ae544d01bc8c09c3f8e Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 7 Nov 2019 12:02:41 +0530 Subject: doc: Convert to rcubarrier.txt to ReST Convert rcubarrier.txt to rcubarrier.rst and add it to index.rst. Format file according to reST - Add headings and sub-headings - Add code segments - Add cross-references to quizes and answers Signed-off-by: Amol Grover Tested-by: Phong Tran Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/rcubarrier.rst | 353 +++++++++++++++++++++++++++++++++++++++ Documentation/RCU/rcubarrier.txt | 325 ----------------------------------- 3 files changed, 354 insertions(+), 325 deletions(-) create mode 100644 Documentation/RCU/rcubarrier.rst delete mode 100644 Documentation/RCU/rcubarrier.txt diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index c81d0e4fd999..81a0a1e5f767 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -8,6 +8,7 @@ RCU concepts :maxdepth: 3 arrayRCU + rcubarrier rcu_dereference whatisRCU rcu diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst new file mode 100644 index 000000000000..f64f4413a47c --- /dev/null +++ b/Documentation/RCU/rcubarrier.rst @@ -0,0 +1,353 @@ +.. _rcu_barrier: + +RCU and Unloadable Modules +========================== + +[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] + +RCU (read-copy update) is a synchronization mechanism that can be thought +of as a replacement for read-writer locking (among other things), but with +very low-overhead readers that are immune to deadlock, priority inversion, +and unbounded latency. RCU read-side critical sections are delimited +by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT +kernels, generate no code whatsoever. + +This means that RCU writers are unaware of the presence of concurrent +readers, so that RCU updates to shared data must be undertaken quite +carefully, leaving an old version of the data structure in place until all +pre-existing readers have finished. These old versions are needed because +such readers might hold a reference to them. RCU updates can therefore be +rather expensive, and RCU is thus best suited for read-mostly situations. + +How can an RCU writer possibly determine when all readers are finished, +given that readers might well leave absolutely no trace of their +presence? There is a synchronize_rcu() primitive that blocks until all +pre-existing readers have completed. An updater wishing to delete an +element p from a linked list might do the following, while holding an +appropriate lock, of course:: + + list_del_rcu(p); + synchronize_rcu(); + kfree(p); + +But the above code cannot be used in IRQ context -- the call_rcu() +primitive must be used instead. This primitive takes a pointer to an +rcu_head struct placed within the RCU-protected data structure and +another pointer to a function that may be invoked later to free that +structure. Code to delete an element p from the linked list from IRQ +context might then be as follows:: + + list_del_rcu(p); + call_rcu(&p->rcu, p_callback); + +Since call_rcu() never blocks, this code can safely be used from within +IRQ context. The function p_callback() might be defined as follows:: + + static void p_callback(struct rcu_head *rp) + { + struct pstruct *p = container_of(rp, struct pstruct, rcu); + + kfree(p); + } + + +Unloading Modules That Use call_rcu() +------------------------------------- + +But what if p_callback is defined in an unloadable module? + +If we unload the module while some RCU callbacks are pending, +the CPUs executing these callbacks are going to be severely +disappointed when they are later invoked, as fancifully depicted at +http://lwn.net/images/ns/kernel/rcu-drop.jpg. + +We could try placing a synchronize_rcu() in the module-exit code path, +but this is not sufficient. 
Although synchronize_rcu() does wait for a +grace period to elapse, it does not wait for the callbacks to complete. + +One might be tempted to try several back-to-back synchronize_rcu() +calls, but this is still not guaranteed to work. If there is a very +heavy RCU-callback load, then some of the callbacks might be deferred +in order to allow other processing to proceed. Such deferral is required +in realtime kernels in order to avoid excessive scheduling latencies. + + +rcu_barrier() +------------- + +We instead need the rcu_barrier() primitive. Rather than waiting for +a grace period to elapse, rcu_barrier() waits for all outstanding RCU +callbacks to complete. Please note that rcu_barrier() does **not** imply +synchronize_rcu(), in particular, if there are no RCU callbacks queued +anywhere, rcu_barrier() is within its rights to return immediately, +without waiting for a grace period to elapse. + +Pseudo-code using rcu_barrier() is as follows: + + 1. Prevent any new RCU callbacks from being posted. + 2. Execute rcu_barrier(). + 3. Allow the module to be unloaded. + +There is also an srcu_barrier() function for SRCU, and you of course +must match the flavor of rcu_barrier() with that of call_rcu(). If your +module uses multiple flavors of call_rcu(), then it must also use multiple +flavors of rcu_barrier() when unloading that module. For example, if +it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on +srcu_struct_2, then the following three lines of code will be required +when unloading:: + + 1 rcu_barrier(); + 2 srcu_barrier(&srcu_struct_1); + 3 srcu_barrier(&srcu_struct_2); + +The rcutorture module makes use of rcu_barrier() in its exit function +as follows:: + + 1 static void + 2 rcu_torture_cleanup(void) + 3 { + 4 int i; + 5 + 6 fullstop = 1; + 7 if (shuffler_task != NULL) { + 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + 9 kthread_stop(shuffler_task); + 10 } + 11 shuffler_task = NULL; + 12 + 13 if (writer_task != NULL) { + 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + 15 kthread_stop(writer_task); + 16 } + 17 writer_task = NULL; + 18 + 19 if (reader_tasks != NULL) { + 20 for (i = 0; i < nrealreaders; i++) { + 21 if (reader_tasks[i] != NULL) { + 22 VERBOSE_PRINTK_STRING( + 23 "Stopping rcu_torture_reader task"); + 24 kthread_stop(reader_tasks[i]); + 25 } + 26 reader_tasks[i] = NULL; + 27 } + 28 kfree(reader_tasks); + 29 reader_tasks = NULL; + 30 } + 31 rcu_torture_current = NULL; + 32 + 33 if (fakewriter_tasks != NULL) { + 34 for (i = 0; i < nfakewriters; i++) { + 35 if (fakewriter_tasks[i] != NULL) { + 36 VERBOSE_PRINTK_STRING( + 37 "Stopping rcu_torture_fakewriter task"); + 38 kthread_stop(fakewriter_tasks[i]); + 39 } + 40 fakewriter_tasks[i] = NULL; + 41 } + 42 kfree(fakewriter_tasks); + 43 fakewriter_tasks = NULL; + 44 } + 45 + 46 if (stats_task != NULL) { + 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + 48 kthread_stop(stats_task); + 49 } + 50 stats_task = NULL; + 51 + 52 /* Wait for all RCU callbacks to fire. */ + 53 rcu_barrier(); + 54 + 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + 56 + 57 if (cur_ops->cleanup != NULL) + 58 cur_ops->cleanup(); + 59 if (atomic_read(&n_rcu_torture_error)) + 60 rcu_torture_print_module_parms("End of test: FAILURE"); + 61 else + 62 rcu_torture_print_module_parms("End of test: SUCCESS"); + 63 } + +Line 6 sets a global variable that prevents any RCU callbacks from +re-posting themselves. 
This will not be necessary in most cases, since +RCU callbacks rarely include calls to call_rcu(). However, the rcutorture +module is an exception to this rule, and therefore needs to set this +global variable. + +Lines 7-50 stop all the kernel tasks associated with the rcutorture +module. Therefore, once execution reaches line 53, no more rcutorture +RCU callbacks will be posted. The rcu_barrier() call on line 53 waits +for any pre-existing callbacks to complete. + +Then lines 55-62 print status and do operation-specific cleanup, and +then return, permitting the module-unload operation to be completed. + +.. _rcubarrier_quiz_1: + +Quick Quiz #1: + Is there any other situation where rcu_barrier() might + be required? + +:ref:`Answer to Quick Quiz #1 ` + +Your module might have additional complications. For example, if your +module invokes call_rcu() from timers, you will need to first cancel all +the timers, and only then invoke rcu_barrier() to wait for any remaining +RCU callbacks to complete. + +Of course, if you module uses call_rcu(), you will need to invoke +rcu_barrier() before unloading. Similarly, if your module uses +call_srcu(), you will need to invoke srcu_barrier() before unloading, +and on the same srcu_struct structure. If your module uses call_rcu() +**and** call_srcu(), then you will need to invoke rcu_barrier() **and** +srcu_barrier(). + + +Implementing rcu_barrier() +-------------------------- + +Dipankar Sarma's implementation of rcu_barrier() makes use of the fact +that RCU callbacks are never reordered once queued on one of the per-CPU +queues. His implementation queues an RCU callback on each of the per-CPU +callback queues, and then waits until they have all started executing, at +which point, all earlier RCU callbacks are guaranteed to have completed. + +The original code for rcu_barrier() was as follows:: + + 1 void rcu_barrier(void) + 2 { + 3 BUG_ON(in_interrupt()); + 4 /* Take cpucontrol mutex to protect against CPU hotplug */ + 5 mutex_lock(&rcu_barrier_mutex); + 6 init_completion(&rcu_barrier_completion); + 7 atomic_set(&rcu_barrier_cpu_count, 0); + 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); + 9 wait_for_completion(&rcu_barrier_completion); + 10 mutex_unlock(&rcu_barrier_mutex); + 11 } + +Line 3 verifies that the caller is in process context, and lines 5 and 10 +use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the +global completion and counters at a time, which are initialized on lines +6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is +shown below. Note that the final "1" in on_each_cpu()'s argument list +ensures that all the calls to rcu_barrier_func() will have completed +before on_each_cpu() returns. Line 9 then waits for the completion. + +This code was rewritten in 2008 and several times thereafter, but this +still gives the general idea. + +The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() +to post an RCU callback, as follows:: + + 1 static void rcu_barrier_func(void *notused) + 2 { + 3 int cpu = smp_processor_id(); + 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + 5 struct rcu_head *head; + 6 + 7 head = &rdp->barrier; + 8 atomic_inc(&rcu_barrier_cpu_count); + 9 call_rcu(head, rcu_barrier_callback); + 10 } + +Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, +which contains the struct rcu_head that needed for the later call to +call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line +8 increments a global counter. 
This counter will later be decremented +by the callback. Line 9 then registers the rcu_barrier_callback() on +the current CPU's queue. + +The rcu_barrier_callback() function simply atomically decrements the +rcu_barrier_cpu_count variable and finalizes the completion when it +reaches zero, as follows:: + + 1 static void rcu_barrier_callback(struct rcu_head *notused) + 2 { + 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + 4 complete(&rcu_barrier_completion); + 5 } + +.. _rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + +:ref:`Answer to Quick Quiz #2 ` + +The current rcu_barrier() implementation is more complex, due to the need +to avoid disturbing idle CPUs (especially on battery-powered systems) +and the need to minimally disturb non-idle CPUs in real-time systems. +However, the code above illustrates the concepts. + + +rcu_barrier() Summary +--------------------- + +The rcu_barrier() primitive has seen relatively little use, since most +code using RCU is in the core kernel rather than in modules. However, if +you are using RCU from an unloadable module, you need to use rcu_barrier() +so that your module may be safely unloaded. + + +Answers to Quick Quizzes +------------------------ + +.. _answer_rcubarrier_quiz_1: + +Quick Quiz #1: + Is there any other situation where rcu_barrier() might + be required? + +Answer: Interestingly enough, rcu_barrier() was not originally + implemented for module unloading. Nikita Danilov was using + RCU in a filesystem, which resulted in a similar situation at + filesystem-unmount time. Dipankar Sarma coded up rcu_barrier() + in response, so that Nikita could invoke it during the + filesystem-unmount process. + + Much later, yours truly hit the RCU module-unload problem when + implementing rcutorture, and found that rcu_barrier() solves + this problem as well. + +:ref:`Back to Quick Quiz #1 ` + +.. _answer_rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + +Answer: This cannot happen. The reason is that on_each_cpu() has its last + argument, the wait flag, set to "1". This flag is passed through + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent + a grace period from completing on non-CONFIG_PREEMPT kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is + of no use in CONFIG_PREEMPT kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to + rcu_barrier_func(). This prevents the local CPU from context + switching, again preventing grace periods from completing. This + means that all CPUs have executed rcu_barrier_func() before + the first rcu_barrier_callback() can possibly execute, in turn + preventing rcu_barrier_cpu_count from prematurely reaching zero. 
+ + Currently, -rt implementations of RCU keep but a single global + queue for RCU callbacks, and thus do not suffer from this + problem. However, when the -rt RCU eventually does have per-CPU + callback queues, things will have to change. One simple change + is to add an rcu_read_lock() before line 8 of rcu_barrier() + and an rcu_read_unlock() after line 8 of this same function. If + you can think of a better change, please let me know! + +:ref:`Back to Quick Quiz #2 ` diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt deleted file mode 100644 index a2782df69732..000000000000 --- a/Documentation/RCU/rcubarrier.txt +++ /dev/null @@ -1,325 +0,0 @@ -RCU and Unloadable Modules - -[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] - -RCU (read-copy update) is a synchronization mechanism that can be thought -of as a replacement for read-writer locking (among other things), but with -very low-overhead readers that are immune to deadlock, priority inversion, -and unbounded latency. RCU read-side critical sections are delimited -by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT -kernels, generate no code whatsoever. - -This means that RCU writers are unaware of the presence of concurrent -readers, so that RCU updates to shared data must be undertaken quite -carefully, leaving an old version of the data structure in place until all -pre-existing readers have finished. These old versions are needed because -such readers might hold a reference to them. RCU updates can therefore be -rather expensive, and RCU is thus best suited for read-mostly situations. - -How can an RCU writer possibly determine when all readers are finished, -given that readers might well leave absolutely no trace of their -presence? There is a synchronize_rcu() primitive that blocks until all -pre-existing readers have completed. An updater wishing to delete an -element p from a linked list might do the following, while holding an -appropriate lock, of course: - - list_del_rcu(p); - synchronize_rcu(); - kfree(p); - -But the above code cannot be used in IRQ context -- the call_rcu() -primitive must be used instead. This primitive takes a pointer to an -rcu_head struct placed within the RCU-protected data structure and -another pointer to a function that may be invoked later to free that -structure. Code to delete an element p from the linked list from IRQ -context might then be as follows: - - list_del_rcu(p); - call_rcu(&p->rcu, p_callback); - -Since call_rcu() never blocks, this code can safely be used from within -IRQ context. The function p_callback() might be defined as follows: - - static void p_callback(struct rcu_head *rp) - { - struct pstruct *p = container_of(rp, struct pstruct, rcu); - - kfree(p); - } - - -Unloading Modules That Use call_rcu() - -But what if p_callback is defined in an unloadable module? - -If we unload the module while some RCU callbacks are pending, -the CPUs executing these callbacks are going to be severely -disappointed when they are later invoked, as fancifully depicted at -http://lwn.net/images/ns/kernel/rcu-drop.jpg. - -We could try placing a synchronize_rcu() in the module-exit code path, -but this is not sufficient. Although synchronize_rcu() does wait for a -grace period to elapse, it does not wait for the callbacks to complete. - -One might be tempted to try several back-to-back synchronize_rcu() -calls, but this is still not guaranteed to work. 
If there is a very -heavy RCU-callback load, then some of the callbacks might be deferred -in order to allow other processing to proceed. Such deferral is required -in realtime kernels in order to avoid excessive scheduling latencies. - - -rcu_barrier() - -We instead need the rcu_barrier() primitive. Rather than waiting for -a grace period to elapse, rcu_barrier() waits for all outstanding RCU -callbacks to complete. Please note that rcu_barrier() does -not- imply -synchronize_rcu(), in particular, if there are no RCU callbacks queued -anywhere, rcu_barrier() is within its rights to return immediately, -without waiting for a grace period to elapse. - -Pseudo-code using rcu_barrier() is as follows: - - 1. Prevent any new RCU callbacks from being posted. - 2. Execute rcu_barrier(). - 3. Allow the module to be unloaded. - -There is also an srcu_barrier() function for SRCU, and you of course -must match the flavor of rcu_barrier() with that of call_rcu(). If your -module uses multiple flavors of call_rcu(), then it must also use multiple -flavors of rcu_barrier() when unloading that module. For example, if -it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on -srcu_struct_2(), then the following three lines of code will be required -when unloading: - - 1 rcu_barrier(); - 2 srcu_barrier(&srcu_struct_1); - 3 srcu_barrier(&srcu_struct_2); - -The rcutorture module makes use of rcu_barrier() in its exit function -as follows: - - 1 static void - 2 rcu_torture_cleanup(void) - 3 { - 4 int i; - 5 - 6 fullstop = 1; - 7 if (shuffler_task != NULL) { - 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - 9 kthread_stop(shuffler_task); -10 } -11 shuffler_task = NULL; -12 -13 if (writer_task != NULL) { -14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); -15 kthread_stop(writer_task); -16 } -17 writer_task = NULL; -18 -19 if (reader_tasks != NULL) { -20 for (i = 0; i < nrealreaders; i++) { -21 if (reader_tasks[i] != NULL) { -22 VERBOSE_PRINTK_STRING( -23 "Stopping rcu_torture_reader task"); -24 kthread_stop(reader_tasks[i]); -25 } -26 reader_tasks[i] = NULL; -27 } -28 kfree(reader_tasks); -29 reader_tasks = NULL; -30 } -31 rcu_torture_current = NULL; -32 -33 if (fakewriter_tasks != NULL) { -34 for (i = 0; i < nfakewriters; i++) { -35 if (fakewriter_tasks[i] != NULL) { -36 VERBOSE_PRINTK_STRING( -37 "Stopping rcu_torture_fakewriter task"); -38 kthread_stop(fakewriter_tasks[i]); -39 } -40 fakewriter_tasks[i] = NULL; -41 } -42 kfree(fakewriter_tasks); -43 fakewriter_tasks = NULL; -44 } -45 -46 if (stats_task != NULL) { -47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); -48 kthread_stop(stats_task); -49 } -50 stats_task = NULL; -51 -52 /* Wait for all RCU callbacks to fire. */ -53 rcu_barrier(); -54 -55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ -56 -57 if (cur_ops->cleanup != NULL) -58 cur_ops->cleanup(); -59 if (atomic_read(&n_rcu_torture_error)) -60 rcu_torture_print_module_parms("End of test: FAILURE"); -61 else -62 rcu_torture_print_module_parms("End of test: SUCCESS"); -63 } - -Line 6 sets a global variable that prevents any RCU callbacks from -re-posting themselves. This will not be necessary in most cases, since -RCU callbacks rarely include calls to call_rcu(). However, the rcutorture -module is an exception to this rule, and therefore needs to set this -global variable. - -Lines 7-50 stop all the kernel tasks associated with the rcutorture -module. 
Therefore, once execution reaches line 53, no more rcutorture -RCU callbacks will be posted. The rcu_barrier() call on line 53 waits -for any pre-existing callbacks to complete. - -Then lines 55-62 print status and do operation-specific cleanup, and -then return, permitting the module-unload operation to be completed. - -Quick Quiz #1: Is there any other situation where rcu_barrier() might - be required? - -Your module might have additional complications. For example, if your -module invokes call_rcu() from timers, you will need to first cancel all -the timers, and only then invoke rcu_barrier() to wait for any remaining -RCU callbacks to complete. - -Of course, if you module uses call_rcu(), you will need to invoke -rcu_barrier() before unloading. Similarly, if your module uses -call_srcu(), you will need to invoke srcu_barrier() before unloading, -and on the same srcu_struct structure. If your module uses call_rcu() --and- call_srcu(), then you will need to invoke rcu_barrier() -and- -srcu_barrier(). - - -Implementing rcu_barrier() - -Dipankar Sarma's implementation of rcu_barrier() makes use of the fact -that RCU callbacks are never reordered once queued on one of the per-CPU -queues. His implementation queues an RCU callback on each of the per-CPU -callback queues, and then waits until they have all started executing, at -which point, all earlier RCU callbacks are guaranteed to have completed. - -The original code for rcu_barrier() was as follows: - - 1 void rcu_barrier(void) - 2 { - 3 BUG_ON(in_interrupt()); - 4 /* Take cpucontrol mutex to protect against CPU hotplug */ - 5 mutex_lock(&rcu_barrier_mutex); - 6 init_completion(&rcu_barrier_completion); - 7 atomic_set(&rcu_barrier_cpu_count, 0); - 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); - 9 wait_for_completion(&rcu_barrier_completion); -10 mutex_unlock(&rcu_barrier_mutex); -11 } - -Line 3 verifies that the caller is in process context, and lines 5 and 10 -use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the -global completion and counters at a time, which are initialized on lines -6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is -shown below. Note that the final "1" in on_each_cpu()'s argument list -ensures that all the calls to rcu_barrier_func() will have completed -before on_each_cpu() returns. Line 9 then waits for the completion. - -This code was rewritten in 2008 and several times thereafter, but this -still gives the general idea. - -The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() -to post an RCU callback, as follows: - - 1 static void rcu_barrier_func(void *notused) - 2 { - 3 int cpu = smp_processor_id(); - 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - 5 struct rcu_head *head; - 6 - 7 head = &rdp->barrier; - 8 atomic_inc(&rcu_barrier_cpu_count); - 9 call_rcu(head, rcu_barrier_callback); -10 } - -Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, -which contains the struct rcu_head that needed for the later call to -call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line -8 increments a global counter. This counter will later be decremented -by the callback. Line 9 then registers the rcu_barrier_callback() on -the current CPU's queue. 
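
Before moving on to the callback itself, it may help to see the module-unload recipe from earlier in this document spelled out in code. What follows is a hedged sketch of a hypothetical module (every identifier beginning with foo_ is invented for illustration and taken from no real driver) that posts call_rcu() callbacks from a timer, and whose exit path therefore cancels the timer before invoking rcu_barrier():

    /*
     * Hedged sketch of a hypothetical module, not taken from any real
     * driver: foo_timer posts call_rcu() callbacks, so the exit path
     * must cancel the timer first and only then invoke rcu_barrier().
     */
    #include <linux/module.h>
    #include <linux/timer.h>
    #include <linux/slab.h>
    #include <linux/rcupdate.h>

    struct foo {
            struct rcu_head rh;
    };

    static struct timer_list foo_timer;     /* hypothetical source of call_rcu() callbacks */

    static void foo_reclaim(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct foo, rh));
    }

    static void foo_timer_fn(struct timer_list *t)
    {
            struct foo *p = kmalloc(sizeof(*p), GFP_ATOMIC);

            if (p)
                    call_rcu(&p->rh, foo_reclaim); /* posts a callback from timer context */
            mod_timer(&foo_timer, jiffies + HZ);
    }

    static int __init foo_init(void)
    {
            timer_setup(&foo_timer, foo_timer_fn, 0);
            mod_timer(&foo_timer, jiffies + HZ);
            return 0;
    }

    static void __exit foo_exit(void)
    {
            /* 1. Prevent any new callbacks from being posted. */
            del_timer_sync(&foo_timer);

            /* 2. Wait for the callbacks already posted to complete. */
            rcu_barrier();

            /* 3. Only now is it safe to let the module be unloaded. */
    }

    module_init(foo_init);
    module_exit(foo_exit);
    MODULE_LICENSE("GPL");

Because this sketch uses only call_rcu(), only rcu_barrier() is needed; as noted above, a module that also used call_srcu() would need a matching srcu_barrier() on the same srcu_struct.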
- -The rcu_barrier_callback() function simply atomically decrements the -rcu_barrier_cpu_count variable and finalizes the completion when it -reaches zero, as follows: - - 1 static void rcu_barrier_callback(struct rcu_head *notused) - 2 { - 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - 4 complete(&rcu_barrier_completion); - 5 } - -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes - immediately (thus incrementing rcu_barrier_cpu_count to the - value one), but the other CPU's rcu_barrier_func() invocations - are delayed for a full grace period? Couldn't this result in - rcu_barrier() returning prematurely? - -The current rcu_barrier() implementation is more complex, due to the need -to avoid disturbing idle CPUs (especially on battery-powered systems) -and the need to minimally disturb non-idle CPUs in real-time systems. -However, the code above illustrates the concepts. - - -rcu_barrier() Summary - -The rcu_barrier() primitive has seen relatively little use, since most -code using RCU is in the core kernel rather than in modules. However, if -you are using RCU from an unloadable module, you need to use rcu_barrier() -so that your module may be safely unloaded. - - -Answers to Quick Quizzes - -Quick Quiz #1: Is there any other situation where rcu_barrier() might - be required? - -Answer: Interestingly enough, rcu_barrier() was not originally - implemented for module unloading. Nikita Danilov was using - RCU in a filesystem, which resulted in a similar situation at - filesystem-unmount time. Dipankar Sarma coded up rcu_barrier() - in response, so that Nikita could invoke it during the - filesystem-unmount process. - - Much later, yours truly hit the RCU module-unload problem when - implementing rcutorture, and found that rcu_barrier() solves - this problem as well. - -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes - immediately (thus incrementing rcu_barrier_cpu_count to the - value one), but the other CPU's rcu_barrier_func() invocations - are delayed for a full grace period? Couldn't this result in - rcu_barrier() returning prematurely? - -Answer: This cannot happen. The reason is that on_each_cpu() has its last - argument, the wait flag, set to "1". This flag is passed through - to smp_call_function() and further to smp_call_function_on_cpu(), - causing this latter to spin until the cross-CPU invocation of - rcu_barrier_func() has completed. This by itself would prevent - a grace period from completing on non-CONFIG_PREEMPT kernels, - since each CPU must undergo a context switch (or other quiescent - state) before the grace period can complete. However, this is - of no use in CONFIG_PREEMPT kernels. - - Therefore, on_each_cpu() disables preemption across its call - to smp_call_function() and also across the local call to - rcu_barrier_func(). This prevents the local CPU from context - switching, again preventing grace periods from completing. This - means that all CPUs have executed rcu_barrier_func() before - the first rcu_barrier_callback() can possibly execute, in turn - preventing rcu_barrier_cpu_count from prematurely reaching zero. - - Currently, -rt implementations of RCU keep but a single global - queue for RCU callbacks, and thus do not suffer from this - problem. However, when the -rt RCU eventually does have per-CPU - callback queues, things will have to change. One simple change - is to add an rcu_read_lock() before line 8 of rcu_barrier() - and an rcu_read_unlock() after line 8 of this same function. 
If - you can think of a better change, please let me know! -- cgit v1.2.1 From 17f0da13873ba393a72f14d41ffc8ff388e38723 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 11 Nov 2019 23:41:22 +0530 Subject: doc: Updated full list of RCU API in whatisRCU.rst This patch updates the list of RCU API in whatisRCU.rst. Signed-off-by: Madhuparna Bhowmik Tested-by: Amol Grover Tested-by: Phong Tran Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 2f6f6ebbc8b0..c7f147b8034f 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -884,11 +884,14 @@ in docbook. Here is the list, by category. RCU list traversal:: list_entry_rcu + list_entry_lockless list_first_entry_rcu list_next_rcu list_for_each_entry_rcu list_for_each_entry_continue_rcu list_for_each_entry_from_rcu + list_first_or_null_rcu + list_next_or_null_rcu hlist_first_rcu hlist_next_rcu hlist_pprev_rcu @@ -902,7 +905,7 @@ RCU list traversal:: hlist_bl_first_rcu hlist_bl_for_each_entry_rcu -RCU pointer/list udate:: +RCU pointer/list update:: rcu_assign_pointer list_add_rcu @@ -912,10 +915,12 @@ RCU pointer/list udate:: hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu + hlist_add_tail_rcu hlist_del_rcu hlist_del_init_rcu hlist_replace_rcu - list_splice_init_rcu() + list_splice_init_rcu + list_splice_tail_init_rcu hlist_nulls_del_init_rcu hlist_nulls_del_rcu hlist_nulls_add_head_rcu -- cgit v1.2.1 From 6e6eca2ee79a23329093cdf8d1cc0bd86128981f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2019 09:12:59 -0800 Subject: doc: Fix typo "deference" to "dereference" Reported-by: Jens Axboe Signed-off-by: Paul E. McKenney --- Documentation/RCU/lockdep-splat.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt index 9c015976b174..b8096316fd11 100644 --- a/Documentation/RCU/lockdep-splat.txt +++ b/Documentation/RCU/lockdep-splat.txt @@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU read-side critical section, which again would have suppressed the above lockdep-RCU splat. -But in this particular case, we don't actually deference the pointer +But in this particular case, we don't actually dereference the pointer returned from rcu_dereference(). Instead, that pointer is just compared to the cic pointer, which means that the rcu_dereference() can be replaced by rcu_access_pointer() as follows: -- cgit v1.2.1 From 1a271ebbfe33a44f61e02d35a2950ab00b32850b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Dec 2019 19:13:45 -0800 Subject: net/tipc: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney [ paulmck: Updated based on Ying Xue and Tuong Lien Tong feedback. ] Cc: Jon Maloy Cc: Ying Xue Cc: "David S. 
Miller" Cc: Cc: --- net/tipc/crypto.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 990a872cec46..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) -#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) - #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ @@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) /* Move passive key if any */ if (key.passive) { - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); x = (key.passive - key.pending + new_pending) % KEY_MAX; new_passive = (x <= 0) ? x + KEY_MAX : x; } -- cgit v1.2.1 From a191c9e9f73a78e8801b5eeb3d43bbd6fd73b86f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2019 10:30:21 -0800 Subject: wireless/mediatek: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds Reported-by: "Martin K. Petersen" [ paulmck: Apply Matthias Brugger feedback. ] Signed-off-by: Paul E. McKenney Reviewed-by: "Martin K. Petersen" Acked-by: Kalle Valo Cc: Felix Fietkau Cc: Lorenzo Bianconi Cc: Ryder Lee Cc: Roy Luo Cc: "David S. Miller" Cc: Matthias Brugger Cc: Cc: Cc: Cc: --- drivers/net/wireless/mediatek/mt76/agg-rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c index 53b5a4b2dcc5..59c187898132 100644 --- a/drivers/net/wireless/mediatek/mt76/agg-rx.c +++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c @@ -281,8 +281,8 @@ void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno) { struct mt76_rx_tid *tid = NULL; - rcu_swap_protected(wcid->aggr[tidno], tid, - lockdep_is_held(&dev->mutex)); + tid = rcu_replace_pointer(wcid->aggr[tidno], tid, + lockdep_is_held(&dev->mutex)); if (tid) { mt76_rx_aggr_shutdown(dev, tid); kfree_rcu(tid, rcu_head); -- cgit v1.2.1 From 4414abf89158d734a83c99f6504f648417bd9550 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 16:31:42 -0700 Subject: rcu: Remove rcu_swap_protected() Now that the calls to rcu_swap_protected() have been replaced by rcu_replace_pointer(), this commit removes rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Shane M Seymour Cc: Martin K. 
Petersen --- include/linux/rcupdate.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0b7506330c87..fe470243acdd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -400,22 +400,6 @@ do { \ __tmp; \ }) -/** - * rcu_swap_protected() - swap an RCU and a regular pointer - * @rcu_ptr: RCU pointer - * @ptr: regular pointer - * @c: the conditions under which the dereference will take place - * - * Perform swap(@rcu_ptr, @ptr) where @rcu_ptr is an RCU-annotated pointer and - * @c is the argument that is passed to the rcu_dereference_protected() call - * used to read that pointer. - */ -#define rcu_swap_protected(rcu_ptr, ptr, c) do { \ - typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ - rcu_assign_pointer((rcu_ptr), (ptr)); \ - (ptr) = __tmp; \ -} while (0) - /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read -- cgit v1.2.1 From c30fe541896440667bd9e9068aedd1d440fbbcd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Oct 2019 21:40:09 -0700 Subject: rcu: Mark non-global functions and variables as static Each of rcu_state, rcu_rnp_online_cpus(), rcu_dynticks_curr_cpu_in_eqs(), and rcu_dynticks_snap() are used only in the kernel/rcu/tree.o translation unit, and may thus be marked static. This commit therefore makes this change. Reported-by: Ben Dooks Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..dd8cfc34f4da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. 
*/ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..e4dc5debfc84 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -403,8 +403,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - /* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); -- cgit v1.2.1 From e2bb3dbfa74571140ef41887011934f5d5e6ed93 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:56 +0000 Subject: rcu: Fix harmless omission of "CONFIG_" from #if condition The C preprocessor macros SRCU and TINY_RCU should instead be CONFIG_SRCU and CONFIG_TINY_RCU, respectively in the #f in kernel/rcu/rcu.h. But there is no harm when "TINY_RCU" is wrongly used, which are always non-defined, which makes "!defined(TINY_RCU)" always true, which means the code block is always included, and the included code block doesn't cause any compilation error so far in CONFIG_TINY_RCU builds. It is also the reason this change should not be taken in -stable. This commit adds the needed "CONFIG_" prefix to both macros. Not for -stable. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..473259422547 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -281,7 +281,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(SRCU) || !defined(TINY_RCU) +#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) #include @@ -418,7 +418,7 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_SRCU void srcu_init(void); -- cgit v1.2.1 From f31d97a56482154dc5c3792e5f5a1037cd1da048 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:57 +0000 Subject: rcu: Fix tracepoint tracking RCU CPU kthread utilization In the call to trace_rcu_utilization() at the start of the loop in rcu_cpu_kthread(), "rcu_wait" is incorrect, plus this trace event needs to be hoisted above the loop to balance with either the "rcu_wait" or "rcu_yield", depending on how the loop exits. This commit therefore makes these changes. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd8cfc34f4da..ba154a3080fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2474,8 +2474,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); -- cgit v1.2.1 From add41f79a2606b113ae3c9acf3b0b4ca7f441041 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:46 +0000 Subject: rcu: Remove the declaration of call_rcu() in tree.h The call_rcu() function is an external RCU API that is declared in include/linux/rcupdate.h. There is thus no point in redeclaring it in kernel/rcu/tree.h, so this commit removes that redundant declaration. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4dc5debfc84..54ff9896ae31 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -413,7 +413,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); -void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -- cgit v1.2.1 From febc5cacbe2235d3357e1393c340a44e681bd243 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:47 +0000 Subject: rcu: Move gp_state_names[] and gp_state_getname() to tree_stall.h Only tree_stall.h needs to get name from GP state, so this commit moves the gp_state_names[] array and the gp_state_getname() from kernel/rcu/tree.h and kernel/rcu/tree.c, respectively, to kernel/rcu/tree_stall.h. While moving gp_state_names[], this commit uses the GCC syntax to ensure that the right string is associated with the right CPP macro. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ---------- kernel/rcu/tree.h | 12 ------------ kernel/rcu/tree_stall.h | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba154a3080fb..bbb60ed310b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void) return &rcu_state.node[0]; } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Send along grace-period-related data for rcutorture diagnostics. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 54ff9896ae31..9d5986abfc67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -368,18 +368,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. 
*/ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..f18adaf3bf39 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -279,6 +279,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Print out diagnostic information for the specified stalled CPU. * -- cgit v1.2.1 From d518ab62b95dd0dd2c02341659a47e883cd7ff45 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 14:48:22 +0100 Subject: rcu: Move rcu_{expedited,normal} definitions into rcupdate.h This commit moves the rcu_{expedited,normal} definitions from kernel/rcu/update.c to include/linux/rcupdate.h to make sure they are in sync, and also to avoid the following warning from sparse: kernel/ksysfs.c:150:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? kernel/ksysfs.c:167:5: warning: symbol 'rcu_normal' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++++ kernel/rcu/update.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index fe470243acdd..bb36379606d0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -896,4 +896,8 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) return false; } +/* kernel/ksysfs.c definitions */ +extern int rcu_expedited; +extern int rcu_normal; + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..294d357abd0c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,9 +51,7 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); -extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); -- cgit v1.2.1 From a2efad4ab00e3c73cb9fe313554beaa707a3a83a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 09:37:11 -0700 Subject: rcu: Switch force_qs_rnp() to for_each_leaf_node_cpu_mask() Currently, force_qs_rnp() uses a for_each_leaf_node_possible_cpu() loop containing a check of the current CPU's bit in ->qsmask. This works, but this commit saves three lines by instead using for_each_leaf_node_cpu_mask(), which combines the functionality of for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit(). 
This commit also replaces the use of the local variable "bit" with rdp->grpmask. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbb60ed310b1..d95076498488 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2298,14 +2298,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { -- cgit v1.2.1 From f7a0712766ec4275a73fbcc7e010afdbf4ba5f42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:08:30 -0800 Subject: srcu: Apply *_ONCE() to ->srcu_last_gp_end The ->srcu_last_gp_end field is accessed from any CPU at any time by synchronize_srcu(), so non-initialization references need to use READ_ONCE() and WRITE_ONCE(). This commit therefore makes that change. Reported-by: syzbot+08f3e9d26e5541e1ecf2@syzkaller.appspotmail.com Acked-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..21acdff3bd27 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(ssp); - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) @@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long flags; struct srcu_data *sdp; unsigned long t; + unsigned long tlast; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); + tlast = READ_ONCE(ssp->srcu_last_gp_end); if (exp_holdoff == 0 || - time_in_range_open(t, ssp->srcu_last_gp_end, - ssp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ -- cgit v1.2.1 From 785da47578c84ee55a1edf358ec37fd01dd40499 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Nov 2019 15:35:26 -0800 Subject: .mailmap: Add entries for old paulmck@kernel.org addresses [ paulmck: Apply Florian Fainelli feedback. ] Signed-off-by: Paul E. McKenney --- .mailmap | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.mailmap b/.mailmap index c24773db04a7..39efbe974395 100644 --- a/.mailmap +++ b/.mailmap @@ -207,6 +207,10 @@ Paolo 'Blaisorblade' Giarrusso Patrick Mochel Paul Burton Paul Burton +Paul E. McKenney +Paul E. McKenney +Paul E. McKenney +Paul E. 
McKenney Peter A Jonsson Peter Oruba Peter Oruba -- cgit v1.2.1 From d9c650341681b204cdff590b536df0f3f66f636a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Nov 2019 17:01:18 +0000 Subject: powerpc: Remove comment about read_barrier_depends() 'read_barrier_depends()' doesn't exist anymore so stop talking about it. Signed-off-by: Will Deacon Acked-by: Michael Ellerman Signed-off-by: Paul E. McKenney --- arch/powerpc/include/asm/barrier.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index fbe8df433019..123adcefd40f 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -18,8 +18,6 @@ * mb() prevents loads and stores being reordered across this point. * rmb() prevents loads being reordered across this point. * wmb() prevents stores being reordered across this point. - * read_barrier_depends() prevents data-dependent loads being reordered - * across this point (nop on PPC). * * *mb() variants without smp_ prefix must order all types of memory * operations with one another. sync is the only instruction sufficient -- cgit v1.2.1 From 2c0a984f166c3d4dc2b58fae80347e94ba7db783 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 11:36:07 -0800 Subject: rcu: Remove unused stop-machine #include Long ago, RCU used the stop-machine mechanism to implement expedited grace periods, but no longer does so. This commit therefore removes the no-longer-needed #includes of linux/stop_machine.h. Link: https://lwn.net/Articles/805317/ Reported-by: Viresh Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree.h | 1 - 2 files changed, 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d95076498488..878f62f218e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9d5986abfc67..ce90c68c184b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "rcu_segcblist.h" -- cgit v1.2.1 From d47715f50e833f12c5e829ce9dcc4a65104fa74f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 19 Nov 2019 19:57:42 +0100 Subject: kcsan, ubsan: Make KCSAN+UBSAN work together Context: http://lkml.kernel.org/r/fb7e25d8-aba4-3dcf-7761-cb7ecb3ebb71@infradead.org Reported-by: Randy Dunlap Signed-off-by: Marco Elver Acked-by: Randy Dunlap # build-tested Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/Makefile | 1 + lib/Makefile | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index dd15b62ec0b5..df6b7799e492 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 KCSAN_SANITIZE := n KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) diff --git a/lib/Makefile b/lib/Makefile index 778ab704e3ad..9d5bda950f5f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -279,6 +279,7 @@ obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n KASAN_SANITIZE_ubsan.o := n +KCSAN_SANITIZE_ubsan.o := n CFLAGS_ubsan.o := $(call cc-option, -fno-stack-protector) $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_SBITMAP) += sbitmap.o -- cgit v1.2.1 From c020395b6634b7a674ee6aa91a971b08e268caba Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 26 Nov 2019 15:04:04 +0100 Subject: asm-generic/atomic: Use __always_inline for pure wrappers Prefer __always_inline for atomic wrappers. When building for size (CC_OPTIMIZE_FOR_SIZE), some compilers appear to be less inclined to inline even relatively small static inline functions that are assumed to be inlinable such as atomic ops. This can cause problems, for example in UACCESS regions. By using __always_inline, we let the real implementation and not the wrapper determine the final inlining preference. For x86 tinyconfig we observe: - vmlinux baseline: 1316204 - vmlinux with patch: 1315988 (-216 bytes) This came up when addressing UACCESS warnings with CC_OPTIMIZE_FOR_SIZE in the KCSAN runtime: http://lkml.kernel.org/r/58708908-84a0-0a81-a836-ad97e33dbb62@infradead.org Reported-by: Randy Dunlap Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- include/asm-generic/atomic-instrumented.h | 335 +++++++++++++++--------------- include/asm-generic/atomic-long.h | 331 ++++++++++++++--------------- scripts/atomic/gen-atomic-instrumented.sh | 7 +- scripts/atomic/gen-atomic-long.sh | 3 +- 4 files changed, 340 insertions(+), 336 deletions(-) diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h index 3dc0f38544f6..63869ded73ac 100644 --- a/include/asm-generic/atomic-instrumented.h +++ b/include/asm-generic/atomic-instrumented.h @@ -18,22 +18,23 @@ #define _ASM_GENERIC_ATOMIC_INSTRUMENTED_H #include +#include #include #include -static inline void __atomic_check_read(const volatile void *v, size_t size) +static __always_inline void __atomic_check_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); } -static inline void __atomic_check_write(const volatile void *v, size_t size) +static __always_inline void __atomic_check_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); } -static inline int +static __always_inline int atomic_read(const atomic_t *v) { __atomic_check_read(v, sizeof(*v)); @@ -42,7 +43,7 @@ atomic_read(const atomic_t *v) #define atomic_read atomic_read #if defined(arch_atomic_read_acquire) -static inline int +static __always_inline int atomic_read_acquire(const atomic_t *v) { __atomic_check_read(v, sizeof(*v)); @@ -51,7 +52,7 @@ atomic_read_acquire(const atomic_t *v) #define atomic_read_acquire atomic_read_acquire #endif -static inline void +static __always_inline void atomic_set(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -60,7 +61,7 @@ atomic_set(atomic_t *v, int i) #define atomic_set atomic_set #if 
defined(arch_atomic_set_release) -static inline void +static __always_inline void atomic_set_release(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -69,7 +70,7 @@ atomic_set_release(atomic_t *v, int i) #define atomic_set_release atomic_set_release #endif -static inline void +static __always_inline void atomic_add(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -78,7 +79,7 @@ atomic_add(int i, atomic_t *v) #define atomic_add atomic_add #if !defined(arch_atomic_add_return_relaxed) || defined(arch_atomic_add_return) -static inline int +static __always_inline int atomic_add_return(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -88,7 +89,7 @@ atomic_add_return(int i, atomic_t *v) #endif #if defined(arch_atomic_add_return_acquire) -static inline int +static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -98,7 +99,7 @@ atomic_add_return_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_add_return_release) -static inline int +static __always_inline int atomic_add_return_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -108,7 +109,7 @@ atomic_add_return_release(int i, atomic_t *v) #endif #if defined(arch_atomic_add_return_relaxed) -static inline int +static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -118,7 +119,7 @@ atomic_add_return_relaxed(int i, atomic_t *v) #endif #if !defined(arch_atomic_fetch_add_relaxed) || defined(arch_atomic_fetch_add) -static inline int +static __always_inline int atomic_fetch_add(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -128,7 +129,7 @@ atomic_fetch_add(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_add_acquire) -static inline int +static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -138,7 +139,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_add_release) -static inline int +static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -148,7 +149,7 @@ atomic_fetch_add_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_add_relaxed) -static inline int +static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -157,7 +158,7 @@ atomic_fetch_add_relaxed(int i, atomic_t *v) #define atomic_fetch_add_relaxed atomic_fetch_add_relaxed #endif -static inline void +static __always_inline void atomic_sub(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -166,7 +167,7 @@ atomic_sub(int i, atomic_t *v) #define atomic_sub atomic_sub #if !defined(arch_atomic_sub_return_relaxed) || defined(arch_atomic_sub_return) -static inline int +static __always_inline int atomic_sub_return(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -176,7 +177,7 @@ atomic_sub_return(int i, atomic_t *v) #endif #if defined(arch_atomic_sub_return_acquire) -static inline int +static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -186,7 +187,7 @@ atomic_sub_return_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_sub_return_release) -static inline int +static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -196,7 +197,7 @@ atomic_sub_return_release(int i, atomic_t *v) #endif #if defined(arch_atomic_sub_return_relaxed) 
-static inline int +static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -206,7 +207,7 @@ atomic_sub_return_relaxed(int i, atomic_t *v) #endif #if !defined(arch_atomic_fetch_sub_relaxed) || defined(arch_atomic_fetch_sub) -static inline int +static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -216,7 +217,7 @@ atomic_fetch_sub(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_sub_acquire) -static inline int +static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -226,7 +227,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_sub_release) -static inline int +static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -236,7 +237,7 @@ atomic_fetch_sub_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_sub_relaxed) -static inline int +static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -246,7 +247,7 @@ atomic_fetch_sub_relaxed(int i, atomic_t *v) #endif #if defined(arch_atomic_inc) -static inline void +static __always_inline void atomic_inc(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -256,7 +257,7 @@ atomic_inc(atomic_t *v) #endif #if defined(arch_atomic_inc_return) -static inline int +static __always_inline int atomic_inc_return(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -266,7 +267,7 @@ atomic_inc_return(atomic_t *v) #endif #if defined(arch_atomic_inc_return_acquire) -static inline int +static __always_inline int atomic_inc_return_acquire(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -276,7 +277,7 @@ atomic_inc_return_acquire(atomic_t *v) #endif #if defined(arch_atomic_inc_return_release) -static inline int +static __always_inline int atomic_inc_return_release(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -286,7 +287,7 @@ atomic_inc_return_release(atomic_t *v) #endif #if defined(arch_atomic_inc_return_relaxed) -static inline int +static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -296,7 +297,7 @@ atomic_inc_return_relaxed(atomic_t *v) #endif #if defined(arch_atomic_fetch_inc) -static inline int +static __always_inline int atomic_fetch_inc(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -306,7 +307,7 @@ atomic_fetch_inc(atomic_t *v) #endif #if defined(arch_atomic_fetch_inc_acquire) -static inline int +static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -316,7 +317,7 @@ atomic_fetch_inc_acquire(atomic_t *v) #endif #if defined(arch_atomic_fetch_inc_release) -static inline int +static __always_inline int atomic_fetch_inc_release(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -326,7 +327,7 @@ atomic_fetch_inc_release(atomic_t *v) #endif #if defined(arch_atomic_fetch_inc_relaxed) -static inline int +static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -336,7 +337,7 @@ atomic_fetch_inc_relaxed(atomic_t *v) #endif #if defined(arch_atomic_dec) -static inline void +static __always_inline void atomic_dec(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -346,7 +347,7 @@ atomic_dec(atomic_t *v) #endif #if defined(arch_atomic_dec_return) -static inline int +static __always_inline int atomic_dec_return(atomic_t *v) { __atomic_check_write(v, 
sizeof(*v)); @@ -356,7 +357,7 @@ atomic_dec_return(atomic_t *v) #endif #if defined(arch_atomic_dec_return_acquire) -static inline int +static __always_inline int atomic_dec_return_acquire(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -366,7 +367,7 @@ atomic_dec_return_acquire(atomic_t *v) #endif #if defined(arch_atomic_dec_return_release) -static inline int +static __always_inline int atomic_dec_return_release(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -376,7 +377,7 @@ atomic_dec_return_release(atomic_t *v) #endif #if defined(arch_atomic_dec_return_relaxed) -static inline int +static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -386,7 +387,7 @@ atomic_dec_return_relaxed(atomic_t *v) #endif #if defined(arch_atomic_fetch_dec) -static inline int +static __always_inline int atomic_fetch_dec(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -396,7 +397,7 @@ atomic_fetch_dec(atomic_t *v) #endif #if defined(arch_atomic_fetch_dec_acquire) -static inline int +static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -406,7 +407,7 @@ atomic_fetch_dec_acquire(atomic_t *v) #endif #if defined(arch_atomic_fetch_dec_release) -static inline int +static __always_inline int atomic_fetch_dec_release(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -416,7 +417,7 @@ atomic_fetch_dec_release(atomic_t *v) #endif #if defined(arch_atomic_fetch_dec_relaxed) -static inline int +static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -425,7 +426,7 @@ atomic_fetch_dec_relaxed(atomic_t *v) #define atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed #endif -static inline void +static __always_inline void atomic_and(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -434,7 +435,7 @@ atomic_and(int i, atomic_t *v) #define atomic_and atomic_and #if !defined(arch_atomic_fetch_and_relaxed) || defined(arch_atomic_fetch_and) -static inline int +static __always_inline int atomic_fetch_and(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -444,7 +445,7 @@ atomic_fetch_and(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_and_acquire) -static inline int +static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -454,7 +455,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_and_release) -static inline int +static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -464,7 +465,7 @@ atomic_fetch_and_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_and_relaxed) -static inline int +static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -474,7 +475,7 @@ atomic_fetch_and_relaxed(int i, atomic_t *v) #endif #if defined(arch_atomic_andnot) -static inline void +static __always_inline void atomic_andnot(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -484,7 +485,7 @@ atomic_andnot(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_andnot) -static inline int +static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -494,7 +495,7 @@ atomic_fetch_andnot(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_andnot_acquire) -static inline int +static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { __atomic_check_write(v, 
sizeof(*v)); @@ -504,7 +505,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_andnot_release) -static inline int +static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -514,7 +515,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_andnot_relaxed) -static inline int +static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -523,7 +524,7 @@ atomic_fetch_andnot_relaxed(int i, atomic_t *v) #define atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed #endif -static inline void +static __always_inline void atomic_or(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -532,7 +533,7 @@ atomic_or(int i, atomic_t *v) #define atomic_or atomic_or #if !defined(arch_atomic_fetch_or_relaxed) || defined(arch_atomic_fetch_or) -static inline int +static __always_inline int atomic_fetch_or(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -542,7 +543,7 @@ atomic_fetch_or(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_or_acquire) -static inline int +static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -552,7 +553,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_or_release) -static inline int +static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -562,7 +563,7 @@ atomic_fetch_or_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_or_relaxed) -static inline int +static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -571,7 +572,7 @@ atomic_fetch_or_relaxed(int i, atomic_t *v) #define atomic_fetch_or_relaxed atomic_fetch_or_relaxed #endif -static inline void +static __always_inline void atomic_xor(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -580,7 +581,7 @@ atomic_xor(int i, atomic_t *v) #define atomic_xor atomic_xor #if !defined(arch_atomic_fetch_xor_relaxed) || defined(arch_atomic_fetch_xor) -static inline int +static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -590,7 +591,7 @@ atomic_fetch_xor(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_xor_acquire) -static inline int +static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -600,7 +601,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_xor_release) -static inline int +static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -610,7 +611,7 @@ atomic_fetch_xor_release(int i, atomic_t *v) #endif #if defined(arch_atomic_fetch_xor_relaxed) -static inline int +static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -620,7 +621,7 @@ atomic_fetch_xor_relaxed(int i, atomic_t *v) #endif #if !defined(arch_atomic_xchg_relaxed) || defined(arch_atomic_xchg) -static inline int +static __always_inline int atomic_xchg(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -630,7 +631,7 @@ atomic_xchg(atomic_t *v, int i) #endif #if defined(arch_atomic_xchg_acquire) -static inline int +static __always_inline int atomic_xchg_acquire(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -640,7 +641,7 @@ 
atomic_xchg_acquire(atomic_t *v, int i) #endif #if defined(arch_atomic_xchg_release) -static inline int +static __always_inline int atomic_xchg_release(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -650,7 +651,7 @@ atomic_xchg_release(atomic_t *v, int i) #endif #if defined(arch_atomic_xchg_relaxed) -static inline int +static __always_inline int atomic_xchg_relaxed(atomic_t *v, int i) { __atomic_check_write(v, sizeof(*v)); @@ -660,7 +661,7 @@ atomic_xchg_relaxed(atomic_t *v, int i) #endif #if !defined(arch_atomic_cmpxchg_relaxed) || defined(arch_atomic_cmpxchg) -static inline int +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -670,7 +671,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new) #endif #if defined(arch_atomic_cmpxchg_acquire) -static inline int +static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -680,7 +681,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) #endif #if defined(arch_atomic_cmpxchg_release) -static inline int +static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -690,7 +691,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new) #endif #if defined(arch_atomic_cmpxchg_relaxed) -static inline int +static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -700,7 +701,7 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) #endif #if defined(arch_atomic_try_cmpxchg) -static inline bool +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -711,7 +712,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) #endif #if defined(arch_atomic_try_cmpxchg_acquire) -static inline bool +static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -722,7 +723,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) #endif #if defined(arch_atomic_try_cmpxchg_release) -static inline bool +static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -733,7 +734,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) #endif #if defined(arch_atomic_try_cmpxchg_relaxed) -static inline bool +static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { __atomic_check_write(v, sizeof(*v)); @@ -744,7 +745,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) #endif #if defined(arch_atomic_sub_and_test) -static inline bool +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -754,7 +755,7 @@ atomic_sub_and_test(int i, atomic_t *v) #endif #if defined(arch_atomic_dec_and_test) -static inline bool +static __always_inline bool atomic_dec_and_test(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -764,7 +765,7 @@ atomic_dec_and_test(atomic_t *v) #endif #if defined(arch_atomic_inc_and_test) -static inline bool +static __always_inline bool atomic_inc_and_test(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -774,7 +775,7 @@ atomic_inc_and_test(atomic_t *v) #endif #if defined(arch_atomic_add_negative) -static inline bool +static __always_inline bool atomic_add_negative(int i, atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -784,7 +785,7 @@ atomic_add_negative(int i, atomic_t 
*v) #endif #if defined(arch_atomic_fetch_add_unless) -static inline int +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { __atomic_check_write(v, sizeof(*v)); @@ -794,7 +795,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) #endif #if defined(arch_atomic_add_unless) -static inline bool +static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { __atomic_check_write(v, sizeof(*v)); @@ -804,7 +805,7 @@ atomic_add_unless(atomic_t *v, int a, int u) #endif #if defined(arch_atomic_inc_not_zero) -static inline bool +static __always_inline bool atomic_inc_not_zero(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -814,7 +815,7 @@ atomic_inc_not_zero(atomic_t *v) #endif #if defined(arch_atomic_inc_unless_negative) -static inline bool +static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -824,7 +825,7 @@ atomic_inc_unless_negative(atomic_t *v) #endif #if defined(arch_atomic_dec_unless_positive) -static inline bool +static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -834,7 +835,7 @@ atomic_dec_unless_positive(atomic_t *v) #endif #if defined(arch_atomic_dec_if_positive) -static inline int +static __always_inline int atomic_dec_if_positive(atomic_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -843,7 +844,7 @@ atomic_dec_if_positive(atomic_t *v) #define atomic_dec_if_positive atomic_dec_if_positive #endif -static inline s64 +static __always_inline s64 atomic64_read(const atomic64_t *v) { __atomic_check_read(v, sizeof(*v)); @@ -852,7 +853,7 @@ atomic64_read(const atomic64_t *v) #define atomic64_read atomic64_read #if defined(arch_atomic64_read_acquire) -static inline s64 +static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { __atomic_check_read(v, sizeof(*v)); @@ -861,7 +862,7 @@ atomic64_read_acquire(const atomic64_t *v) #define atomic64_read_acquire atomic64_read_acquire #endif -static inline void +static __always_inline void atomic64_set(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -870,7 +871,7 @@ atomic64_set(atomic64_t *v, s64 i) #define atomic64_set atomic64_set #if defined(arch_atomic64_set_release) -static inline void +static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -879,7 +880,7 @@ atomic64_set_release(atomic64_t *v, s64 i) #define atomic64_set_release atomic64_set_release #endif -static inline void +static __always_inline void atomic64_add(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -888,7 +889,7 @@ atomic64_add(s64 i, atomic64_t *v) #define atomic64_add atomic64_add #if !defined(arch_atomic64_add_return_relaxed) || defined(arch_atomic64_add_return) -static inline s64 +static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -898,7 +899,7 @@ atomic64_add_return(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_add_return_acquire) -static inline s64 +static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -908,7 +909,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_add_return_release) -static inline s64 +static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -918,7 +919,7 @@ atomic64_add_return_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_add_return_relaxed) 
-static inline s64 +static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -928,7 +929,7 @@ atomic64_add_return_relaxed(s64 i, atomic64_t *v) #endif #if !defined(arch_atomic64_fetch_add_relaxed) || defined(arch_atomic64_fetch_add) -static inline s64 +static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -938,7 +939,7 @@ atomic64_fetch_add(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_add_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -948,7 +949,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_add_release) -static inline s64 +static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -958,7 +959,7 @@ atomic64_fetch_add_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_add_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -967,7 +968,7 @@ atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) #define atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed #endif -static inline void +static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -976,7 +977,7 @@ atomic64_sub(s64 i, atomic64_t *v) #define atomic64_sub atomic64_sub #if !defined(arch_atomic64_sub_return_relaxed) || defined(arch_atomic64_sub_return) -static inline s64 +static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -986,7 +987,7 @@ atomic64_sub_return(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_sub_return_acquire) -static inline s64 +static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -996,7 +997,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_sub_return_release) -static inline s64 +static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1006,7 +1007,7 @@ atomic64_sub_return_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_sub_return_relaxed) -static inline s64 +static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1016,7 +1017,7 @@ atomic64_sub_return_relaxed(s64 i, atomic64_t *v) #endif #if !defined(arch_atomic64_fetch_sub_relaxed) || defined(arch_atomic64_fetch_sub) -static inline s64 +static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1026,7 +1027,7 @@ atomic64_fetch_sub(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_sub_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1036,7 +1037,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_sub_release) -static inline s64 +static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1046,7 +1047,7 @@ atomic64_fetch_sub_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_sub_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { 
__atomic_check_write(v, sizeof(*v)); @@ -1056,7 +1057,7 @@ atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_inc) -static inline void +static __always_inline void atomic64_inc(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1066,7 +1067,7 @@ atomic64_inc(atomic64_t *v) #endif #if defined(arch_atomic64_inc_return) -static inline s64 +static __always_inline s64 atomic64_inc_return(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1076,7 +1077,7 @@ atomic64_inc_return(atomic64_t *v) #endif #if defined(arch_atomic64_inc_return_acquire) -static inline s64 +static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1086,7 +1087,7 @@ atomic64_inc_return_acquire(atomic64_t *v) #endif #if defined(arch_atomic64_inc_return_release) -static inline s64 +static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1096,7 +1097,7 @@ atomic64_inc_return_release(atomic64_t *v) #endif #if defined(arch_atomic64_inc_return_relaxed) -static inline s64 +static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1106,7 +1107,7 @@ atomic64_inc_return_relaxed(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_inc) -static inline s64 +static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1116,7 +1117,7 @@ atomic64_fetch_inc(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_inc_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1126,7 +1127,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_inc_release) -static inline s64 +static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1136,7 +1137,7 @@ atomic64_fetch_inc_release(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_inc_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1146,7 +1147,7 @@ atomic64_fetch_inc_relaxed(atomic64_t *v) #endif #if defined(arch_atomic64_dec) -static inline void +static __always_inline void atomic64_dec(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1156,7 +1157,7 @@ atomic64_dec(atomic64_t *v) #endif #if defined(arch_atomic64_dec_return) -static inline s64 +static __always_inline s64 atomic64_dec_return(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1166,7 +1167,7 @@ atomic64_dec_return(atomic64_t *v) #endif #if defined(arch_atomic64_dec_return_acquire) -static inline s64 +static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1176,7 +1177,7 @@ atomic64_dec_return_acquire(atomic64_t *v) #endif #if defined(arch_atomic64_dec_return_release) -static inline s64 +static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1186,7 +1187,7 @@ atomic64_dec_return_release(atomic64_t *v) #endif #if defined(arch_atomic64_dec_return_relaxed) -static inline s64 +static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1196,7 +1197,7 @@ atomic64_dec_return_relaxed(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_dec) -static inline s64 +static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { __atomic_check_write(v, 
sizeof(*v)); @@ -1206,7 +1207,7 @@ atomic64_fetch_dec(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_dec_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1216,7 +1217,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_dec_release) -static inline s64 +static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1226,7 +1227,7 @@ atomic64_fetch_dec_release(atomic64_t *v) #endif #if defined(arch_atomic64_fetch_dec_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1235,7 +1236,7 @@ atomic64_fetch_dec_relaxed(atomic64_t *v) #define atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed #endif -static inline void +static __always_inline void atomic64_and(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1244,7 +1245,7 @@ atomic64_and(s64 i, atomic64_t *v) #define atomic64_and atomic64_and #if !defined(arch_atomic64_fetch_and_relaxed) || defined(arch_atomic64_fetch_and) -static inline s64 +static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1254,7 +1255,7 @@ atomic64_fetch_and(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_and_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1264,7 +1265,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_and_release) -static inline s64 +static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1274,7 +1275,7 @@ atomic64_fetch_and_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_and_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1284,7 +1285,7 @@ atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_andnot) -static inline void +static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1294,7 +1295,7 @@ atomic64_andnot(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_andnot) -static inline s64 +static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1304,7 +1305,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_andnot_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1314,7 +1315,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_andnot_release) -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1324,7 +1325,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_andnot_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1333,7 +1334,7 @@ atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) #define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed #endif -static inline void +static __always_inline void atomic64_or(s64 i, atomic64_t 
*v) { __atomic_check_write(v, sizeof(*v)); @@ -1342,7 +1343,7 @@ atomic64_or(s64 i, atomic64_t *v) #define atomic64_or atomic64_or #if !defined(arch_atomic64_fetch_or_relaxed) || defined(arch_atomic64_fetch_or) -static inline s64 +static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1352,7 +1353,7 @@ atomic64_fetch_or(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_or_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1362,7 +1363,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_or_release) -static inline s64 +static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1372,7 +1373,7 @@ atomic64_fetch_or_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_or_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1381,7 +1382,7 @@ atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) #define atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed #endif -static inline void +static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1390,7 +1391,7 @@ atomic64_xor(s64 i, atomic64_t *v) #define atomic64_xor atomic64_xor #if !defined(arch_atomic64_fetch_xor_relaxed) || defined(arch_atomic64_fetch_xor) -static inline s64 +static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1400,7 +1401,7 @@ atomic64_fetch_xor(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_xor_acquire) -static inline s64 +static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1410,7 +1411,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_xor_release) -static inline s64 +static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1420,7 +1421,7 @@ atomic64_fetch_xor_release(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_xor_relaxed) -static inline s64 +static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1430,7 +1431,7 @@ atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) #endif #if !defined(arch_atomic64_xchg_relaxed) || defined(arch_atomic64_xchg) -static inline s64 +static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -1440,7 +1441,7 @@ atomic64_xchg(atomic64_t *v, s64 i) #endif #if defined(arch_atomic64_xchg_acquire) -static inline s64 +static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -1450,7 +1451,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i) #endif #if defined(arch_atomic64_xchg_release) -static inline s64 +static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -1460,7 +1461,7 @@ atomic64_xchg_release(atomic64_t *v, s64 i) #endif #if defined(arch_atomic64_xchg_relaxed) -static inline s64 +static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 i) { __atomic_check_write(v, sizeof(*v)); @@ -1470,7 +1471,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 i) #endif #if !defined(arch_atomic64_cmpxchg_relaxed) || 
defined(arch_atomic64_cmpxchg) -static inline s64 +static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1480,7 +1481,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) #endif #if defined(arch_atomic64_cmpxchg_acquire) -static inline s64 +static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1490,7 +1491,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) #endif #if defined(arch_atomic64_cmpxchg_release) -static inline s64 +static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1500,7 +1501,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) #endif #if defined(arch_atomic64_cmpxchg_relaxed) -static inline s64 +static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1510,7 +1511,7 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) #endif #if defined(arch_atomic64_try_cmpxchg) -static inline bool +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1521,7 +1522,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) #endif #if defined(arch_atomic64_try_cmpxchg_acquire) -static inline bool +static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1532,7 +1533,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) #endif #if defined(arch_atomic64_try_cmpxchg_release) -static inline bool +static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1543,7 +1544,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) #endif #if defined(arch_atomic64_try_cmpxchg_relaxed) -static inline bool +static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { __atomic_check_write(v, sizeof(*v)); @@ -1554,7 +1555,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) #endif #if defined(arch_atomic64_sub_and_test) -static inline bool +static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1564,7 +1565,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_dec_and_test) -static inline bool +static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1574,7 +1575,7 @@ atomic64_dec_and_test(atomic64_t *v) #endif #if defined(arch_atomic64_inc_and_test) -static inline bool +static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1584,7 +1585,7 @@ atomic64_inc_and_test(atomic64_t *v) #endif #if defined(arch_atomic64_add_negative) -static inline bool +static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1594,7 +1595,7 @@ atomic64_add_negative(s64 i, atomic64_t *v) #endif #if defined(arch_atomic64_fetch_add_unless) -static inline s64 +static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { __atomic_check_write(v, sizeof(*v)); @@ -1604,7 +1605,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) #endif #if defined(arch_atomic64_add_unless) -static inline bool +static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, 
s64 u) { __atomic_check_write(v, sizeof(*v)); @@ -1614,7 +1615,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) #endif #if defined(arch_atomic64_inc_not_zero) -static inline bool +static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1624,7 +1625,7 @@ atomic64_inc_not_zero(atomic64_t *v) #endif #if defined(arch_atomic64_inc_unless_negative) -static inline bool +static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1634,7 +1635,7 @@ atomic64_inc_unless_negative(atomic64_t *v) #endif #if defined(arch_atomic64_dec_unless_positive) -static inline bool +static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1644,7 +1645,7 @@ atomic64_dec_unless_positive(atomic64_t *v) #endif #if defined(arch_atomic64_dec_if_positive) -static inline s64 +static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { __atomic_check_write(v, sizeof(*v)); @@ -1798,4 +1799,4 @@ atomic64_dec_if_positive(atomic64_t *v) }) #endif /* _ASM_GENERIC_ATOMIC_INSTRUMENTED_H */ -// beea41c2a0f2c69e4958ed71bf26f59740fa4b12 +// 7b7e2af0e75c8ecb6f02298a7075f503f30d244c diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h index 881c7e27af28..073cf40f431b 100644 --- a/include/asm-generic/atomic-long.h +++ b/include/asm-generic/atomic-long.h @@ -6,6 +6,7 @@ #ifndef _ASM_GENERIC_ATOMIC_LONG_H #define _ASM_GENERIC_ATOMIC_LONG_H +#include #include #ifdef CONFIG_64BIT @@ -22,493 +23,493 @@ typedef atomic_t atomic_long_t; #ifdef CONFIG_64BIT -static inline long +static __always_inline long atomic_long_read(const atomic_long_t *v) { return atomic64_read(v); } -static inline long +static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { return atomic64_read_acquire(v); } -static inline void +static __always_inline void atomic_long_set(atomic_long_t *v, long i) { atomic64_set(v, i); } -static inline void +static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { atomic64_set_release(v, i); } -static inline void +static __always_inline void atomic_long_add(long i, atomic_long_t *v) { atomic64_add(i, v); } -static inline long +static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { return atomic64_add_return(i, v); } -static inline long +static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { return atomic64_add_return_acquire(i, v); } -static inline long +static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { return atomic64_add_return_release(i, v); } -static inline long +static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { return atomic64_add_return_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { return atomic64_fetch_add(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { return atomic64_fetch_add_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { return atomic64_fetch_add_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_add_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { atomic64_sub(i, v); } -static inline 
long +static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { return atomic64_sub_return(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { return atomic64_sub_return_acquire(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { return atomic64_sub_return_release(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { return atomic64_sub_return_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { return atomic64_fetch_sub(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { return atomic64_fetch_sub_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { return atomic64_fetch_sub_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_sub_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_inc(atomic_long_t *v) { atomic64_inc(v); } -static inline long +static __always_inline long atomic_long_inc_return(atomic_long_t *v) { return atomic64_inc_return(v); } -static inline long +static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { return atomic64_inc_return_acquire(v); } -static inline long +static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { return atomic64_inc_return_release(v); } -static inline long +static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { return atomic64_inc_return_relaxed(v); } -static inline long +static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { return atomic64_fetch_inc(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { return atomic64_fetch_inc_acquire(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { return atomic64_fetch_inc_release(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { return atomic64_fetch_inc_relaxed(v); } -static inline void +static __always_inline void atomic_long_dec(atomic_long_t *v) { atomic64_dec(v); } -static inline long +static __always_inline long atomic_long_dec_return(atomic_long_t *v) { return atomic64_dec_return(v); } -static inline long +static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { return atomic64_dec_return_acquire(v); } -static inline long +static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { return atomic64_dec_return_release(v); } -static inline long +static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { return atomic64_dec_return_relaxed(v); } -static inline long +static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { return atomic64_fetch_dec(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { return atomic64_fetch_dec_acquire(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { return atomic64_fetch_dec_release(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { return atomic64_fetch_dec_relaxed(v); } -static 
inline void +static __always_inline void atomic_long_and(long i, atomic_long_t *v) { atomic64_and(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { return atomic64_fetch_and(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { return atomic64_fetch_and_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { return atomic64_fetch_and_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_and_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { atomic64_andnot(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { return atomic64_fetch_andnot(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { return atomic64_fetch_andnot_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { return atomic64_fetch_andnot_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_andnot_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_or(long i, atomic_long_t *v) { atomic64_or(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { return atomic64_fetch_or(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { return atomic64_fetch_or_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { return atomic64_fetch_or_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_or_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { atomic64_xor(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { return atomic64_fetch_xor(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { return atomic64_fetch_xor_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { return atomic64_fetch_xor_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { return atomic64_fetch_xor_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_xchg(atomic_long_t *v, long i) { return atomic64_xchg(v, i); } -static inline long +static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long i) { return atomic64_xchg_acquire(v, i); } -static inline long +static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long i) { return atomic64_xchg_release(v, i); } -static inline long +static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long i) { return atomic64_xchg_relaxed(v, i); } -static inline long +static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { return atomic64_cmpxchg(v, old, new); } -static inline long +static __always_inline long 
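The atomic_long_*() wrappers converted in this part of the patch are equally thin. include/asm-generic/atomic-long.h maps the whole atomic_long_t API onto atomic64_t when CONFIG_64BIT is set (the branch shown here) and onto atomic_t in the #else branch further down, so atomic_long_t always has the native word size. A condensed sketch of that mapping, not the generated file itself:

#ifdef CONFIG_64BIT
typedef atomic64_t atomic_long_t;

static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        return atomic64_fetch_add(i, v);        /* long is 64 bits here */
}
#else
typedef atomic_t atomic_long_t;

static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        return atomic_fetch_add(i, v);          /* long is 32 bits here */
}
#endif

The only wrappers that are not pure pass-throughs are the try_cmpxchg() ones, which cast the long *old argument to s64 * or int * to match the underlying operation, as the later hunks show.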
atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { return atomic64_cmpxchg_acquire(v, old, new); } -static inline long +static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { return atomic64_cmpxchg_release(v, old, new); } -static inline long +static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { return atomic64_cmpxchg_relaxed(v, old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { return atomic64_try_cmpxchg(v, (s64 *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { return atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { return atomic64_try_cmpxchg_release(v, (s64 *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { return atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); } -static inline bool +static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { return atomic64_sub_and_test(i, v); } -static inline bool +static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { return atomic64_dec_and_test(v); } -static inline bool +static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { return atomic64_inc_and_test(v); } -static inline bool +static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { return atomic64_add_negative(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { return atomic64_fetch_add_unless(v, a, u); } -static inline bool +static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { return atomic64_add_unless(v, a, u); } -static inline bool +static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { return atomic64_inc_not_zero(v); } -static inline bool +static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { return atomic64_inc_unless_negative(v); } -static inline bool +static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { return atomic64_dec_unless_positive(v); } -static inline long +static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { return atomic64_dec_if_positive(v); @@ -516,493 +517,493 @@ atomic_long_dec_if_positive(atomic_long_t *v) #else /* CONFIG_64BIT */ -static inline long +static __always_inline long atomic_long_read(const atomic_long_t *v) { return atomic_read(v); } -static inline long +static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { return atomic_read_acquire(v); } -static inline void +static __always_inline void atomic_long_set(atomic_long_t *v, long i) { atomic_set(v, i); } -static inline void +static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { atomic_set_release(v, i); } -static inline void +static __always_inline void atomic_long_add(long i, atomic_long_t *v) { atomic_add(i, v); } -static inline long +static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { return atomic_add_return(i, v); } -static inline long +static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { return atomic_add_return_acquire(i, v); } -static inline long 
+static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { return atomic_add_return_release(i, v); } -static inline long +static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { return atomic_add_return_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { return atomic_fetch_add(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { return atomic_fetch_add_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { return atomic_fetch_add_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { return atomic_fetch_add_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { atomic_sub(i, v); } -static inline long +static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { return atomic_sub_return(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { return atomic_sub_return_acquire(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { return atomic_sub_return_release(i, v); } -static inline long +static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { return atomic_sub_return_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { return atomic_fetch_sub(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { return atomic_fetch_sub_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { return atomic_fetch_sub_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { return atomic_fetch_sub_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_inc(atomic_long_t *v) { atomic_inc(v); } -static inline long +static __always_inline long atomic_long_inc_return(atomic_long_t *v) { return atomic_inc_return(v); } -static inline long +static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { return atomic_inc_return_acquire(v); } -static inline long +static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { return atomic_inc_return_release(v); } -static inline long +static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { return atomic_inc_return_relaxed(v); } -static inline long +static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { return atomic_fetch_inc(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { return atomic_fetch_inc_acquire(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { return atomic_fetch_inc_release(v); } -static inline long +static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { return atomic_fetch_inc_relaxed(v); } -static inline void +static __always_inline void atomic_long_dec(atomic_long_t *v) { atomic_dec(v); } -static inline long +static __always_inline long atomic_long_dec_return(atomic_long_t *v) { return atomic_dec_return(v); } -static inline long 
+static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { return atomic_dec_return_acquire(v); } -static inline long +static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { return atomic_dec_return_release(v); } -static inline long +static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { return atomic_dec_return_relaxed(v); } -static inline long +static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { return atomic_fetch_dec(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { return atomic_fetch_dec_acquire(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { return atomic_fetch_dec_release(v); } -static inline long +static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { return atomic_fetch_dec_relaxed(v); } -static inline void +static __always_inline void atomic_long_and(long i, atomic_long_t *v) { atomic_and(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { return atomic_fetch_and(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { return atomic_fetch_and_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { return atomic_fetch_and_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { return atomic_fetch_and_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { atomic_andnot(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { return atomic_fetch_andnot(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { return atomic_fetch_andnot_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { return atomic_fetch_andnot_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { return atomic_fetch_andnot_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_or(long i, atomic_long_t *v) { atomic_or(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { return atomic_fetch_or(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { return atomic_fetch_or_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { return atomic_fetch_or_release(i, v); } -static inline long +static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { return atomic_fetch_or_relaxed(i, v); } -static inline void +static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { atomic_xor(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { return atomic_fetch_xor(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { return atomic_fetch_xor_acquire(i, v); } -static inline long +static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { return atomic_fetch_xor_release(i, v); 
} -static inline long +static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { return atomic_fetch_xor_relaxed(i, v); } -static inline long +static __always_inline long atomic_long_xchg(atomic_long_t *v, long i) { return atomic_xchg(v, i); } -static inline long +static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long i) { return atomic_xchg_acquire(v, i); } -static inline long +static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long i) { return atomic_xchg_release(v, i); } -static inline long +static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long i) { return atomic_xchg_relaxed(v, i); } -static inline long +static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { return atomic_cmpxchg(v, old, new); } -static inline long +static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { return atomic_cmpxchg_acquire(v, old, new); } -static inline long +static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { return atomic_cmpxchg_release(v, old, new); } -static inline long +static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { return atomic_cmpxchg_relaxed(v, old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { return atomic_try_cmpxchg(v, (int *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { return atomic_try_cmpxchg_acquire(v, (int *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { return atomic_try_cmpxchg_release(v, (int *)old, new); } -static inline bool +static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { return atomic_try_cmpxchg_relaxed(v, (int *)old, new); } -static inline bool +static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { return atomic_sub_and_test(i, v); } -static inline bool +static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { return atomic_dec_and_test(v); } -static inline bool +static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { return atomic_inc_and_test(v); } -static inline bool +static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { return atomic_add_negative(i, v); } -static inline long +static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { return atomic_fetch_add_unless(v, a, u); } -static inline bool +static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { return atomic_add_unless(v, a, u); } -static inline bool +static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { return atomic_inc_not_zero(v); } -static inline bool +static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { return atomic_inc_unless_negative(v); } -static inline bool +static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { return atomic_dec_unless_positive(v); } -static inline long +static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { return atomic_dec_if_positive(v); @@ -1010,4 +1011,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* CONFIG_64BIT */ #endif /* _ASM_GENERIC_ATOMIC_LONG_H */ -// 
77558968132ce4f911ad53f6f52ce423006f6268 +// a624200981f552b2c6be4f32fe44da8289f30d87
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 8b8b2a6f8d68..fb4222548b22 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -84,7 +84,7 @@ gen_proto_order_variant() [ ! -z "${guard}" ] && printf "#if ${guard}\n" cat < +#include #include #include -static inline void __atomic_check_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); } -static inline void __atomic_check_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size);
diff --git a/scripts/atomic/gen-atomic-long.sh b/scripts/atomic/gen-atomic-long.sh
index c240a7231b2e..e318d3f92e53 100755
--- a/scripts/atomic/gen-atomic-long.sh
+++ b/scripts/atomic/gen-atomic-long.sh
@@ -46,7 +46,7 @@ gen_proto_order_variant() local retstmt="$(gen_ret_stmt "${meta}")" cat < #include #ifdef CONFIG_64BIT
-- cgit v1.2.1

From 944bc9cca7c392879fa2c3f911bbef7422707679 Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Tue, 26 Nov 2019 15:04:05 +0100
Subject: asm-generic/atomic: Use __always_inline for fallback wrappers

Use __always_inline for atomic fallback wrappers. When building for size
(CC_OPTIMIZE_FOR_SIZE), some compilers appear to be less inclined to inline
even relatively small static inline functions that are assumed to be
inlinable such as atomic ops. This can cause problems, for example in
UACCESS regions.

While the fallback wrappers aren't pure wrappers, they are trivial
nonetheless, and the function they wrap should determine the final
inlining policy.

For x86 tinyconfig we observe:
- vmlinux baseline: 1315988
- vmlinux with patch: 1315928 (-60 bytes)

Suggested-by: Mark Rutland
Signed-off-by: Marco Elver
Acked-by: Mark Rutland
Signed-off-by: Paul E.
McKenney --- include/linux/atomic-fallback.h | 340 ++++++++++++++------------- scripts/atomic/fallbacks/acquire | 2 +- scripts/atomic/fallbacks/add_negative | 2 +- scripts/atomic/fallbacks/add_unless | 2 +- scripts/atomic/fallbacks/andnot | 2 +- scripts/atomic/fallbacks/dec | 2 +- scripts/atomic/fallbacks/dec_and_test | 2 +- scripts/atomic/fallbacks/dec_if_positive | 2 +- scripts/atomic/fallbacks/dec_unless_positive | 2 +- scripts/atomic/fallbacks/fence | 2 +- scripts/atomic/fallbacks/fetch_add_unless | 2 +- scripts/atomic/fallbacks/inc | 2 +- scripts/atomic/fallbacks/inc_and_test | 2 +- scripts/atomic/fallbacks/inc_not_zero | 2 +- scripts/atomic/fallbacks/inc_unless_negative | 2 +- scripts/atomic/fallbacks/read_acquire | 2 +- scripts/atomic/fallbacks/release | 2 +- scripts/atomic/fallbacks/set_release | 2 +- scripts/atomic/fallbacks/sub_and_test | 2 +- scripts/atomic/fallbacks/try_cmpxchg | 2 +- scripts/atomic/gen-atomic-fallback.sh | 2 + 21 files changed, 192 insertions(+), 188 deletions(-) diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h index a7d240e465c0..656b5489b673 100644 --- a/include/linux/atomic-fallback.h +++ b/include/linux/atomic-fallback.h @@ -6,6 +6,8 @@ #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H +#include + #ifndef xchg_relaxed #define xchg_relaxed xchg #define xchg_acquire xchg @@ -76,7 +78,7 @@ #endif /* cmpxchg64_relaxed */ #ifndef atomic_read_acquire -static inline int +static __always_inline int atomic_read_acquire(const atomic_t *v) { return smp_load_acquire(&(v)->counter); @@ -85,7 +87,7 @@ atomic_read_acquire(const atomic_t *v) #endif #ifndef atomic_set_release -static inline void +static __always_inline void atomic_set_release(atomic_t *v, int i) { smp_store_release(&(v)->counter, i); @@ -100,7 +102,7 @@ atomic_set_release(atomic_t *v, int i) #else /* atomic_add_return_relaxed */ #ifndef atomic_add_return_acquire -static inline int +static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { int ret = atomic_add_return_relaxed(i, v); @@ -111,7 +113,7 @@ atomic_add_return_acquire(int i, atomic_t *v) #endif #ifndef atomic_add_return_release -static inline int +static __always_inline int atomic_add_return_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -121,7 +123,7 @@ atomic_add_return_release(int i, atomic_t *v) #endif #ifndef atomic_add_return -static inline int +static __always_inline int atomic_add_return(int i, atomic_t *v) { int ret; @@ -142,7 +144,7 @@ atomic_add_return(int i, atomic_t *v) #else /* atomic_fetch_add_relaxed */ #ifndef atomic_fetch_add_acquire -static inline int +static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { int ret = atomic_fetch_add_relaxed(i, v); @@ -153,7 +155,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_add_release -static inline int +static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -163,7 +165,7 @@ atomic_fetch_add_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_add -static inline int +static __always_inline int atomic_fetch_add(int i, atomic_t *v) { int ret; @@ -184,7 +186,7 @@ atomic_fetch_add(int i, atomic_t *v) #else /* atomic_sub_return_relaxed */ #ifndef atomic_sub_return_acquire -static inline int +static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { int ret = atomic_sub_return_relaxed(i, v); @@ -195,7 +197,7 @@ atomic_sub_return_acquire(int i, atomic_t *v) #endif #ifndef atomic_sub_return_release -static inline 
int +static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -205,7 +207,7 @@ atomic_sub_return_release(int i, atomic_t *v) #endif #ifndef atomic_sub_return -static inline int +static __always_inline int atomic_sub_return(int i, atomic_t *v) { int ret; @@ -226,7 +228,7 @@ atomic_sub_return(int i, atomic_t *v) #else /* atomic_fetch_sub_relaxed */ #ifndef atomic_fetch_sub_acquire -static inline int +static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { int ret = atomic_fetch_sub_relaxed(i, v); @@ -237,7 +239,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_sub_release -static inline int +static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -247,7 +249,7 @@ atomic_fetch_sub_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_sub -static inline int +static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { int ret; @@ -262,7 +264,7 @@ atomic_fetch_sub(int i, atomic_t *v) #endif /* atomic_fetch_sub_relaxed */ #ifndef atomic_inc -static inline void +static __always_inline void atomic_inc(atomic_t *v) { atomic_add(1, v); @@ -278,7 +280,7 @@ atomic_inc(atomic_t *v) #endif /* atomic_inc_return */ #ifndef atomic_inc_return -static inline int +static __always_inline int atomic_inc_return(atomic_t *v) { return atomic_add_return(1, v); @@ -287,7 +289,7 @@ atomic_inc_return(atomic_t *v) #endif #ifndef atomic_inc_return_acquire -static inline int +static __always_inline int atomic_inc_return_acquire(atomic_t *v) { return atomic_add_return_acquire(1, v); @@ -296,7 +298,7 @@ atomic_inc_return_acquire(atomic_t *v) #endif #ifndef atomic_inc_return_release -static inline int +static __always_inline int atomic_inc_return_release(atomic_t *v) { return atomic_add_return_release(1, v); @@ -305,7 +307,7 @@ atomic_inc_return_release(atomic_t *v) #endif #ifndef atomic_inc_return_relaxed -static inline int +static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { return atomic_add_return_relaxed(1, v); @@ -316,7 +318,7 @@ atomic_inc_return_relaxed(atomic_t *v) #else /* atomic_inc_return_relaxed */ #ifndef atomic_inc_return_acquire -static inline int +static __always_inline int atomic_inc_return_acquire(atomic_t *v) { int ret = atomic_inc_return_relaxed(v); @@ -327,7 +329,7 @@ atomic_inc_return_acquire(atomic_t *v) #endif #ifndef atomic_inc_return_release -static inline int +static __always_inline int atomic_inc_return_release(atomic_t *v) { __atomic_release_fence(); @@ -337,7 +339,7 @@ atomic_inc_return_release(atomic_t *v) #endif #ifndef atomic_inc_return -static inline int +static __always_inline int atomic_inc_return(atomic_t *v) { int ret; @@ -359,7 +361,7 @@ atomic_inc_return(atomic_t *v) #endif /* atomic_fetch_inc */ #ifndef atomic_fetch_inc -static inline int +static __always_inline int atomic_fetch_inc(atomic_t *v) { return atomic_fetch_add(1, v); @@ -368,7 +370,7 @@ atomic_fetch_inc(atomic_t *v) #endif #ifndef atomic_fetch_inc_acquire -static inline int +static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { return atomic_fetch_add_acquire(1, v); @@ -377,7 +379,7 @@ atomic_fetch_inc_acquire(atomic_t *v) #endif #ifndef atomic_fetch_inc_release -static inline int +static __always_inline int atomic_fetch_inc_release(atomic_t *v) { return atomic_fetch_add_release(1, v); @@ -386,7 +388,7 @@ atomic_fetch_inc_release(atomic_t *v) #endif #ifndef atomic_fetch_inc_relaxed -static inline int +static __always_inline int 
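A pattern worth noting in these atomic-fallback.h hunks: when an architecture provides only the _relaxed form of an operation, the generated header derives the ordered variants from it with explicit barriers, and these derived wrappers are exactly the trivial, not-quite-pure wrappers the changelog above describes. A simplified sketch of the derivation (the real file wraps each definition in #ifndef guards, omitted here):

static __always_inline int
atomic_add_return_acquire(int i, atomic_t *v)
{
        int ret = atomic_add_return_relaxed(i, v);

        __atomic_acquire_fence();       /* order the access before later accesses */
        return ret;
}

static __always_inline int
atomic_add_return_release(int i, atomic_t *v)
{
        __atomic_release_fence();       /* order earlier accesses before the access */
        return atomic_add_return_relaxed(i, v);
}

static __always_inline int
atomic_add_return(int i, atomic_t *v)
{
        int ret;

        __atomic_pre_full_fence();
        ret = atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
}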
atomic_fetch_inc_relaxed(atomic_t *v) { return atomic_fetch_add_relaxed(1, v); @@ -397,7 +399,7 @@ atomic_fetch_inc_relaxed(atomic_t *v) #else /* atomic_fetch_inc_relaxed */ #ifndef atomic_fetch_inc_acquire -static inline int +static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { int ret = atomic_fetch_inc_relaxed(v); @@ -408,7 +410,7 @@ atomic_fetch_inc_acquire(atomic_t *v) #endif #ifndef atomic_fetch_inc_release -static inline int +static __always_inline int atomic_fetch_inc_release(atomic_t *v) { __atomic_release_fence(); @@ -418,7 +420,7 @@ atomic_fetch_inc_release(atomic_t *v) #endif #ifndef atomic_fetch_inc -static inline int +static __always_inline int atomic_fetch_inc(atomic_t *v) { int ret; @@ -433,7 +435,7 @@ atomic_fetch_inc(atomic_t *v) #endif /* atomic_fetch_inc_relaxed */ #ifndef atomic_dec -static inline void +static __always_inline void atomic_dec(atomic_t *v) { atomic_sub(1, v); @@ -449,7 +451,7 @@ atomic_dec(atomic_t *v) #endif /* atomic_dec_return */ #ifndef atomic_dec_return -static inline int +static __always_inline int atomic_dec_return(atomic_t *v) { return atomic_sub_return(1, v); @@ -458,7 +460,7 @@ atomic_dec_return(atomic_t *v) #endif #ifndef atomic_dec_return_acquire -static inline int +static __always_inline int atomic_dec_return_acquire(atomic_t *v) { return atomic_sub_return_acquire(1, v); @@ -467,7 +469,7 @@ atomic_dec_return_acquire(atomic_t *v) #endif #ifndef atomic_dec_return_release -static inline int +static __always_inline int atomic_dec_return_release(atomic_t *v) { return atomic_sub_return_release(1, v); @@ -476,7 +478,7 @@ atomic_dec_return_release(atomic_t *v) #endif #ifndef atomic_dec_return_relaxed -static inline int +static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { return atomic_sub_return_relaxed(1, v); @@ -487,7 +489,7 @@ atomic_dec_return_relaxed(atomic_t *v) #else /* atomic_dec_return_relaxed */ #ifndef atomic_dec_return_acquire -static inline int +static __always_inline int atomic_dec_return_acquire(atomic_t *v) { int ret = atomic_dec_return_relaxed(v); @@ -498,7 +500,7 @@ atomic_dec_return_acquire(atomic_t *v) #endif #ifndef atomic_dec_return_release -static inline int +static __always_inline int atomic_dec_return_release(atomic_t *v) { __atomic_release_fence(); @@ -508,7 +510,7 @@ atomic_dec_return_release(atomic_t *v) #endif #ifndef atomic_dec_return -static inline int +static __always_inline int atomic_dec_return(atomic_t *v) { int ret; @@ -530,7 +532,7 @@ atomic_dec_return(atomic_t *v) #endif /* atomic_fetch_dec */ #ifndef atomic_fetch_dec -static inline int +static __always_inline int atomic_fetch_dec(atomic_t *v) { return atomic_fetch_sub(1, v); @@ -539,7 +541,7 @@ atomic_fetch_dec(atomic_t *v) #endif #ifndef atomic_fetch_dec_acquire -static inline int +static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { return atomic_fetch_sub_acquire(1, v); @@ -548,7 +550,7 @@ atomic_fetch_dec_acquire(atomic_t *v) #endif #ifndef atomic_fetch_dec_release -static inline int +static __always_inline int atomic_fetch_dec_release(atomic_t *v) { return atomic_fetch_sub_release(1, v); @@ -557,7 +559,7 @@ atomic_fetch_dec_release(atomic_t *v) #endif #ifndef atomic_fetch_dec_relaxed -static inline int +static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { return atomic_fetch_sub_relaxed(1, v); @@ -568,7 +570,7 @@ atomic_fetch_dec_relaxed(atomic_t *v) #else /* atomic_fetch_dec_relaxed */ #ifndef atomic_fetch_dec_acquire -static inline int +static __always_inline int atomic_fetch_dec_acquire(atomic_t 
*v) { int ret = atomic_fetch_dec_relaxed(v); @@ -579,7 +581,7 @@ atomic_fetch_dec_acquire(atomic_t *v) #endif #ifndef atomic_fetch_dec_release -static inline int +static __always_inline int atomic_fetch_dec_release(atomic_t *v) { __atomic_release_fence(); @@ -589,7 +591,7 @@ atomic_fetch_dec_release(atomic_t *v) #endif #ifndef atomic_fetch_dec -static inline int +static __always_inline int atomic_fetch_dec(atomic_t *v) { int ret; @@ -610,7 +612,7 @@ atomic_fetch_dec(atomic_t *v) #else /* atomic_fetch_and_relaxed */ #ifndef atomic_fetch_and_acquire -static inline int +static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { int ret = atomic_fetch_and_relaxed(i, v); @@ -621,7 +623,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_and_release -static inline int +static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -631,7 +633,7 @@ atomic_fetch_and_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_and -static inline int +static __always_inline int atomic_fetch_and(int i, atomic_t *v) { int ret; @@ -646,7 +648,7 @@ atomic_fetch_and(int i, atomic_t *v) #endif /* atomic_fetch_and_relaxed */ #ifndef atomic_andnot -static inline void +static __always_inline void atomic_andnot(int i, atomic_t *v) { atomic_and(~i, v); @@ -662,7 +664,7 @@ atomic_andnot(int i, atomic_t *v) #endif /* atomic_fetch_andnot */ #ifndef atomic_fetch_andnot -static inline int +static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { return atomic_fetch_and(~i, v); @@ -671,7 +673,7 @@ atomic_fetch_andnot(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_acquire -static inline int +static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { return atomic_fetch_and_acquire(~i, v); @@ -680,7 +682,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_release -static inline int +static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { return atomic_fetch_and_release(~i, v); @@ -689,7 +691,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_relaxed -static inline int +static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { return atomic_fetch_and_relaxed(~i, v); @@ -700,7 +702,7 @@ atomic_fetch_andnot_relaxed(int i, atomic_t *v) #else /* atomic_fetch_andnot_relaxed */ #ifndef atomic_fetch_andnot_acquire -static inline int +static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { int ret = atomic_fetch_andnot_relaxed(i, v); @@ -711,7 +713,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_release -static inline int +static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -721,7 +723,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot -static inline int +static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { int ret; @@ -742,7 +744,7 @@ atomic_fetch_andnot(int i, atomic_t *v) #else /* atomic_fetch_or_relaxed */ #ifndef atomic_fetch_or_acquire -static inline int +static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { int ret = atomic_fetch_or_relaxed(i, v); @@ -753,7 +755,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_or_release -static inline int +static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -763,7 +765,7 @@ atomic_fetch_or_release(int 
i, atomic_t *v) #endif #ifndef atomic_fetch_or -static inline int +static __always_inline int atomic_fetch_or(int i, atomic_t *v) { int ret; @@ -784,7 +786,7 @@ atomic_fetch_or(int i, atomic_t *v) #else /* atomic_fetch_xor_relaxed */ #ifndef atomic_fetch_xor_acquire -static inline int +static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { int ret = atomic_fetch_xor_relaxed(i, v); @@ -795,7 +797,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_xor_release -static inline int +static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -805,7 +807,7 @@ atomic_fetch_xor_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_xor -static inline int +static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { int ret; @@ -826,7 +828,7 @@ atomic_fetch_xor(int i, atomic_t *v) #else /* atomic_xchg_relaxed */ #ifndef atomic_xchg_acquire -static inline int +static __always_inline int atomic_xchg_acquire(atomic_t *v, int i) { int ret = atomic_xchg_relaxed(v, i); @@ -837,7 +839,7 @@ atomic_xchg_acquire(atomic_t *v, int i) #endif #ifndef atomic_xchg_release -static inline int +static __always_inline int atomic_xchg_release(atomic_t *v, int i) { __atomic_release_fence(); @@ -847,7 +849,7 @@ atomic_xchg_release(atomic_t *v, int i) #endif #ifndef atomic_xchg -static inline int +static __always_inline int atomic_xchg(atomic_t *v, int i) { int ret; @@ -868,7 +870,7 @@ atomic_xchg(atomic_t *v, int i) #else /* atomic_cmpxchg_relaxed */ #ifndef atomic_cmpxchg_acquire -static inline int +static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { int ret = atomic_cmpxchg_relaxed(v, old, new); @@ -879,7 +881,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) #endif #ifndef atomic_cmpxchg_release -static inline int +static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { __atomic_release_fence(); @@ -889,7 +891,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new) #endif #ifndef atomic_cmpxchg -static inline int +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { int ret; @@ -911,7 +913,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new) #endif /* atomic_try_cmpxchg */ #ifndef atomic_try_cmpxchg -static inline bool +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { int r, o = *old; @@ -924,7 +926,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { int r, o = *old; @@ -937,7 +939,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_release -static inline bool +static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { int r, o = *old; @@ -950,7 +952,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_relaxed -static inline bool +static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { int r, o = *old; @@ -965,7 +967,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) #else /* atomic_try_cmpxchg_relaxed */ #ifndef atomic_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { bool ret = atomic_try_cmpxchg_relaxed(v, old, new); @@ -976,7 +978,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) #endif 
#ifndef atomic_try_cmpxchg_release -static inline bool +static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { __atomic_release_fence(); @@ -986,7 +988,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg -static inline bool +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { bool ret; @@ -1010,7 +1012,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) * true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { return atomic_sub_return(i, v) == 0; @@ -1027,7 +1029,7 @@ atomic_sub_and_test(int i, atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline bool +static __always_inline bool atomic_dec_and_test(atomic_t *v) { return atomic_dec_return(v) == 0; @@ -1044,7 +1046,7 @@ atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic_inc_and_test(atomic_t *v) { return atomic_inc_return(v) == 0; @@ -1062,7 +1064,7 @@ atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool +static __always_inline bool atomic_add_negative(int i, atomic_t *v) { return atomic_add_return(i, v) < 0; @@ -1080,7 +1082,7 @@ atomic_add_negative(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ -static inline int +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); @@ -1105,7 +1107,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ -static inline bool +static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { return atomic_fetch_add_unless(v, a, u) != u; @@ -1121,7 +1123,7 @@ atomic_add_unless(atomic_t *v, int a, int u) * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. 
*/ -static inline bool +static __always_inline bool atomic_inc_not_zero(atomic_t *v) { return atomic_add_unless(v, 1, 0); @@ -1130,7 +1132,7 @@ atomic_inc_not_zero(atomic_t *v) #endif #ifndef atomic_inc_unless_negative -static inline bool +static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { int c = atomic_read(v); @@ -1146,7 +1148,7 @@ atomic_inc_unless_negative(atomic_t *v) #endif #ifndef atomic_dec_unless_positive -static inline bool +static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { int c = atomic_read(v); @@ -1162,7 +1164,7 @@ atomic_dec_unless_positive(atomic_t *v) #endif #ifndef atomic_dec_if_positive -static inline int +static __always_inline int atomic_dec_if_positive(atomic_t *v) { int dec, c = atomic_read(v); @@ -1186,7 +1188,7 @@ atomic_dec_if_positive(atomic_t *v) #endif #ifndef atomic64_read_acquire -static inline s64 +static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { return smp_load_acquire(&(v)->counter); @@ -1195,7 +1197,7 @@ atomic64_read_acquire(const atomic64_t *v) #endif #ifndef atomic64_set_release -static inline void +static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { smp_store_release(&(v)->counter, i); @@ -1210,7 +1212,7 @@ atomic64_set_release(atomic64_t *v, s64 i) #else /* atomic64_add_return_relaxed */ #ifndef atomic64_add_return_acquire -static inline s64 +static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_add_return_relaxed(i, v); @@ -1221,7 +1223,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_add_return_release -static inline s64 +static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1231,7 +1233,7 @@ atomic64_add_return_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_add_return -static inline s64 +static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { s64 ret; @@ -1252,7 +1254,7 @@ atomic64_add_return(s64 i, atomic64_t *v) #else /* atomic64_fetch_add_relaxed */ #ifndef atomic64_fetch_add_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_add_relaxed(i, v); @@ -1263,7 +1265,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_add_release -static inline s64 +static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1273,7 +1275,7 @@ atomic64_fetch_add_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_add -static inline s64 +static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { s64 ret; @@ -1294,7 +1296,7 @@ atomic64_fetch_add(s64 i, atomic64_t *v) #else /* atomic64_sub_return_relaxed */ #ifndef atomic64_sub_return_acquire -static inline s64 +static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_sub_return_relaxed(i, v); @@ -1305,7 +1307,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_sub_return_release -static inline s64 +static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1315,7 +1317,7 @@ atomic64_sub_return_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_sub_return -static inline s64 +static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { s64 ret; @@ -1336,7 +1338,7 @@ atomic64_sub_return(s64 i, atomic64_t *v) #else /* atomic64_fetch_sub_relaxed */ #ifndef atomic64_fetch_sub_acquire -static 
inline s64 +static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_sub_relaxed(i, v); @@ -1347,7 +1349,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_sub_release -static inline s64 +static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1357,7 +1359,7 @@ atomic64_fetch_sub_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_sub -static inline s64 +static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { s64 ret; @@ -1372,7 +1374,7 @@ atomic64_fetch_sub(s64 i, atomic64_t *v) #endif /* atomic64_fetch_sub_relaxed */ #ifndef atomic64_inc -static inline void +static __always_inline void atomic64_inc(atomic64_t *v) { atomic64_add(1, v); @@ -1388,7 +1390,7 @@ atomic64_inc(atomic64_t *v) #endif /* atomic64_inc_return */ #ifndef atomic64_inc_return -static inline s64 +static __always_inline s64 atomic64_inc_return(atomic64_t *v) { return atomic64_add_return(1, v); @@ -1397,7 +1399,7 @@ atomic64_inc_return(atomic64_t *v) #endif #ifndef atomic64_inc_return_acquire -static inline s64 +static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { return atomic64_add_return_acquire(1, v); @@ -1406,7 +1408,7 @@ atomic64_inc_return_acquire(atomic64_t *v) #endif #ifndef atomic64_inc_return_release -static inline s64 +static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { return atomic64_add_return_release(1, v); @@ -1415,7 +1417,7 @@ atomic64_inc_return_release(atomic64_t *v) #endif #ifndef atomic64_inc_return_relaxed -static inline s64 +static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { return atomic64_add_return_relaxed(1, v); @@ -1426,7 +1428,7 @@ atomic64_inc_return_relaxed(atomic64_t *v) #else /* atomic64_inc_return_relaxed */ #ifndef atomic64_inc_return_acquire -static inline s64 +static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { s64 ret = atomic64_inc_return_relaxed(v); @@ -1437,7 +1439,7 @@ atomic64_inc_return_acquire(atomic64_t *v) #endif #ifndef atomic64_inc_return_release -static inline s64 +static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { __atomic_release_fence(); @@ -1447,7 +1449,7 @@ atomic64_inc_return_release(atomic64_t *v) #endif #ifndef atomic64_inc_return -static inline s64 +static __always_inline s64 atomic64_inc_return(atomic64_t *v) { s64 ret; @@ -1469,7 +1471,7 @@ atomic64_inc_return(atomic64_t *v) #endif /* atomic64_fetch_inc */ #ifndef atomic64_fetch_inc -static inline s64 +static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { return atomic64_fetch_add(1, v); @@ -1478,7 +1480,7 @@ atomic64_fetch_inc(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { return atomic64_fetch_add_acquire(1, v); @@ -1487,7 +1489,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_release -static inline s64 +static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { return atomic64_fetch_add_release(1, v); @@ -1496,7 +1498,7 @@ atomic64_fetch_inc_release(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { return atomic64_fetch_add_relaxed(1, v); @@ -1507,7 +1509,7 @@ atomic64_fetch_inc_relaxed(atomic64_t *v) #else /* atomic64_fetch_inc_relaxed */ #ifndef atomic64_fetch_inc_acquire -static inline s64 +static 
__always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { s64 ret = atomic64_fetch_inc_relaxed(v); @@ -1518,7 +1520,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_release -static inline s64 +static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { __atomic_release_fence(); @@ -1528,7 +1530,7 @@ atomic64_fetch_inc_release(atomic64_t *v) #endif #ifndef atomic64_fetch_inc -static inline s64 +static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { s64 ret; @@ -1543,7 +1545,7 @@ atomic64_fetch_inc(atomic64_t *v) #endif /* atomic64_fetch_inc_relaxed */ #ifndef atomic64_dec -static inline void +static __always_inline void atomic64_dec(atomic64_t *v) { atomic64_sub(1, v); @@ -1559,7 +1561,7 @@ atomic64_dec(atomic64_t *v) #endif /* atomic64_dec_return */ #ifndef atomic64_dec_return -static inline s64 +static __always_inline s64 atomic64_dec_return(atomic64_t *v) { return atomic64_sub_return(1, v); @@ -1568,7 +1570,7 @@ atomic64_dec_return(atomic64_t *v) #endif #ifndef atomic64_dec_return_acquire -static inline s64 +static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { return atomic64_sub_return_acquire(1, v); @@ -1577,7 +1579,7 @@ atomic64_dec_return_acquire(atomic64_t *v) #endif #ifndef atomic64_dec_return_release -static inline s64 +static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { return atomic64_sub_return_release(1, v); @@ -1586,7 +1588,7 @@ atomic64_dec_return_release(atomic64_t *v) #endif #ifndef atomic64_dec_return_relaxed -static inline s64 +static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { return atomic64_sub_return_relaxed(1, v); @@ -1597,7 +1599,7 @@ atomic64_dec_return_relaxed(atomic64_t *v) #else /* atomic64_dec_return_relaxed */ #ifndef atomic64_dec_return_acquire -static inline s64 +static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { s64 ret = atomic64_dec_return_relaxed(v); @@ -1608,7 +1610,7 @@ atomic64_dec_return_acquire(atomic64_t *v) #endif #ifndef atomic64_dec_return_release -static inline s64 +static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { __atomic_release_fence(); @@ -1618,7 +1620,7 @@ atomic64_dec_return_release(atomic64_t *v) #endif #ifndef atomic64_dec_return -static inline s64 +static __always_inline s64 atomic64_dec_return(atomic64_t *v) { s64 ret; @@ -1640,7 +1642,7 @@ atomic64_dec_return(atomic64_t *v) #endif /* atomic64_fetch_dec */ #ifndef atomic64_fetch_dec -static inline s64 +static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { return atomic64_fetch_sub(1, v); @@ -1649,7 +1651,7 @@ atomic64_fetch_dec(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { return atomic64_fetch_sub_acquire(1, v); @@ -1658,7 +1660,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_release -static inline s64 +static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { return atomic64_fetch_sub_release(1, v); @@ -1667,7 +1669,7 @@ atomic64_fetch_dec_release(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { return atomic64_fetch_sub_relaxed(1, v); @@ -1678,7 +1680,7 @@ atomic64_fetch_dec_relaxed(atomic64_t *v) #else /* atomic64_fetch_dec_relaxed */ #ifndef atomic64_fetch_dec_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { s64 ret = 
atomic64_fetch_dec_relaxed(v); @@ -1689,7 +1691,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_release -static inline s64 +static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { __atomic_release_fence(); @@ -1699,7 +1701,7 @@ atomic64_fetch_dec_release(atomic64_t *v) #endif #ifndef atomic64_fetch_dec -static inline s64 +static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { s64 ret; @@ -1720,7 +1722,7 @@ atomic64_fetch_dec(atomic64_t *v) #else /* atomic64_fetch_and_relaxed */ #ifndef atomic64_fetch_and_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_and_relaxed(i, v); @@ -1731,7 +1733,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_and_release -static inline s64 +static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1741,7 +1743,7 @@ atomic64_fetch_and_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_and -static inline s64 +static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { s64 ret; @@ -1756,7 +1758,7 @@ atomic64_fetch_and(s64 i, atomic64_t *v) #endif /* atomic64_fetch_and_relaxed */ #ifndef atomic64_andnot -static inline void +static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { atomic64_and(~i, v); @@ -1772,7 +1774,7 @@ atomic64_andnot(s64 i, atomic64_t *v) #endif /* atomic64_fetch_andnot */ #ifndef atomic64_fetch_andnot -static inline s64 +static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { return atomic64_fetch_and(~i, v); @@ -1781,7 +1783,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { return atomic64_fetch_and_acquire(~i, v); @@ -1790,7 +1792,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_release -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { return atomic64_fetch_and_release(~i, v); @@ -1799,7 +1801,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { return atomic64_fetch_and_relaxed(~i, v); @@ -1810,7 +1812,7 @@ atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) #else /* atomic64_fetch_andnot_relaxed */ #ifndef atomic64_fetch_andnot_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_andnot_relaxed(i, v); @@ -1821,7 +1823,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_release -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1831,7 +1833,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot -static inline s64 +static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { s64 ret; @@ -1852,7 +1854,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #else /* atomic64_fetch_or_relaxed */ #ifndef atomic64_fetch_or_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_or_relaxed(i, v); @@ -1863,7 +1865,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v) #endif #ifndef 
atomic64_fetch_or_release -static inline s64 +static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1873,7 +1875,7 @@ atomic64_fetch_or_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_or -static inline s64 +static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { s64 ret; @@ -1894,7 +1896,7 @@ atomic64_fetch_or(s64 i, atomic64_t *v) #else /* atomic64_fetch_xor_relaxed */ #ifndef atomic64_fetch_xor_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_xor_relaxed(i, v); @@ -1905,7 +1907,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_xor_release -static inline s64 +static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1915,7 +1917,7 @@ atomic64_fetch_xor_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_xor -static inline s64 +static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 ret; @@ -1936,7 +1938,7 @@ atomic64_fetch_xor(s64 i, atomic64_t *v) #else /* atomic64_xchg_relaxed */ #ifndef atomic64_xchg_acquire -static inline s64 +static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 i) { s64 ret = atomic64_xchg_relaxed(v, i); @@ -1947,7 +1949,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i) #endif #ifndef atomic64_xchg_release -static inline s64 +static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { __atomic_release_fence(); @@ -1957,7 +1959,7 @@ atomic64_xchg_release(atomic64_t *v, s64 i) #endif #ifndef atomic64_xchg -static inline s64 +static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { s64 ret; @@ -1978,7 +1980,7 @@ atomic64_xchg(atomic64_t *v, s64 i) #else /* atomic64_cmpxchg_relaxed */ #ifndef atomic64_cmpxchg_acquire -static inline s64 +static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { s64 ret = atomic64_cmpxchg_relaxed(v, old, new); @@ -1989,7 +1991,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) #endif #ifndef atomic64_cmpxchg_release -static inline s64 +static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { __atomic_release_fence(); @@ -1999,7 +2001,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) #endif #ifndef atomic64_cmpxchg -static inline s64 +static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { s64 ret; @@ -2021,7 +2023,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) #endif /* atomic64_try_cmpxchg */ #ifndef atomic64_try_cmpxchg -static inline bool +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2034,7 +2036,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2047,7 +2049,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_release -static inline bool +static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2060,7 +2062,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_relaxed -static inline bool +static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2075,7 +2077,7 @@ 
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) #else /* atomic64_try_cmpxchg_relaxed */ #ifndef atomic64_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { bool ret = atomic64_try_cmpxchg_relaxed(v, old, new); @@ -2086,7 +2088,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_release -static inline bool +static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { __atomic_release_fence(); @@ -2096,7 +2098,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg -static inline bool +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { bool ret; @@ -2120,7 +2122,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) * true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { return atomic64_sub_return(i, v) == 0; @@ -2137,7 +2139,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline bool +static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { return atomic64_dec_return(v) == 0; @@ -2154,7 +2156,7 @@ atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { return atomic64_inc_return(v) == 0; @@ -2172,7 +2174,7 @@ atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool +static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { return atomic64_add_return(i, v) < 0; @@ -2190,7 +2192,7 @@ atomic64_add_negative(s64 i, atomic64_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ -static inline s64 +static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { s64 c = atomic64_read(v); @@ -2215,7 +2217,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ -static inline bool +static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { return atomic64_fetch_add_unless(v, a, u) != u; @@ -2231,7 +2233,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. 
*/ -static inline bool +static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { return atomic64_add_unless(v, 1, 0); @@ -2240,7 +2242,7 @@ atomic64_inc_not_zero(atomic64_t *v) #endif #ifndef atomic64_inc_unless_negative -static inline bool +static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { s64 c = atomic64_read(v); @@ -2256,7 +2258,7 @@ atomic64_inc_unless_negative(atomic64_t *v) #endif #ifndef atomic64_dec_unless_positive -static inline bool +static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { s64 c = atomic64_read(v); @@ -2272,7 +2274,7 @@ atomic64_dec_unless_positive(atomic64_t *v) #endif #ifndef atomic64_dec_if_positive -static inline s64 +static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { s64 dec, c = atomic64_read(v); @@ -2292,4 +2294,4 @@ atomic64_dec_if_positive(atomic64_t *v) #define atomic64_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c)) #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 25de4a2804d70f57e994fe3b419148658bb5378a +// baaf45f4c24ed88ceae58baca39d7fd80bb8101b diff --git a/scripts/atomic/fallbacks/acquire b/scripts/atomic/fallbacks/acquire index e38871e64db6..ea489acc285e 100755 --- a/scripts/atomic/fallbacks/acquire +++ b/scripts/atomic/fallbacks/acquire @@ -1,5 +1,5 @@ cat <counter); diff --git a/scripts/atomic/fallbacks/release b/scripts/atomic/fallbacks/release index 3f628a3802d9..730d2a6d3e07 100755 --- a/scripts/atomic/fallbacks/release +++ b/scripts/atomic/fallbacks/release @@ -1,5 +1,5 @@ cat <counter, i); diff --git a/scripts/atomic/fallbacks/sub_and_test b/scripts/atomic/fallbacks/sub_and_test index 289ef17a2d7a..6cfe4ed49746 100755 --- a/scripts/atomic/fallbacks/sub_and_test +++ b/scripts/atomic/fallbacks/sub_and_test @@ -8,7 +8,7 @@ cat < + EOF for xchg in "xchg" "cmpxchg" "cmpxchg64"; do -- cgit v1.2.1 From 7161177481d521e725a7bc6c9308ac2968fee038 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 12 Dec 2019 01:07:08 +0100 Subject: kcsan: Document static blacklisting options Updates the section on "Selective analysis", listing all available options to blacklist reporting data races for: specific accesses, functions, compilation units, and entire directories. These options should provide adequate control for maintainers to opt out of KCSAN analysis at varying levels of granularity. It is hoped to provide the required control to reflect preferences for handling data races across the kernel. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index a6f4f92df2fa..65a0be513b7d 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -101,18 +101,28 @@ instrumentation or e.g. DMA accesses. Selective analysis ~~~~~~~~~~~~~~~~~~ -To disable KCSAN data race detection for an entire subsystem, add to the -respective ``Makefile``:: +It may be desirable to disable data race detection for specific accesses, +functions, compilation units, or entire subsystems. For static blacklisting, +the below options are available: - KCSAN_SANITIZE := n +* KCSAN understands the ``data_race(expr)`` annotation, which tells KCSAN that + any data races due to accesses in ``expr`` should be ignored and resulting + behaviour when encountering a data race is deemed safe. 
+ +* Disabling data race detection for entire functions can be accomplished by + using the function attribute ``__no_kcsan`` (or ``__no_kcsan_or_inline`` for + ``__always_inline`` functions). To dynamically control for which functions + data races are reported, see the `debugfs`_ blacklist/whitelist feature. -To disable KCSAN on a per-file basis, add to the ``Makefile``:: +* To disable data race detection for a particular compilation unit, add to the + ``Makefile``:: KCSAN_SANITIZE_file.o := n -KCSAN also understands the ``data_race(expr)`` annotation, which tells KCSAN -that any data races due to accesses in ``expr`` should be ignored and resulting -behaviour when encountering a data race is deemed safe. +* To disable data race detection for all compilation units listed in a + ``Makefile``, add to the respective ``Makefile``:: + + KCSAN_SANITIZE := n debugfs ~~~~~~~ -- cgit v1.2.1 From e33f9a169747880a008dd5e7b934fc592e91cd63 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 12 Dec 2019 01:07:09 +0100 Subject: kcsan: Add __no_kcsan function attribute Since the use of -fsanitize=thread is an implementation detail of KCSAN, the name __no_sanitize_thread could be misleading if used widely. Instead, we introduce the __no_kcsan attribute which is shorter and more accurate in the context of KCSAN. This matches the attribute name __no_kcsan_or_inline. The use of __kcsan_or_inline itself is still required for __always_inline functions to retain compatibility with older compilers. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/compiler-gcc.h | 3 +-- include/linux/compiler.h | 7 +++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0eb2a1cc411d..cf294faec2f8 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -146,8 +146,7 @@ #endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread \ - __attribute__((__noinline__)) __attribute__((no_sanitize_thread)) +#define __no_sanitize_thread __attribute__((no_sanitize_thread)) #else #define __no_sanitize_thread #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ad8c76144a3c..8c0beb10c1dd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -207,12 +207,15 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define __no_kasan_or_inline __always_inline #endif +#define __no_kcsan __no_sanitize_thread #ifdef __SANITIZE_THREAD__ /* * Rely on __SANITIZE_THREAD__ instead of CONFIG_KCSAN, to avoid not inlining in - * compilation units where instrumentation is disabled. + * compilation units where instrumentation is disabled. The attribute 'noinline' + * is required for older compilers, where implicit inlining of very small + * functions renders __no_sanitize_thread ineffective. */ -# define __no_kcsan_or_inline __no_sanitize_thread notrace __maybe_unused +# define __no_kcsan_or_inline __no_kcsan noinline notrace __maybe_unused # define __no_sanitize_or_inline __no_kcsan_or_inline #else # define __no_kcsan_or_inline __always_inline -- cgit v1.2.1 From 860c8802ace14c646864795e057349c9fb2d60ad Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 9 Nov 2019 09:42:13 -0800 Subject: rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls Eric Dumazet supplied a KCSAN report of a bug that forces use of hlist_unhashed_lockless() from sk_unhashed(): ------------------------------------------------------------------------ BUG: KCSAN: data-race in inet_unhash / inet_unhash write to 0xffff8880a69a0170 of 8 bytes by interrupt on cpu 1: __hlist_nulls_del include/linux/list_nulls.h:88 [inline] hlist_nulls_del_init_rcu include/linux/rculist_nulls.h:36 [inline] __sk_nulls_del_node_init_rcu include/net/sock.h:676 [inline] inet_unhash+0x38f/0x4a0 net/ipv4/inet_hashtables.c:612 tcp_set_state+0xfa/0x3e0 net/ipv4/tcp.c:2249 tcp_done+0x93/0x1e0 net/ipv4/tcp.c:3854 tcp_write_err+0x7e/0xc0 net/ipv4/tcp_timer.c:56 tcp_retransmit_timer+0x9b8/0x16d0 net/ipv4/tcp_timer.c:479 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:599 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:619 call_timer_fn+0x5f/0x2f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0xc0c/0xcd0 kernel/time/timer.c:1786 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 read to 0xffff8880a69a0170 of 8 bytes by interrupt on cpu 0: sk_unhashed include/net/sock.h:607 [inline] inet_unhash+0x3d/0x4a0 net/ipv4/inet_hashtables.c:592 tcp_set_state+0xfa/0x3e0 net/ipv4/tcp.c:2249 tcp_done+0x93/0x1e0 net/ipv4/tcp.c:3854 tcp_write_err+0x7e/0xc0 net/ipv4/tcp_timer.c:56 tcp_retransmit_timer+0x9b8/0x16d0 net/ipv4/tcp_timer.c:479 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:599 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:619 call_timer_fn+0x5f/0x2f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0xc0c/0xcd0 kernel/time/timer.c:1786 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 
arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 ------------------------------------------------------------------------ This commit therefore replaces C-language assignments with WRITE_ONCE() in include/linux/list_nulls.h and include/linux/rculist_nulls.h. Reported-by: Eric Dumazet # For KCSAN Signed-off-by: Paul E. McKenney --- include/linux/list_nulls.h | 8 ++++---- include/linux/rculist_nulls.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 3ef96743db8d..1ecd35664e0d 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -72,10 +72,10 @@ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) @@ -85,13 +85,13 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) - next->pprev = pprev; + WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index bc8206a8f30e..517a06f36c7a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -34,7 +34,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); - n->pprev = NULL; + WRITE_ONCE(n->pprev, NULL); } } @@ -66,7 +66,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** @@ -94,10 +94,10 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } /** -- cgit v1.2.1 From 46deb7449d99f37bebf5cbd7f95c136c6fafeaa5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 9 Nov 2019 10:35:13 -0800 Subject: rcu: Add and update docbook header comments in list.h [ paulmck: Fix typo found by kbuild test robot. ] Signed-off-by: Paul E. McKenney --- include/linux/list.h | 112 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 17 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 61f5aaf96192..4f3b7f71bdfd 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -23,6 +23,13 @@ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) +/** + * INIT_LIST_HEAD - Initialize a list_head structure + * @list: list_head structure to be initialized. + * + * Initializes the list_head to point to itself. If it is a list header, + * the result is an empty list. 
+ */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); @@ -120,12 +127,6 @@ static inline void __list_del_clearprev(struct list_head *entry) entry->prev = NULL; } -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty() on entry does not return true after this, the entry is - * in an undefined state. - */ static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) @@ -134,6 +135,12 @@ static inline void __list_del_entry(struct list_head *entry) __list_del(entry->prev, entry->next); } +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); @@ -157,8 +164,15 @@ static inline void list_replace(struct list_head *old, new->prev->next = new; } +/** + * list_replace_init - replace old entry by new one and initialize the old one + * @old : the element to be replaced + * @new : the new element to insert + * + * If @old was empty, it will be overwritten. + */ static inline void list_replace_init(struct list_head *old, - struct list_head *new) + struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); @@ -744,21 +758,36 @@ static inline void INIT_HLIST_NODE(struct hlist_node *h) h->pprev = NULL; } +/** + * hlist_unhashed - Has node been removed from list and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed + * state. For example, hlist_nulls_del_init_rcu() does leave the + * node in unhashed state, but hlist_nulls_del() does not. + */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } -/* This variant of hlist_unhashed() must be used in lockless contexts - * to avoid potential load-tearing. - * The READ_ONCE() is paired with the various WRITE_ONCE() in hlist - * helpers that are defined below. +/** + * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use + * @h: Node to be checked + * + * This variant of hlist_unhashed() must be used in lockless contexts + * to avoid potential load-tearing. The READ_ONCE() is paired with the + * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } +/** + * hlist_empty - Is the specified hlist_head structure an empty hlist? + * @h: Structure to check. + */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); @@ -774,6 +803,13 @@ static inline void __hlist_del(struct hlist_node *n) WRITE_ONCE(next->pprev, pprev); } +/** + * hlist_del - Delete the specified hlist_node from its list + * @n: Node to delete. + * + * Note that this function leaves the node in hashed state. Use + * hlist_del_init() or similar instead to unhash @n. + */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); @@ -781,6 +817,12 @@ static inline void hlist_del(struct hlist_node *n) n->pprev = LIST_POISON2; } +/** + * hlist_del_init - Delete the specified hlist_node from its list and initialize + * @n: Node to delete. + * + * Note that this function leaves the node in unhashed state. 
+ */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { @@ -789,6 +831,14 @@ static inline void hlist_del_init(struct hlist_node *n) } } +/** + * hlist_add_head - add a new entry at the beginning of the hlist + * @n: new entry to be added + * @h: hlist head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; @@ -799,9 +849,13 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) WRITE_ONCE(n->pprev, &h->first); } -/* next must be != NULL */ +/** + * hlist_add_before - add a new entry before the one specified + * @n: new entry to be added + * @next: hlist node to add it before, which must be non-NULL + */ static inline void hlist_add_before(struct hlist_node *n, - struct hlist_node *next) + struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); @@ -809,6 +863,11 @@ static inline void hlist_add_before(struct hlist_node *n, WRITE_ONCE(*(n->pprev), n); } +/** + * hlist_add_behing - add a new entry after the one specified + * @n: new entry to be added + * @prev: hlist node to add it after, which must be non-NULL + */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { @@ -820,20 +879,35 @@ static inline void hlist_add_behind(struct hlist_node *n, WRITE_ONCE(n->next->pprev, &n->next); } -/* after that we'll appear to be on some hlist and hlist_del will work */ +/** + * hlist_add_fake - create a fake hlist consisting of a single headless node + * @n: Node to make a fake list out of + * + * This makes @n appear to be its own predecessor on a headless hlist. + * The point of this is to allow things like hlist_del() to work correctly + * in cases where there is no list. + */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } +/** + * hlist_fake: Is this node a fake hlist? + * @h: Node to check for being a self-referential fake hlist. + */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } -/* +/** + * hlist_is_singular_node - is node the only element of the specified hlist? + * @n: Node to check for singularity. + * @h: Header for potentially singular list. + * * Check whether the node is the only node of the head without - * accessing head: + * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) @@ -841,7 +915,11 @@ hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) return !n->next && n->pprev == &h->first; } -/* +/** + * hlist_move_list - Move an hlist + * @old: hlist_head for old list. + * @new: hlist_head for new list. + * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ -- cgit v1.2.1 From 02b99b38f3d96c77cf0a368d99952aa372dfe58a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 9 Nov 2019 10:45:47 -0800 Subject: rcu: Add a hlist_nulls_unhashed_lockless() function This commit adds an hlist_nulls_unhashed_lockless() to allow lockless checking for whether or note an hlist_nulls_node is hashed or not. While in the area, this commit also adds a docbook comment to the existing hlist_nulls_unhashed() function. Signed-off-by: Paul E. 
McKenney --- include/linux/list_nulls.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 1ecd35664e0d..fa6e8471bd22 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -56,11 +56,33 @@ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) return ((unsigned long)ptr) >> 1; } +/** + * hlist_nulls_unhashed - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. + */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } +/** + * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this + * function may be used locklessly. + */ +static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) +{ + return !READ_ONCE(h->pprev); +} + static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); -- cgit v1.2.1 From 7f5d51e26a471f771b8dae1b9ef417f5fd5e9c85 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 5 Dec 2019 11:46:49 +0530 Subject: rculist_nulls: Add docbook comments This patch adds docbook comment headers for hlist_nulls_first_rcu() and hlist_nulls_next_rcu() in rculist_nulls.h. Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- include/linux/rculist_nulls.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 517a06f36c7a..25952c4f83b0 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -38,9 +38,17 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) } } +/** + * hlist_nulls_first_rcu - returns the first element of the hash list. + * @head: the head of the list. + */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) +/** + * hlist_nulls_next_rcu - returns the element of the list after @node. + * @node: element of the list. + */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) -- cgit v1.2.1 From 459b5287066f53c4b91569c070780a540de90b85 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Fri, 6 Dec 2019 00:23:52 +0530 Subject: rculist_nulls: Change docbook comment headers This patch changes the docbook comment "head for your list" to "head of the list". Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- include/linux/rculist_nulls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 25952c4f83b0..409a86bb5f25 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -112,7 +112,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. - * @head: the head for your list. + * @head: the head of the list. 
* @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], @@ -132,7 +132,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. - * @head: the head for your list. + * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ -- cgit v1.2.1 From afa47fdfa29ffd3324e7b89551d1a6e54ccc042b Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 9 Dec 2019 13:20:43 +0530 Subject: rculist.h: Add list_tail_rcu() This patch adds the macro list_tail_rcu() and documents it. Signed-off-by: Madhuparna Bhowmik [ paulmck: Reword a bit. ] Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4b7ae1bf50b3..9f313e4999fe 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -40,6 +40,16 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/** + * list_tail_rcu - returns the prev pointer of the head of the list + * @head: the head of the list + * + * Note: This should only be used with the list header, and even then + * only if list_del() and similar primitives are not also used on the + * list header. + */ +#define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) + /* * Check during list traversal that we are within an RCU reader */ -- cgit v1.2.1 From eae2797aae730ab5b478dd19d775bf679ea33c97 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 10 Dec 2019 11:16:39 +0530 Subject: nfs: Fix nfs_access_get_cached_rcu() sparse error This patch fixes the following sparse error: fs/nfs/dir.c:2353:14: error: incompatible types in comparison expression (different address spaces): fs/nfs/dir.c:2353:14: struct list_head [noderef] * fs/nfs/dir.c:2353:14: struct list_head * Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e180033e35cf..b69370b6d317 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2350,7 +2350,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; - lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || cred != cache->cred) -- cgit v1.2.1 From 5d909830028f1c961015737877e367429340d7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Nov 2019 18:54:06 -0800 Subject: rcutorture: Suppress forward-progress complaints during early boot Some larger systems can take in excess of 50 seconds to complete their early boot initcalls prior to spawing init. This does not in any way help the forward-progress judgments of built-in rcutorture (when rcutorture is built as a module, the insmod or modprobe command normally cannot happen until some time after boot completes). This commit therefore suppresses such complaints until about the time that init is spawned. 
This also includes a fix to a stupid error located by kbuild test robot. [ paulmck: Apply kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/rcutorture.c | 3 ++- kernel/rcu/tree_exp.h | 1 - kernel/rcu/update.c | 12 ++++++++++++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b2b2dc990da9..045c28b71f4f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,6 +83,7 @@ void rcu_scheduler_starting(void); static inline void rcu_scheduler_starting(void) { } #endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } +static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 2f787b9029d1..45f3f66bb04d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -54,6 +54,7 @@ void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); +bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1aeecc165b21..9ba49788cb48 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1067,7 +1067,8 @@ rcu_torture_writer(void *arg) if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && - !torture_must_stop()) + !torture_must_stop() && + rcu_inkernel_boot_has_ended()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6935a9e2b094..dcbd75791f39 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -508,7 +508,6 @@ static void synchronize_rcu_expedited_wait(void) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } } - WARN_ON_ONCE(1); } for (;;) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..feaaec5747a3 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); +static bool rcu_boot_ended __read_mostly; + /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); + rcu_boot_ended = 1; +} + +/* + * Let rcutorture know when it is OK to turn it up to eleven. + */ +bool rcu_inkernel_boot_has_ended(void) +{ + return rcu_boot_ended; } +EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended); #endif /* #ifndef CONFIG_TINY_RCU */ -- cgit v1.2.1 From 573c3c462571ed1237487c05046d0ae8d03a0511 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Dec 2019 13:24:07 -0800 Subject: torture: Make results-directory date format completion-friendly The names of the per-test results directories are of the form 2019.11.29-20:42:19. This works, but the ":" characters make tab-based shell name completion a bit onerous because the user must remember to include a quote character somewhere before the first ":". 
This commit therefore changes the ":" characters to periods, as in 2019.12.01-20.48.01", which allows tab-based completion to work more naturally. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 78d18ab8e954..2315e2ec12d6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -39,7 +39,7 @@ TORTURE_TRUST_MAKE="" resdir="" configs="" cpus=0 -ds=`date +%Y.%m.%d-%H:%M:%S` +ds=`date +%Y.%m.%d-%H.%M.%S` jitter="-1" usage () { -- cgit v1.2.1 From ed45bf002bf99b4ca4b203f42649383dee02db78 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Dec 2019 15:58:41 -0800 Subject: rcutorture: Refrain from callback flooding during boot Additional rcutorture aggression can result in, believe it or not, boot times in excess of three minutes on large hyperthreaded systems. This is long enough for rcutorture to decide to do some callback flooding, which seems a bit excessive given that userspace cannot have started until long after boot, and it is userspace that does the real-world callback flooding. Worse yet, because Tiny RCU lacks forward-progress functionality, the looping-in-the-kernel tests can also be problematic during early boot. This commit therefore causes rcutorture to hold off on callback flooding until about the time that init is spawned, and the same for looping-in-the-kernel tests for Tiny RCU. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9ba49788cb48..08fa4ef23914 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1994,8 +1994,11 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - rcu_torture_fwd_prog_cr(rfp); + if (!IS_ENABLED(CONFIG_TINY_RCU) || + rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + if (rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ -- cgit v1.2.1 From 90ca882e4455e1d40c9b7e689a1933013bd478cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 10:49:11 -0800 Subject: torture: Forgive -EBUSY from boottime CPU-hotplug operations During boot, CPU hotplug is often disabled, for example by PCI probing. On large systems that take substantial time to boot, this can result in spurious RCU_HOTPLUG errors. This commit therefore forgives any boottime -EBUSY CPU-hotplug failures by adjusting counters to pretend that the corresponding attempt never happened. A non-splat record of the failed attempt is emitted to the console with the added string "(-EBUSY forgiven during boot)". Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 7c13f5558b71..e377b5b17de8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -84,6 +84,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -99,10 +100,16 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, (*n_offl_attempts)++; ret = cpu_down(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_offl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: offline %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -137,6 +144,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -150,10 +158,16 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, (*n_onl_attempts)++; ret = cpu_up(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_onl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: online %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG -- cgit v1.2.1 From e10dcaa943dbcb60a3957aa3022a4a75d82c8b32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 11:29:01 -0800 Subject: rcutorture: Allow boottime stall warnings to be suppressed In normal production, an RCU CPU stall warning at boottime is often just as bad as at any other time. In fact, given the desire for fast boot, any sort of long-term stall at boot is a bad idea. However, heavy rcutorture testing on large hyperthreaded systems can generate boottime RCU CPU stalls as a matter of course. This commit therefore provides a kernel boot parameter that suppresses reporting of boottime RCU CPU stall warnings and similarly of rcutorture writer stalls. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_stall.h | 6 +++--- kernel/rcu/update.c | 8 +++++++- 6 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ed83d6d90cc3..dbe52d1c5670 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4150,6 +4150,12 @@ rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. + rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL] + Suppress RCU CPU stall warning messages and + rcutorture writer stall warnings that occur + during early boot, that is, during the time + before the init task is spawned. + rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. 
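The rcu.h hunk that follows composes the new boot-only switch with the existing rcupdate.rcu_cpu_stall_suppress switch. Roughly speaking (a simplified sketch using an invented helper name, not the exact kernel code), a stall report is suppressed when either switch applies:

	/* Sketch: suppress a stall report if either switch applies. */
	static bool stall_report_suppressed(void)
	{
		if (rcu_cpu_stall_suppress)	/* Unconditional suppression. */
			return true;
		/* Boot-only suppression ends once init has been spawned. */
		return rcu_cpu_stall_suppress_at_boot &&
		       !rcu_inkernel_boot_has_ended();
	}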
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..1779cbf33cd1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} + #define rcu_ftrace_dump_stall_suppress() \ do { \ if (!rcu_cpu_stall_suppress) \ @@ -218,6 +230,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08fa4ef23914..16c84ec182bd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1479,7 +1479,7 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_torture_current != NULL && !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dcbd75791f39..677446373b38 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -513,7 +513,7 @@ static void synchronize_rcu_expedited_wait(void) for (;;) { if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 55f9b84790d3..7ee8a1cc0d8b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -383,7 +383,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -452,7 +452,7 @@ static void print_cpu_stall(void) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -504,7 +504,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index feaaec5747a3..085f08a898fe 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -476,13 +476,19 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. 
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall +// warnings. Also used by rcutorture even if stall warnings are excluded. +int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); +module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); + #ifdef CONFIG_TASKS_RCU /* -- cgit v1.2.1 From 07c1da33e1dd5f97e7a9afcc2c653cd8eede732e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2019 15:53:28 -0800 Subject: rcutorture: Suppress boottime bad-sequence warnings In normal production, an excessively long wait on a grace period (synchronize_rcu(), for example) at boottime is often just as bad as at any other time. In fact, given the desire for fast boot, any sort of long wait at boot is a bad idea. However, heavy rcutorture testing on large hyperthreaded systems can generate such long waits during boot as a matter of course. This commit therefore causes the rcupdate.rcu_cpu_stall_suppress_at_boot kernel boot parameter to suppress reporting of bootime bad-sequence warning due to excessively long grace-period waits. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 16c84ec182bd..5efd9503df56 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1423,7 +1423,8 @@ rcu_torture_stats_print(void) pr_alert("%s%s ", torture_type, TORTURE_FLAG); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, - rcu_torture_current ? "ver" : "VER", + rcu_torture_current && !rcu_stall_is_suppressed_at_boot() + ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), -- cgit v1.2.1 From a36c9086449791d60ccac279e1feeef22fffbb73 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Dec 2019 15:02:59 -0800 Subject: torture: Allow disabling of boottime CPU-hotplug torture operations In theory, RCU-hotplug operations are supposed to work as soon as there is more than one CPU online. However, in practice, in normal production there is no way to make them happen until userspace is up and running. Besides which, on smaller systems, rcutorture doesn't start doing hotplug operations until 30 seconds after the start of boot, which on most systems also means the better part of 30 seconds after the end of boot. This commit therefore provides a new torture.disable_onoff_at_boot kernel boot parameter that suppresses CPU-hotplug torture operations until about the time that init is spawned. Of course, if you know of a need for boottime CPU-hotplug operations, then you should avoid passing this argument to any of the torture tests. You might also want to look at the splats linked to below. Link: https://lore.kernel.org/lkml/20191206185208.GA25636@paulmck-ThinkPad-P72/ Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/torture.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dbe52d1c5670..e6aba21d5ee6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4830,6 +4830,10 @@ topology updates sent by the hypervisor to this LPAR. + torture.disable_onoff_at_boot= [KNL] + Prevent the CPU-hotplug component of torturing + until after init has spawned. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/kernel/torture.c b/kernel/torture.c index e377b5b17de8..8683375dc0c7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -42,6 +42,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +static bool disable_onoff_at_boot; +module_param(disable_onoff_at_boot, bool, 0444); + static char *torture_type; static int verbose; @@ -229,6 +232,10 @@ torture_onoff(void *arg) VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { + if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { + schedule_timeout_interruptible(HZ / 10); + continue; + } cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, &n_offline_successes, -- cgit v1.2.1 From df0ff706772f0c0ed8b0de71ff3a30dabb9bf80a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Dec 2019 15:19:45 -0800 Subject: rcu: Clear ->core_needs_qs at GP end or self-reported QS The rcu_data structure's ->core_needs_qs field does not necessarily get cleared in a timely fashion after the corresponding CPUs' quiescent state has been reported. From a functional viewpoint, no harm done, but this can result in excessive invocation of RCU core processing, as witnessed by the kernel test robot, which saw greatly increased softirq overhead. This commit therefore restores the rcu_report_qs_rdp() function's clearing of this field, but only when running on the corresponding CPU. Cases where some other CPU reports the quiescent state (for example, on behalf of an idle CPU) are handled by setting this field appropriately within the __note_gp_changes() function's end-of-grace-period checks. This handling is carried out regardless of whether the end of a grace period actually happened, thus handling the case where a CPU goes non-idle after a quiescent state is reported on its behalf, but before the grace period ends. This fix also avoids cross-CPU updates to ->core_needs_qs, While in the area, this commit changes the __note_gp_changes() need_gp variable's name to need_qs because it is a quiescent state that is needed from the CPU in question. Fixes: ed93dfc6bc00 ("rcu: Confine ->core_needs_qs accesses to the corresponding CPU") Reported-by: kernel test robot Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..31d01f80a1f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1386,7 +1386,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; - bool need_gp; + bool need_qs; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -1400,10 +1400,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) unlikely(READ_ONCE(rdp->gpwrap))) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ + rdp->core_needs_qs = false; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { if (!offloaded) ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ + if (rdp->core_needs_qs) + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1415,9 +1418,9 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) * go looking for one. */ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); - need_gp = !!(rnp->qsmask & rdp->grpmask); - rdp->cpu_no_qs.b.norm = need_gp; - rdp->core_needs_qs = need_gp; + need_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_qs; + rdp->core_needs_qs = need_qs; zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ @@ -1987,6 +1990,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + if (rdp->cpu == smp_processor_id()) + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { -- cgit v1.2.1 From 97eaba631bc0929bcf0d283c062a28a19ffa60ba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 11:56:10 -0700 Subject: rcu: React to callback overload by aggressively seeking quiescent states In default configutions, RCU currently waits at least 100 milliseconds before asking cond_resched() and/or resched_rcu() for help seeking quiescent states to end a grace period. But 100 milliseconds can be one good long time during an RCU callback flood, for example, as can happen when user processes repeatedly open and close files in a tight loop. These 100-millisecond gaps in successive grace periods during a callback flood can result in excessive numbers of callbacks piling up, unnecessarily increasing memory footprint. This commit therefore asks cond_resched() and/or resched_rcu() for help as early as the first FQS scan when at least one of the CPUs has more than 20,000 callbacks queued, a number that can be changed using the new rcutree.qovld kernel boot parameter. An auxiliary qovld_calc variable is used to avoid acquisition of locks that have not yet been initialized. Early tests indicate that this reduces the RCU-callback memory footprint during rcutorture floods by from 50% to 4x, depending on configuration. Reported-by: Joel Fernandes (Google) Reported-by: Tejun Heo [ paulmck: Fix bug located by Qian Cai. ] Signed-off-by: Paul E. 
McKenney Tested-by: Dexuan Cui Tested-by: Qian Cai --- Documentation/admin-guide/kernel-parameters.txt | 9 +++ kernel/rcu/tree.c | 75 +++++++++++++++++++++++-- kernel/rcu/tree.h | 4 ++ kernel/rcu/tree_plugin.h | 2 + 4 files changed, 86 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e6aba21d5ee6..7168e674c914 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3935,6 +3935,15 @@ Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.qovld= [KNL] + Set threshold of queued RCU callbacks beyond which + RCU's force-quiescent-state scan will aggressively + enlist help from cond_resched() and sched IPIs to + help CPUs more quickly reach quiescent states. + Set to less than zero to make this be set based + on rcutree.qhimark at boot time and to zero to + disable more aggressive help enlistment. + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31d01f80a1f6..48fba2257748 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -410,10 +411,15 @@ static long blimit = DEFAULT_RCU_BLIMIT; static long qhimark = DEFAULT_RCU_QHIMARK; #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ static long qlowmark = DEFAULT_RCU_QLOMARK; +#define DEFAULT_RCU_QOVLD_MULT 2 +#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) +static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ +static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); +module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; @@ -1072,7 +1078,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || - time_after(jiffies, rcu_state.jiffies_resched))) { + time_after(jiffies, rcu_state.jiffies_resched) || + rcu_state.cbovld)) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); @@ -1089,8 +1096,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * So hit them over the head with the resched_cpu() hammer! 
*/ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); @@ -1704,8 +1711,9 @@ static void rcu_gp_fqs_loop(void) */ static void rcu_gp_cleanup(void) { - unsigned long gp_duration; + int cpu; bool needgp = false; + unsigned long gp_duration; unsigned long new_gp_seq; bool offloaded; struct rcu_data *rdp; @@ -1751,6 +1759,12 @@ static void rcu_gp_cleanup(void) needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ needgp = rcu_future_gp_cleanup(rnp) || needgp; + // Reset overload indication for CPUs no longer overloaded + if (rcu_is_leaf_node(rnp)) + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + check_cb_ovld_locked(rdp, rnp); + } sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2299,10 +2313,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) struct rcu_data *rdp; struct rcu_node *rnp; + rcu_state.cbovld = rcu_state.cbovldnext; + rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { @@ -2583,6 +2600,48 @@ static void rcu_leak_callback(struct rcu_head *rhp) { } +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. The caller must hold the leaf rcu_node + * structure's ->lock. + */ +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (qovld_calc <= 0) + return; // Early boot and wildcard value set. + if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); + else + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); +} + +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. No locks need be held, but the + * caller must have disabled interrupts. + * + * Note that this function ignores the possibility that there are a lot + * of callbacks all of which have already seen the end of their respective + * grace periods. This omission is due to the need for no-CBs CPUs to + * be holding ->nocb_lock to do this check, which is too heavy for a + * common-case operation. + */ +static void check_cb_ovld(struct rcu_data *rdp) +{ + struct rcu_node *const rnp = rdp->mynode; + + if (qovld_calc <= 0 || + ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == + !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) + return; // Early boot wildcard value or already set correctly. + raw_spin_lock_rcu_node(rnp); + check_cb_ovld_locked(rdp, rnp); + raw_spin_unlock_rcu_node(rnp); +} + /* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". 
It may specify @@ -2626,6 +2685,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_init(&rdp->cblist); } + check_cb_ovld(rdp); if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ @@ -3814,6 +3874,13 @@ void __init rcu_init(void) rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); srcu_init(); + + /* Fill in default value for rcutree.qovld boot parameter. */ + /* -After- the rcu_node ->lock fields are initialized! */ + if (qovld < 0) + qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; + else + qovld_calc = qovld; } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c87e4c161c2..9dc2ec021da5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,8 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + unsigned long cbovldmask; + /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ @@ -321,6 +323,8 @@ struct rcu_state { atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ + u8 cbovld; /* Callback overload now? */ + u8 cbovldnext; /* ^ ^ next time? */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c6ea81cd4189..0be8fad08daa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (qovld != DEFAULT_RCU_QOVLD) + pr_info("\tBoot-time adjustment of callback overload leval to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) -- cgit v1.2.1 From 8ec35c101e4ad2066d69fb5b7216a4b9113458b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Nov 2019 01:10:55 -0800 Subject: rcu: React to callback overload by boosting RCU readers RCU priority boosting currently is not applied until the grace period is at least 250 milliseconds old (or the number of milliseconds specified by the CONFIG_RCU_BOOST_DELAY Kconfig option). Although this has worked well, it can result in OOM under conditions of RCU callback flooding. One can argue that the real-time systems using RCU priority boosting should carefully avoid RCU callback flooding, but one can just as well argue that an OOM is a rather obnoxious error message. This commit therefore disables the RCU priority boosting delay when there are excessive numbers of callbacks queued. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0be8fad08daa..4d4637c361b7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1079,7 +1079,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { + (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.1 From c9db5eb241bb325d540dd769d0a8ca5b2ffd562b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 12 Dec 2019 17:36:43 +0000 Subject: rcu: Fix spelling mistake "leval" -> "level" This commit fixes a spelling mistake in a pr_info() message. Signed-off-by: Colin Ian King Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4d4637c361b7..0765784012f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); if (qovld != DEFAULT_RCU_QOVLD) - pr_info("\tBoot-time adjustment of callback overload leval to %ld.\n", qovld); + pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) -- cgit v1.2.1 From fcab42ef5dfd32e2961bf443668c209ebfa1ad22 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Dec 2019 11:38:57 -0800 Subject: rcu: Warn on for_each_leaf_node_cpu_mask() from non-leaf The for_each_leaf_node_cpu_mask() and for_each_leaf_node_possible_cpu() macros must be invoked only on leaf rcu_node structures. Failing to abide by this restriction can result in infinite loops on systems with more than 64 CPUs (or for more than 32 CPUs on 32-bit systems). This commit therefore adds WARN_ON_ONCE() calls to make misuse of these two macros easier to debug. Reported-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1779cbf33cd1..00ddc92c5774 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -342,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. 
*/ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -352,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) -- cgit v1.2.1 From 9925babe83a86067d317fcf0d36502c73ca50f5a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Dec 2019 12:11:56 -0800 Subject: rcutorture: Add 100-CPU configuration The small-system rcutorture configurations have served us well for a great many years, but it is now time to add a larger one. This commit does just that, but does not add it to the defaults in CFLIST. This allows the kvm.sh argument '--configs "4*CFLIST TREE10" to run four instances of each of the default configurations concurrently with one instance of the large configuration. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE10 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 new file mode 100644 index 000000000000..2debe7891aeb --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=100 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TREE_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_OBJECTS=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=n -- cgit v1.2.1 From e9d133ad5632b9830047a6f0550cf0ed20048a3e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2019 12:04:33 -0800 Subject: rcutorture: Summarize summary of build and run results When running the default list of tests, the run summary of a successful (that is, failed to find any errors) run fits easily on a 24-line screen. But a run with something like "--configs '5*CFLIST'" will be 80 lines long, and it is all too easy to miss a failure message when scrolling back. This commit therefore prints out the number of runs with failing builds or runtime failures, but only if there are any such failures. For example, a run with a single build error and a single runtime error would print two lines like this: 1 runs with build errors. 1 runs with runtime errors. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index e5edd5198725..0326f4a5ff9c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -13,6 +13,9 @@ # # Authors: Paul E. McKenney +T=/tmp/kvm-recheck.sh.$$ +trap 'rm -f $T' 0 2 + PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . functions.sh for rd in "$@" @@ -68,4 +71,16 @@ do fi done done -EDITOR=echo kvm-find-errors.sh "${@: -1}" > /dev/null 2>&1 +EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1 +ret=$? +builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`" +if test "$builderrors" -gt 0 +then + echo $builderrors runs with build errors. +fi +runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`" +if test "$runerrors" -gt 0 +then + echo $runerrors runs with runtime errors. +fi +exit $ret -- cgit v1.2.1 From 8ffde38cc238b3220a0dbbd99d62e2dda563e9fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2019 12:08:31 -0800 Subject: rcutorture: Make kvm-find-errors.sh abort on bad directory Currently, kvm-find-errors.sh gives a usage prompt when given a bad directory, but then soldiers on, giving a series of confusing error messages. This commit therefore prints an error message and exits when given a bad directory, hopefully reducing confusion. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 1871d00bccd7..6f50722f251f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -20,7 +20,9 @@ rundir="${1}" if test -z "$rundir" -o ! -d "$rundir" then + echo Directory "$rundir" not found. echo Usage: $0 directory + exit 1 fi editor=${EDITOR-vi} -- cgit v1.2.1 From baf6d08e4b5329055e4e6162ef96566583e8e020 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Fri, 6 Dec 2019 13:37:51 +0530 Subject: doc: Add some more RCU list patterns in the kernel - Add more information about RCU list patterns taking examples from audit subsystem in the linux kernel. - Keep the current audit examples, even though the kernel has changed. - Modify inline text for better passage quality. - Fix typo in code-blocks and improve code comments. - Add text formatting (italics, bold and code) for better emphasis. Patch originally submitted at https://lore.kernel.org/patchwork/patch/1082804/ Co-developed-by: Joel Fernandes (Google) Signed-off-by: Amol Grover Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.rst | 275 ++++++++++++++++++++++++++++++++---------- 1 file changed, 211 insertions(+), 64 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index 7956ff33042b..55d2b30db481 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -4,12 +4,61 @@ Using RCU to Protect Read-Mostly Linked Lists ============================================= One of the best applications of RCU is to protect read-mostly linked lists -("struct list_head" in list.h). One big advantage of this approach +(``struct list_head`` in list.h). 
One big advantage of this approach is that all of the required memory barriers are included for you in the list macros. This document describes several applications of RCU, with the best fits first. -Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates + +Example 1: Read-mostly list: Deferred Destruction +------------------------------------------------- + +A widely used usecase for RCU lists in the kernel is lockless iteration over +all processes in the system. ``task_struct::tasks`` represents the list node that +links all the processes. The list can be traversed in parallel to any list +additions or removals. + +The traversal of the list is done using ``for_each_process()`` which is defined +by the 2 macros:: + + #define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + + #define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +The code traversing the list of all processes typically looks like:: + + rcu_read_lock(); + for_each_process(p) { + /* Do something with p */ + } + rcu_read_unlock(); + +The simplified code for removing a process from a task list is:: + + void release_task(struct task_struct *p) + { + write_lock(&tasklist_lock); + list_del_rcu(&p->tasks); + write_unlock(&tasklist_lock); + call_rcu(&p->rcu, delayed_put_task_struct); + } + +When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under +``tasklist_lock`` writer lock protection, to remove the task from the list of +all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals +from corrupting the list. Readers using ``for_each_process()`` are not protected +with the ``tasklist_lock``. To prevent readers from noticing changes in the list +pointers, the ``task_struct`` object is freed only after one or more grace +periods elapse (with the help of call_rcu()). This deferring of destruction +ensures that any readers traversing the list will see valid ``p->tasks.next`` +pointers and deletion/freeing can happen in parallel with traversal of the list. +This pattern is also called an **existence lock**, since RCU pins the object in +memory until all existing readers finish. + + +Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates ---------------------------------------------------------------------- The best applications are cases where, if reader-writer locking were @@ -26,7 +75,7 @@ added or deleted, rather than being modified in place. A straightforward example of this use of RCU may be found in the system-call auditing support. For example, a reader-writer locked -implementation of audit_filter_task() might be as follows:: +implementation of ``audit_filter_task()`` might be as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -34,7 +83,7 @@ implementation of audit_filter_task() might be as follows:: enum audit_state state; read_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { read_unlock(&auditsc_lock); @@ -58,7 +107,7 @@ This means that RCU can be easily applied to the read side, as follows:: enum audit_state state; rcu_read_lock(); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. 
*/ list_for_each_entry_rcu(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { rcu_read_unlock(); @@ -69,18 +118,18 @@ This means that RCU can be easily applied to the read side, as follows:: return AUDIT_BUILD_CONTEXT; } -The read_lock() and read_unlock() calls have become rcu_read_lock() +The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock() and rcu_read_unlock(), respectively, and the list_for_each_entry() has -become list_for_each_entry_rcu(). The _rcu() list-traversal primitives +become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives insert the read-side memory barriers that are required on DEC Alpha CPUs. -The changes to the update side are also straightforward. A reader-writer -lock might be used as follows for deletion and insertion:: +The changes to the update side are also straightforward. A reader-writer lock +might be used as follows for deletion and insertion:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; write_lock(&auditsc_lock); list_for_each_entry(e, list, list) { @@ -113,9 +162,9 @@ Following are the RCU equivalents for these two functions:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not use the _rcu iterator here, since this is the only + /* No need to use the _rcu iterator here, since this is the only * deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -139,41 +188,41 @@ Following are the RCU equivalents for these two functions:: return 0; } -Normally, the write_lock() and write_unlock() would be replaced by -a spin_lock() and a spin_unlock(), but in this case, all callers hold -audit_netlink_sem, so no additional locking is required. The auditsc_lock -can therefore be eliminated, since use of RCU eliminates the need for -writers to exclude readers. Normally, the write_lock() calls would -be converted into spin_lock() calls. +Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a +spin_lock() and a spin_unlock(). But in this case, all callers hold +``audit_filter_mutex``, so no additional locking is required. The +``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the +need for writers to exclude readers. The list_del(), list_add(), and list_add_tail() primitives have been replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu(). -The _rcu() list-manipulation primitives add memory barriers that are -needed on weakly ordered CPUs (most of them!). The list_del_rcu() -primitive omits the pointer poisoning debug-assist code that would -otherwise cause concurrent readers to fail spectacularly. +The **_rcu()** list-manipulation primitives add memory barriers that are needed on +weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the +pointer poisoning debug-assist code that would otherwise cause concurrent +readers to fail spectacularly. -So, when readers can tolerate stale data and when entries are either added -or deleted, without in-place modification, it is very easy to use RCU! +So, when readers can tolerate stale data and when entries are either added or +deleted, without in-place modification, it is very easy to use RCU! 
-Example 2: Handling In-Place Updates + +Example 3: Handling In-Place Updates ------------------------------------ -The system-call auditing code does not update auditing rules in place. -However, if it did, reader-writer-locked code to do so might look as -follows (presumably, the field_count is only permitted to decrease, -otherwise, the added fields would need to be filled in):: +The system-call auditing code does not update auditing rules in place. However, +if it did, the reader-writer-locked code to do so might look as follows +(assuming only ``field_count`` is updated, otherwise, the added fields would +need to be filled in):: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; write_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { e->rule.action = newaction; @@ -188,16 +237,16 @@ otherwise, the added fields would need to be filled in):: The RCU version creates a copy, updates the copy, then replaces the old entry with the newly updated entry. This sequence of actions, allowing -concurrent reads while doing a copy to perform an update, is what gives -RCU ("read-copy update") its name. The RCU code is as follows:: +concurrent reads while making a copy to perform an update, is what gives +RCU (*read-copy update*) its name. The RCU code is as follows:: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -215,34 +264,45 @@ RCU ("read-copy update") its name. The RCU code is as follows:: return -EFAULT; /* No matching rule */ } -Again, this assumes that the caller holds audit_netlink_sem. Normally, -the reader-writer lock would become a spinlock in this sort of code. +Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the +writer lock would become a spinlock in this sort of code. -Example 3: Eliminating Stale Data +Another use of this pattern can be found in the openswitch driver's *connection +tracking table* code in ``ct_limit_set()``. The table holds connection tracking +entries and has a limit on the maximum entries. There is one such table +per-zone and hence one *limit* per zone. The zones are mapped to their limits +through a hashtable using an RCU-managed hlist for the hash chains. When a new +limit is set, a new limit object is allocated and ``ct_limit_set()`` is called +to replace the old limit object with the new one using list_replace_rcu(). +The old limit object is then freed after a grace period using kfree_rcu(). + + +Example 4: Eliminating Stale Data --------------------------------- -The auditing examples above tolerate stale data, as do most algorithms +The auditing example above tolerates stale data, as do most algorithms that are tracking external state. Because there is a delay from the time the external state changes before Linux becomes aware of the change, -additional RCU-induced staleness is normally not a problem. +additional RCU-induced staleness is generally not a problem. However, there are many examples where stale data cannot be tolerated. 
One example in the Linux kernel is the System V IPC (see the ipc_lock() -function in ipc/util.c). This code checks a "deleted" flag under a -per-entry spinlock, and, if the "deleted" flag is set, pretends that the +function in ipc/util.c). This code checks a *deleted* flag under a +per-entry spinlock, and, if the *deleted* flag is set, pretends that the entry does not exist. For this to be helpful, the search function must -return holding the per-entry spinlock, as ipc_lock() does in fact do. +return holding the per-entry lock, as ipc_lock() does in fact do. + +.. _quick_quiz: Quick Quiz: - Why does the search function need to return holding the per-entry lock for - this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? -:ref:`Answer to Quick Quiz ` +:ref:`Answer to Quick Quiz ` -If the system-call audit module were to ever need to reject stale data, -one way to accomplish this would be to add a "deleted" flag and a "lock" -spinlock to the audit_entry structure, and modify audit_filter_task() -as follows:: +If the system-call audit module were to ever need to reject stale data, one way +to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the +audit_entry structure, and modify ``audit_filter_task()`` as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -267,20 +327,20 @@ as follows:: } Note that this example assumes that entries are only added and deleted. -Additional mechanism is required to deal correctly with the -update-in-place performed by audit_upd_rule(). For one thing, -audit_upd_rule() would need additional memory barriers to ensure -that the list_add_rcu() was really executed before the list_del_rcu(). +Additional mechanism is required to deal correctly with the update-in-place +performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would +need additional memory barriers to ensure that the list_add_rcu() was really +executed before the list_del_rcu(). -The audit_del_rule() function would need to set the "deleted" -flag under the spinlock as follows:: +The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the +spinlock as follows:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not need to use the _rcu iterator here, since this + /* No need to use the _rcu iterator here, since this * is the only deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -295,6 +355,91 @@ flag under the spinlock as follows:: return -EFAULT; /* No matching rule */ } +This too assumes that the caller holds ``audit_filter_mutex``. + + +Example 5: Skipping Stale Objects +--------------------------------- + +For some usecases, reader performance can be improved by skipping stale objects +during read-side list traversal if the object in concern is pending destruction +after one or more grace periods. One such example can be found in the timerfd +subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to +setting of the system time, then all programmed timerfds that depend on this +clock get triggered and processes waiting on them to expire are woken up in +advance of their scheduled expiry. 
To facilitate this, all such timers are added +to an RCU-managed ``cancel_list`` when they are setup in +``timerfd_setup_cancel()``:: + + static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) + { + spin_lock(&ctx->cancel_lock); + if ((ctx->clockid == CLOCK_REALTIME && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } + spin_unlock(&ctx->cancel_lock); + } + +When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the +timerfd object is cleared, the object removed from the ``cancel_list`` and +destroyed:: + + int timerfd_release(struct inode *inode, struct file *file) + { + struct timerfd_ctx *ctx = file->private_data; + + spin_lock(&ctx->cancel_lock); + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } + spin_unlock(&ctx->cancel_lock); + + hrtimer_cancel(&ctx->t.tmr); + kfree_rcu(ctx, rcu); + return 0; + } + +If the ``CLOCK_REALTIME`` clock is set, for example by a time server, the +hrtimer framework calls ``timerfd_clock_was_set()`` which walks the +``cancel_list`` and wakes up processes waiting on the timerfd. While iterating +the ``cancel_list``, the ``might_cancel`` flag is consulted to skip stale +objects:: + + void timerfd_clock_was_set(void) + { + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs != ktime_mono_to_real(0)) { + ctx->moffs = KTIME_MAX; + ctx->ticks++; + wake_up_locked_poll(&ctx->wqh, EPOLLIN); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); + } + +The key point here is, because RCU-traversal of the ``cancel_list`` happens +while objects are being added and removed to the list, sometimes the traversal +can step on an object that has been removed from the list. In this example, it +is seen that it is better to skip such objects using a flag. + + Summary ------- @@ -303,19 +448,21 @@ the most amenable to use of RCU. The simplest case is where entries are either added or deleted from the data structure (or atomically modified in place), but non-atomic in-place modifications can be handled by making a copy, updating the copy, then replacing the original with the copy. -If stale data cannot be tolerated, then a "deleted" flag may be used +If stale data cannot be tolerated, then a *deleted* flag may be used in conjunction with a per-entry spinlock in order to allow the search function to reject newly deleted data. -.. _answer_quick_quiz_list: +.. _quick_quiz_answer: Answer to Quick Quiz: - Why does the search function need to return holding the per-entry - lock for this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? If the search function drops the per-entry lock before returning, then the caller will be processing stale data in any case. If it is really OK to be processing stale data, then you don't need a - "deleted" flag. If processing stale data really is a problem, + *deleted* flag. If processing stale data really is a problem, then you need to hold the per-entry lock across all of the code that uses the value that was returned. 
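For instance, a caller of such a search function would keep the per-entry lock held across every use of the returned entry, roughly as follows (a sketch with invented function names, not code taken from the kernel)::

	e = audit_search_rule(rule, list);	/* returns with e->lock held */
	if (e) {
		/* While e->lock is held, the updater cannot mark e deleted. */
		do_something_with(e);
		spin_unlock(&e->lock);		/* drop only after the last use */
	}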
+ +:ref:`Back to Quick Quiz ` -- cgit v1.2.1 From d619ea119e31c924a08f9f456bd808927da95493 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Dec 2019 10:41:48 -0800 Subject: rcutorture: Fix rcu_torture_one_read()/rcu_torture_writer() data race The ->rtort_pipe_count field in the rcu_torture structure checks for too-short grace periods, and is therefore read by rcutorture's readers while being updated by rcutorture's writers. This commit therefore adds the needed READ_ONCE() and WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5efd9503df56..edd97465a0f7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - i = rp->rtort_pipe_count; + i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1291,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) atomic_inc(&n_rcu_torture_mberror); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; -- cgit v1.2.1 From ab4bc61b27155cb225dd713745574bd1e06b5be7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Dec 2019 11:23:50 -0800 Subject: rcutorture: Fix stray access to rcu_fwd_cb_nodelay The rcu_fwd_cb_nodelay variable suppresses excessively long read-side delays while carrying out an rcutorture forward-progress test. As such, it is accessed both by readers and updaters, and most of the accesses therefore use *_ONCE(). Except for one in rcu_read_delay(), which this commit fixes. This data race was reported by KCSAN. Not appropriate for backporting due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index edd97465a0f7..124160a610fa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!READ_ONCE(rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); -- cgit v1.2.1 From 5693b270d4ec4594f65909a8cd0cd6013c57bb92 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 22 Dec 2019 19:32:54 -0800 Subject: srcu: Fix __call_srcu()/process_srcu() datarace The srcu_node structure's ->srcu_gp_seq_needed_exp field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..b1edac93e403 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); @@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } -- cgit v1.2.1 From e81d6efc0ba3a771e28d6d7ea209972ccc31e491 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:36:33 -0800 Subject: srcu: Fix __call_srcu()/srcu_get_delay() datarace The srcu_struct structure's ->srcu_gp_seq_needed_exp field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b1edac93e403..79848f7d575d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); } @@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && -- cgit v1.2.1 From 9b5e417138b38e55ce487e2152a0000e22ef4b41 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 22 Dec 2019 19:39:35 -0800 Subject: srcu: Fix process_srcu()/srcu_batches_completed() datarace The srcu_struct structure's ->srcu_idx field is accessed locklessly, so reads must use READ_ONCE(). This commit therefore adds the needed READ_ONCE() invocation where it was missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 79848f7d575d..119a37319e67 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_idx); } EXPORT_SYMBOL_GPL(srcu_batches_completed); -- cgit v1.2.1 From 2c03bafe6df2fc8b91afd0397f1351e7ca20e302 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Dec 2019 19:55:50 -0800 Subject: rcu: Fix exp_funnel_lock()/rcu_exp_wait_wake() datarace The rcu_node structure's ->exp_seq_rq field is accessed locklessly, so updates must use WRITE_ONCE(). This commit therefore adds the needed WRITE_ONCE() invocation where it was missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 677446373b38..a14571bec6e2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -589,7 +589,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; + WRITE_ONCE(rnp->exp_seq_rq, s); spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ -- cgit v1.2.1 From 6ea79ace4437f5294158a2a14206da50d03d7e50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jan 2020 16:48:05 -0800 Subject: rcu: Provide debug symbols and line numbers in KCSAN runs This commit adds "-g -fno-omit-frame-pointer" to ease interpretation of KCSAN output, but only for CONFIG_KCSAN=y kerrnels. Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 82d5fba48b2f..f91f2c2cf138 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,6 +3,10 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o -- cgit v1.2.1 From 998a098ecb9317fc224335ec13887d38e468b8cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 11:38:51 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->qsmask update The rcu_node structure's ->qsmask field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48fba2257748..e341ad7db734 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1898,7 +1898,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); - rnp->qsmask &= ~mask; + WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, -- cgit v1.2.1 From 7cb1ab97c5b16ca5be3699f6c961df6d3ab544d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 11:42:05 -0800 Subject: srcu: Add READ_ONCE() to srcu_struct ->srcu_gp_seq load The load of the srcu_struct structure's ->srcu_gp_seq field in srcu_funnel_gp_start() is lockless, so this commit adds the requisite READ_ONCE(). This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 119a37319e67..90ab47599bb6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -678,7 +678,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { + rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); if (likely(srcu_init_done)) -- cgit v1.2.1 From b78f0ddda8aada15b5e78038961159edc025fdfd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 12:12:06 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->exp_seq_rq store The rcu_node structure's ->exp_seq_rq field is read locklessly, so this commit adds the WRITE_ONCE() to a store in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a14571bec6e2..6810df709ceb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -314,7 +314,7 @@ static bool exp_funnel_lock(unsigned long s) sync_exp_work_done(s)); return true; } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ + WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("nxtlvl")); -- cgit v1.2.1 From 65db4eaf007b517b5ec6d3665860750a1b0a4574 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 14:18:12 -0800 Subject: rcu: Add *_ONCE() to rcu_node ->exp_tasks plus locking There are lockless loads from the rcu_node structure's ->exp_tasks field, so this commit causes all stores to use WRITE_ONCE() and all lockless loads to use READ_ONCE(). This code also did an unprotected traversal of the linked list pointed into by ->exp_tasks, so this commit also acquires the rcu_node structure's ->lock to properly protect this traversal.
This list was traversed unprotected only when printing an RCU CPU stall warning for an expedited grace period, so the odds of seeing this in production are not all that high. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 13 ++++++++----- kernel/rcu/tree_plugin.h | 11 ++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6810df709ceb..a64dd9553827 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -538,7 +538,7 @@ static void synchronize_rcu_expedited_wait(void) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + ".T"[!!READ_ONCE(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -549,7 +549,7 @@ static void synchronize_rcu_expedited_wait(void) pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + ".T"[!!READ_ONCE(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -716,9 +716,11 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rnp->exp_tasks) return 0; t = list_entry(rnp->exp_tasks->prev, @@ -727,6 +729,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0765784012f8..7a06917637ed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -500,7 +500,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; @@ -615,7 +615,8 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || (rdp->grpmask & READ_ONCE(rnp->expmask)) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. @@ -761,7 +762,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -1036,7 +1037,7 @@ static int rcu_boost_kthread(void *arg) for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); -- cgit v1.2.1 From 51e101de4873b0517ccf6f7e0b088b7f42d04f5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 14:53:31 -0800 Subject: rcu: Add READ_ONCE() to rcu_node ->gp_seq The rcu_node structure's ->gp_seq field is read locklessly, so this commit adds the READ_ONCE() to several loads in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Not appropriate for backporting because this affects only tracing and warnings. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e341ad7db734..4210b8ef7e97 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1133,8 +1133,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, - rnp->level, rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + gp_seq_req, rnp->level, + rnp->grplo, rnp->grphi, s); } /* @@ -1921,7 +1922,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); - oldmask = rnp_c->qsmask; + oldmask = READ_ONCE(rnp_c->qsmask); } /* @@ -2071,7 +2072,7 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; blkd = !!(rnp->qsmask & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; } -- cgit v1.2.1 From 19590269cf70ef6417b91683c849075d98ad4ddf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:17:12 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_state ->gp_req_activity The rcu_state structure's ->gp_req_activity field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. 
Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4210b8ef7e97..2be0ab4225e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1207,7 +1207,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); if (!rcu_state.gp_kthread) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; @@ -1792,7 +1792,7 @@ static void rcu_gp_cleanup(void) rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); -- cgit v1.2.1 From c3dd31192cf27849f78365c82316447fdd605627 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:22:01 -0800 Subject: rcu: Add READ_ONCE to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the READ_ONCE() to a couple of loads in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7a06917637ed..b110cf384f74 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -761,7 +761,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + __func__, READ_ONCE(rnp->gp_tasks), READ_ONCE(rnp->boost_tasks), READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; @@ -1037,7 +1037,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || READ_ONCE(rnp->exp_tasks)); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); -- cgit v1.2.1 From 3ffe4a4a8411b554a18382223ae54f5c72babe9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:44:23 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->qsmaskinitnext The rcu_state structure's ->qsmaskinitnext field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely for systems not doing incessant CPU-hotplug operations. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2be0ab4225e0..393b3e3d41fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3444,7 +3444,7 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; oldmask ^= rnp->expmaskinitnext; @@ -3497,7 +3497,7 @@ void rcu_report_dead(unsigned int cpu) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } - rnp->qsmaskinitnext &= ~mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); -- cgit v1.2.1 From 58d68b76a45dfb89fc274745d02ac646b8fe2952 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 15:59:12 -0800 Subject: locking/rtmutex: rcu: Add WRITE_ONCE() to rt_mutex ->owner The rt_mutex structure's ->owner field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 851bbb10819d..c9f090d64f00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + WRITE_ONCE(lock->owner, (struct task_struct *)val); } static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -- cgit v1.2.1 From 73d5ff0244876008d21db2590ae3ad566375e77b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:14:08 -0800 Subject: rcu: Add READ_ONCE() to rcu_segcblist ->tails[] The rcu_segcblist structure's ->tails[] array entries are read locklessly, so this commit adds the READ_ONCE() to a load in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5f4fd3b8777c..426a472e7308 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* -- cgit v1.2.1 From 3d5bcd6c7f8bcc31b24299d83b8c78a6f5d4330d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:27:00 -0800 Subject: rcutorture: Add READ_ONCE() to rcu_torture_count and rcu_torture_batch The rcutorture rcu_torture_count and rcu_torture_batch per-CPU variables are read locklessly, so this commit adds the READ_ONCE() to a load in order to avoid various types of compiler vandalism^Woptimization. This data race was reported by KCSAN. 
Not appropriate for backporting due to failure being unlikely and due to this being rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 124160a610fa..0b9ce9a00623 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1413,8 +1413,8 @@ rcu_torture_stats_print(void) for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { -- cgit v1.2.1 From aa5910cc9c61cd689350d63e9335226cadc3eb9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2020 16:36:59 -0800 Subject: srcu: Add READ_ONCE() to ->srcu_lock_count and ->srcu_unlock_count arrays The srcu_data structure's ->srcu_lock_count and ->srcu_unlock_count arrays are read and written locklessly, so this commit adds the READ_ONCE() to the loads from these arrays in order to avoid various types of compiler optimizations. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being used only by rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 90ab47599bb6..3c4e6441bbf9 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1266,8 +1266,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = READ_ONCE(sdp->srcu_unlock_count[!idx]); + u1 = READ_ONCE(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1275,8 +1275,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = READ_ONCE(sdp->srcu_lock_count[!idx]); + l1 = READ_ONCE(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; -- cgit v1.2.1 From eec852ec9b6e6dd8893f52f20f3714d8a9e02ac4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Jan 2020 10:44:41 -0800 Subject: rcu: Add WRITE_ONCE() to rcu_node ->boost_tasks The rcu_node structure's ->boost_tasks field is read locklessly, so this commit adds the WRITE_ONCE() to an update in order to provide proper documentation and READ_ONCE()/WRITE_ONCE() pairing. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b110cf384f74..3b1415f03702 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -505,7 +505,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) /* Snapshot ->boost_mtx ownership w/rnp->lock held. 
*/ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -1083,7 +1083,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->qsmask == 0 && (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, rnp->boost_kthread_status); -- cgit v1.2.1 From 45369ce571201700979cccbb7e55ef625f66301f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Jan 2020 11:33:17 -0800 Subject: rcu: *_ONCE() for grace-period progress indicators The various RCU structures' ->gp_seq, ->gp_seq_needed, ->gp_req_activity, and ->gp_activity fields are read locklessly, so they must be updated with WRITE_ONCE() and, when read locklessly, with READ_ONCE(). This commit makes these changes. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_stall.h | 26 +++++++++++++++----------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 393b3e3d41fc..982c7b48cd9b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1182,7 +1182,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, TPS("Prestarted")); goto unlock_out; } - rnp->gp_seq_needed = gp_seq_req; + WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf or internal node, and a @@ -1217,8 +1217,8 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { - rnp_start->gp_seq_needed = rnp->gp_seq_needed; - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); } if (rnp != rnp_start) raw_spin_unlock_rcu_node(rnp); @@ -1433,7 +1433,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -3341,12 +3341,12 @@ int rcutree_prepare_cpu(unsigned int cpu) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. 
*/ - rdp->gp_seq = rnp->gp_seq; - rdp->gp_seq_needed = rnp->gp_seq; + rdp->gp_seq = READ_ONCE(rnp->gp_seq); + rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3b1415f03702..fa057acee373 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -756,7 +756,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - (long)rnp->gp_seq, (long)rnp->completedqs); + (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 7ee8a1cc0d8b..db2031a9c74e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -592,21 +592,22 @@ void show_rcu_gp_kthreads(void) (long)READ_ONCE(rcu_get_root()->gp_seq_needed), READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); + rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), + (long)READ_ONCE(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); + cpu, (long)READ_ONCE(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -631,7 +632,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, static atomic_t warned = ATOMIC_INIT(0); if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed))) return; j = jiffies; /* Expensive access, and in common case don't get here. */ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || @@ -642,7 +644,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { @@ -655,9 +658,10 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_xchg(&warned, 1)) { if (rnp_root != rnp) /* irqs remain disabled. */ -- cgit v1.2.1 From 43c688faca313be1413088cf4e9e8f21dc21eecb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Jan 2020 11:59:58 -0800 Subject: rcu-tasks: *_ONCE() for rcu_tasks_cbs_head The RCU tasks list of callbacks, rcu_tasks_cbs_head, is sampled locklessly by rcu_tasks_kthread() when waiting for work to do. This commit therefore applies READ_ONCE() to that lockless sampling and WRITE_ONCE() to the single potential store outside of rcu_tasks_kthread. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 085f08a898fe..6f7be409586c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -546,7 +546,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rhp->func = func; raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); needwake = !rcu_tasks_cbs_head; - *rcu_tasks_cbs_tail = rhp; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ @@ -676,7 +676,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* If there were none, wait a bit and start over. */ if (!list) { wait_event_interruptible(rcu_tasks_cbs_wq, - rcu_tasks_cbs_head); + READ_ONCE(rcu_tasks_cbs_head)); if (!rcu_tasks_cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); -- cgit v1.2.1 From f6c84d82e351a8d8a01d87be0f09e00d97e16c6d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 7 Jan 2020 17:31:04 +0100 Subject: kcsan: Prefer __always_inline for fast-path Prefer __always_inline for fast-path functions that are called outside of user_access_save, to avoid generating UACCESS warnings when optimizing for size (CC_OPTIMIZE_FOR_SIZE). It will also avoid future surprises with compiler versions that change the inlining heuristic even when optimizing for performance. Report: http://lkml.kernel.org/r/58708908-84a0-0a81-a836-ad97e33dbb62@infradead.org Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 18 +++++++++--------- kernel/kcsan/encoding.h | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index 576e03ddd6a3..a9c193053491 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -18,7 +18,7 @@ * than cast to volatile. Eventually, we hope to be able to remove this * function. 
*/ -static inline bool kcsan_is_atomic(const volatile void *ptr) +static __always_inline bool kcsan_is_atomic(const volatile void *ptr) { /* only jiffies for now */ return ptr == &jiffies; diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3314fc29e236..4d4ab5c5dc53 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -78,10 +78,10 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, - size_t size, - bool expect_write, - long *encoded_watchpoint) +static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, + bool expect_write, + long *encoded_watchpoint) { const int slot = watchpoint_slot(addr); const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK; @@ -146,7 +146,7 @@ insert_watchpoint(unsigned long addr, size_t size, bool is_write) * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool +static __always_inline bool try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); @@ -160,7 +160,7 @@ static inline bool remove_watchpoint(atomic_long_t *watchpoint) return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } -static inline struct kcsan_ctx *get_ctx(void) +static __always_inline struct kcsan_ctx *get_ctx(void) { /* * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would @@ -169,7 +169,7 @@ static inline struct kcsan_ctx *get_ctx(void) return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } -static inline bool is_atomic(const volatile void *ptr) +static __always_inline bool is_atomic(const volatile void *ptr) { struct kcsan_ctx *ctx = get_ctx(); @@ -193,7 +193,7 @@ static inline bool is_atomic(const volatile void *ptr) return kcsan_is_atomic(ptr); } -static inline bool should_watch(const volatile void *ptr, int type) +static __always_inline bool should_watch(const volatile void *ptr, int type) { /* * Never set up watchpoints when memory operations are atomic. @@ -226,7 +226,7 @@ static inline void reset_kcsan_skip(void) this_cpu_write(kcsan_skip, skip_count); } -static inline bool kcsan_is_enabled(void) +static __always_inline bool kcsan_is_enabled(void) { return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index b63890e86449..f03562aaf2eb 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -59,10 +59,10 @@ encode_watchpoint(unsigned long addr, size_t size, bool is_write) (addr & WATCHPOINT_ADDR_MASK)); } -static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, - size_t *size, - bool *is_write) +static __always_inline bool decode_watchpoint(long watchpoint, + unsigned long *addr_masked, + size_t *size, + bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) @@ -78,13 +78,13 @@ static inline bool decode_watchpoint, /* * Return watchpoint slot for an address.
*/ -static inline int watchpoint_slot(unsigned long addr) +static __always_inline int watchpoint_slot(unsigned long addr) { return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS; } -static inline bool matching_access(unsigned long addr1, size_t size1, - unsigned long addr2, size_t size2) +static __always_inline bool matching_access(unsigned long addr1, size_t size1, + unsigned long addr2, size_t size2) { unsigned long end_range1 = addr1 + size1 - 1; unsigned long end_range2 = addr2 + size2 - 1; -- cgit v1.2.1 From 9bfbccc6cc6113fe9c8bd4e628d244c2273cba6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:56 +0100 Subject: doc/RCU/Design: Remove remaining HTML tags in ReST files Commit ccc9971e2147 ("docs: rcu: convert some articles from html to ReST") has converted a few of the HTML RCU docs into ReST files, but a few HTML tags that are not supported in ReST remain. This commit converts those to appropriate ReST alternatives. Reviewed-by: Madhuparna Bhowmik Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 1a8b129cfc04..83ae3b79a643 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -4,7 +4,7 @@ A Tour Through TREE_RCU's Grace-Period Memory Ordering August 8, 2017 -This article was contributed by Paul E. McKenney +This article was contributed by Paul E. McKenney Introduction ============ @@ -48,7 +48,7 @@ Tree RCU Grace Period Memory Ordering Building Blocks The workhorse for RCU's grace-period memory ordering is the critical section for the ``rcu_node`` structure's -``->lock``. These critical sections use helper functions for lock +``->lock``. These critical sections use helper functions for lock acquisition, including ``raw_spin_lock_rcu_node()``, ``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``. Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``, @@ -102,9 +102,9 @@ lock-acquisition and lock-release functions:: 23 r3 = READ_ONCE(x); 24 } 25 - 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); + 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); -The ``WARN_ON()`` is evaluated at “the end of time”, +The ``WARN_ON()`` is evaluated at "the end of time", after all changes have propagated throughout the system. Without the ``smp_mb__after_unlock_lock()`` provided by the acquisition functions, this ``WARN_ON()`` could trigger, for example -- cgit v1.2.1 From 45abaee2653a3f6ee2cc14d9596619e94d7002b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:57 +0100 Subject: doc/RCU/listRCU: Fix typos in a example code snippets Signed-off-by: SeongJae Park Signed-off-by: Paul E.
McKenney --- Documentation/RCU/listRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index 55d2b30db481..e768f56e8fa3 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -226,7 +226,7 @@ need to be filled in):: list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { e->rule.action = newaction; - e->rule.file_count = newfield_count; + e->rule.field_count = newfield_count; write_unlock(&auditsc_lock); return 0; } @@ -255,7 +255,7 @@ RCU (*read-copy update*) its name. The RCU code is as follows:: return -ENOMEM; audit_copy_rule(&ne->rule, &e->rule); ne->rule.action = newaction; - ne->rule.file_count = newfield_count; + ne->rule.field_count = newfield_count; list_replace_rcu(&e->list, &ne->list); call_rcu(&e->rcu, audit_free_rule); return 0; -- cgit v1.2.1 From 4ec1918fe0e4ceafba35fd2cae6e950ca8a22108 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:58 +0100 Subject: doc/RCU/listRCU: Update example function name listRCU.rst document gives an example with 'ipc_lock()', but the function has dropped off by commit 82061c57ce93 ("ipc: drop ipc_lock()"). Because the main logic of 'ipc_lock()' has melded in 'shm_lock()' by the commit, this commit updates the document to use 'shm_lock()' instead. Reviewed-by: Madhuparna Bhowmik Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index e768f56e8fa3..2a643e293fb4 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -286,11 +286,11 @@ time the external state changes before Linux becomes aware of the change, additional RCU-induced staleness is generally not a problem. However, there are many examples where stale data cannot be tolerated. -One example in the Linux kernel is the System V IPC (see the ipc_lock() -function in ipc/util.c). This code checks a *deleted* flag under a +One example in the Linux kernel is the System V IPC (see the shm_lock() +function in ipc/shm.c). This code checks a *deleted* flag under a per-entry spinlock, and, if the *deleted* flag is set, pretends that the entry does not exist. For this to be helpful, the search function must -return holding the per-entry lock, as ipc_lock() does in fact do. +return holding the per-entry spinlock, as shm_lock() does in fact do. .. _quick_quiz: -- cgit v1.2.1 From a999ccc77095336d34760e48603d5123a24d6771 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:07:59 +0100 Subject: doc/RCU/rcu: Use ':ref:' for links to other docs Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 8dfb437dacc3..a1dd71d01862 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -11,8 +11,8 @@ must be long enough that any readers accessing the item being deleted have since dropped their references. For example, an RCU-protected deletion from a linked list would first remove the item from the list, wait for a grace period to elapse, then free the element. See the -Documentation/RCU/listRCU.rst file for more information on using RCU with -linked lists. 
+:ref:`Documentation/RCU/listRCU.rst ` for more information on +using RCU with linked lists. Frequently Asked Questions -------------------------- @@ -50,7 +50,7 @@ Frequently Asked Questions - If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? - See the Documentation/RCU/UP.rst file for more information. + See :ref:`Documentation/RCU/UP.rst ` for more information. - How can I see where RCU is currently used in the Linux kernel? @@ -68,9 +68,9 @@ Frequently Asked Questions - Why the name "RCU"? - "RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst - has more information on where this name came from, search for - "read-copy update" to find it. + "RCU" stands for "read-copy update". + :ref:`Documentation/RCU/listRCU.rst ` has more information on where + this name came from, search for "read-copy update" to find it. - I hear that RCU is patented? What is with that? -- cgit v1.2.1 From 8235c6a6363b531273816bde40a907e59b61c225 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:00 +0100 Subject: doc/RCU/rcu: Use absolute paths for non-rst files Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index a1dd71d01862..2a830c51477e 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -75,7 +75,7 @@ Frequently Asked Questions - I hear that RCU is patented? What is with that? Yes, it is. There are several known patents related to RCU, - search for the string "Patent" in RTFP.txt to find them. + search for the string "Patent" in Documentation/RCU/RTFP.txt to find them. Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. There are now also LGPL implementations of user-level RCU @@ -88,5 +88,5 @@ Frequently Asked Questions - Where can I find more information on RCU? - See the RTFP.txt file in this directory. + See the Documentation/RCU/RTFP.txt file. Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/). -- cgit v1.2.1 From d955486631bdeb4ba3c8a8e92d0de0094c308226 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:01 +0100 Subject: doc/RCU/rcu: Use https instead of http if possible Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 2a830c51477e..0e03c6ef3147 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -79,7 +79,7 @@ Frequently Asked Questions Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. There are now also LGPL implementations of user-level RCU - available (http://liburcu.org/). + available (https://liburcu.org/). - I hear that RCU needs work in order to support realtime kernels? -- cgit v1.2.1 From f49d5319bbf0a368e749f0b37f508a727fdde703 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2020 21:08:02 +0100 Subject: rcu: Fix typos in beginning comments Signed-off-by: SeongJae Park Signed-off-by: Paul E. 
McKenney --- kernel/rcu/srcutree.c | 2 +- kernel/rcu/tree.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3c4e6441bbf9..0b561480e6c4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney + * Authors: Paul McKenney * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 982c7b48cd9b..8367fc080801 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma * Manfred Spraul - * Paul E. McKenney Hierarchical version + * Paul E. McKenney * * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. -- cgit v1.2.1 From 53c0e1fe6eb640510a5e9d2205e343399c827b2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jan 2020 15:48:39 -0800 Subject: rcu: Add READ_ONCE() to rcu_data ->gpwrap The rcu_data structure's ->gpwrap field is read locklessly, and so this commit adds the required READ_ONCE() to a pair of loads in order to avoid destructive compiler optimizations. This data race was reported by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_stall.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8367fc080801..7e0048cd6389 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1329,7 +1329,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index db2031a9c74e..c1e165b97821 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -602,7 +602,7 @@ void show_rcu_gp_kthreads(void) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || + if (READ_ONCE(rdp->gpwrap) || ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rdp->gp_seq_needed))) continue; -- cgit v1.2.1 From e5622c00d4f01e10ddbc5b02b94f452503d5c17e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 19:58:13 -0800 Subject: rcutorture: Annotation lockless accesses to rcu_torture_current The rcutorture global variable rcu_torture_current is accessed locklessly, so it must use the RCU pointer load/store primitives. This commit therefore adds several that were missed. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely and due to this being used only by rcutorture. Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcutorture.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0b9ce9a00623..7e01e9a87352 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1407,6 +1407,7 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; @@ -1423,10 +1424,10 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); + rtcp = rcu_access_pointer(rcu_torture_current); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current && !rcu_stall_is_suppressed_at_boot() - ? "ver" : "VER", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1482,7 +1483,8 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL && !rcu_stall_is_suppressed()) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; -- cgit v1.2.1 From 5aaa108cd28695e549ad74b77d22c84aab02eaa4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 20:06:25 -0800 Subject: rcu: Add *_ONCE() to rcu_data ->rcu_forced_tick The rcu_data structure's ->rcu_forced_tick field is read locklessly, so this commit adds WRITE_ONCE() to all updates and READ_ONCE() to all lockless reads. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7e0048cd6389..73e13b530923 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -824,11 +824,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { raw_spin_lock_rcu_node(rdp->mynode); // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; + WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); @@ -905,7 +906,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - rdp->rcu_forced_tick = false; + WRITE_ONCE(rdp->rcu_forced_tick, false); } } -- cgit v1.2.1 From bead35304881355a255a1fcf94a317079d1b04c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2020 20:12:59 -0800 Subject: rcu: Add *_ONCE() to rcu_node ->boost_kthread_status The rcu_node structure's ->boost_kthread_status field is accessed locklessly, so this commit causes all updates to use WRITE_ONCE() and all reads to use READ_ONCE(). This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa057acee373..a7d7c279c6bc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1035,19 +1035,19 @@ static int rcu_boost_kthread(void *arg) trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(READ_ONCE(rnp->boost_tasks) || READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); if (more2boost) spincnt++; else spincnt = 0; if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); @@ -1086,7 +1086,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, - rnp->boost_kthread_status); + READ_ONCE(rnp->boost_kthread_status)); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.1
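As a minimal illustration of the READ_ONCE()/WRITE_ONCE() annotation pattern that the KCSAN-motivated patches above apply, the sketch below shows the same idea in a self-contained userspace program. It is not taken from any of these patches: the macro definitions only approximate the kernel's scalar-case *_ONCE() accessors using volatile casts, and the flag name, thread function, and delays are invented for the example::

    /*
     * Userspace approximation of the kernel's scalar READ_ONCE()/WRITE_ONCE():
     * a volatile access forces the compiler to emit exactly one load or store
     * for the marked access and prevents tearing and fusing of that access.
     */
    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    #define READ_ONCE(x)     (*(const volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

    static int need_flag;   /* read locklessly by the reader thread */

    static void *reader(void *arg)
    {
            (void)arg;
            /* Lockless polling load, marked with READ_ONCE(). */
            while (!READ_ONCE(need_flag))
                    usleep(1000);
            printf("reader: observed need_flag\n");
            return NULL;
    }

    int main(void)
    {
            pthread_t tid;

            if (pthread_create(&tid, NULL, reader, NULL))
                    return 1;
            usleep(10000);
            /* Lockless store, marked with WRITE_ONCE(); pairs with the read above. */
            WRITE_ONCE(need_flag, 1);
            pthread_join(tid, NULL);
            return 0;
    }

With GCC or Clang this builds as, for example, ``cc -pthread sketch.c`` (the file name is arbitrary). In the kernel itself, the real READ_ONCE()/WRITE_ONCE() macros additionally document for KCSAN that the marked lockless accesses are intentional, which is why the patches above add them wherever KCSAN reported a data race.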