681 files changed, 12345 insertions, 12211 deletions
diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt index 82001a25a14b..1f246eb25ca5 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/IRQ-domain.txt @@ -231,5 +231,42 @@ needs to: 4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap, they are unused with hierarchy irq_domain. -Hierarchy irq_domain may also be used to support other architectures, -such as ARM, ARM64 etc. +Hierarchy irq_domain is in no way x86 specific, and is heavily used to +support other architectures, such as ARM, ARM64 etc. + +=== Debugging === + +If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on +CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in +your debugfs mount point, called irq_domain_mapping. This file +contains a live snapshot of all the IRQ domains in the system: + + name mapped linear-max direct-max devtree-node + pl061 8 8 0 /smb/gpio@e0080000 + pl061 8 8 0 /smb/gpio@e1050000 + pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000 + MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 + GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 + GICv2 448 448 0 /interrupt-controller@e1101000 + +it also iterates over the interrupts to display their mapping in the +domains, and makes the domain stacking visible: + + +irq hwirq chip name chip data active type domain + 1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2 + 3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2 +[...] + 96 0x81808 MSI 0x (null) RADIX MSI + 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m + 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2 + 97 0x08800 MSI 0x (null) * RADIX MSI + 97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m + 97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2 + +Here, interrupts 1-5 are only using a single domain, while 96 and 97 +are build out of a stack of three domain, each level performing a +particular function. diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 1672573b037a..f46980c060aa 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -28,8 +28,6 @@ stallwarn.txt - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) -trace.txt - - CONFIG_RCU_TRACE debugfs files and formats UP.txt - RCU on Uniprocessor Systems whatisRCU.txt diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index f60adf112663..95b30fa25d56 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -559,9 +559,7 @@ The <tt>rcu_access_pointer()</tt> on line 6 is similar to For <tt>remove_gp_synchronous()</tt>, as long as all modifications to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, the above optimizations are harmless. - However, - with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, - <tt>sparse</tt> will complain if you + However, <tt>sparse</tt> will complain if you define <tt>gp</tt> with <tt>__rcu</tt> and then access it without using either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. @@ -1849,7 +1847,8 @@ mass storage, or user patience, whichever comes first. 
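The sparse checking referred to in the Requirements.html hunk above now applies unconditionally to any pointer tagged with __rcu (CONFIG_SPARSE_RCU_POINTER is gone). As a minimal sketch of the pattern being checked -- the gp_data type and the reader()/updater() helpers are illustrative names, not code from these patches:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct gp_data {
        int a;
    };

    /* The __rcu tag is what lets sparse flag plain accesses. */
    static struct gp_data __rcu *gp;
    static DEFINE_SPINLOCK(gp_lock);

    static int reader(void)
    {
        struct gp_data *p;
        int val = -1;

        rcu_read_lock();
        p = rcu_dereference(gp);        /* accepted by sparse */
        if (p)
            val = p->a;
        rcu_read_unlock();
        return val;
    }

    static void updater(struct gp_data *newp)
    {
        struct gp_data *oldp;

        spin_lock(&gp_lock);
        oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
        rcu_assign_pointer(gp, newp);   /* accepted by sparse */
        spin_unlock(&gp_lock);

        synchronize_rcu();
        kfree(oldp);
    }

A plain assignment such as "p = gp;" in reader() would draw a sparse address-space warning once gp carries the __rcu annotation, which is exactly the class of error the text above is concerned with.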
If the nesting is not visible to the compiler, as is the case with mutually recursive functions each in its own translation unit, stack overflow will result. -If the nesting takes the form of loops, either the control variable +If the nesting takes the form of loops, perhaps in the guise of tail +recursion, either the control variable will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. Nevertheless, this class of RCU implementations is one of the most composable constructs in existence. @@ -1977,9 +1976,8 @@ guard against mishaps and misuse: and <tt>rcu_dereference()</tt>, perhaps (incorrectly) substituting a simple assignment. To catch this sort of error, a given RCU-protected pointer may be - tagged with <tt>__rcu</tt>, after which running sparse - with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain - about simple-assignment accesses to that pointer. + tagged with <tt>__rcu</tt>, after which sparse + will complain about simple-assignment accesses to that pointer. Arnd Bergmann made me aware of this requirement, and also supplied the needed <a href="https://lwn.net/Articles/376011/">patch series</a>. @@ -2036,7 +2034,7 @@ guard against mishaps and misuse: some other synchronization mechanism, for example, reference counting. <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related - information is provided via both debugfs and event tracing. + information is provided via event tracing. <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt> to create typical linked data structures can be surprisingly error-prone. @@ -2519,11 +2517,7 @@ It is similarly socially unacceptable to interrupt an <tt>nohz_full</tt> CPU running in userspace. RCU must therefore track <tt>nohz_full</tt> userspace execution. -And in -<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in +RCU must therefore be able to sample state at two points in time, and be able to determine whether or not some other CPU spent any time idle and/or executing in userspace. @@ -2936,6 +2930,20 @@ to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt> need not exclude CPU-hotplug operations. <p> +SRCU also differs from other RCU flavors in that SRCU's expedited and +non-expedited grace periods are implemented by the same mechanism. +This means that in the current SRCU implementation, expediting a +future grace period has the side effect of expediting all prior +grace periods that have not yet completed. +(But please note that this is a property of the current implementation, +not necessarily of future implementations.) +In addition, if SRCU has been idle for longer than the interval +specified by the <tt>srcutree.exp_holdoff</tt> kernel boot parameter +(25 microseconds by default), +and if a <tt>synchronize_srcu()</tt> invocation ends this idle period, +that invocation will be automatically expedited. + +<p> As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating a locking bottleneck present in prior kernel versions. 
Although this will allow users to put much heavier stress on diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 877947130ebe..6beda556faf3 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -413,11 +413,11 @@ over a rather long period of time, but improvements are always welcome! read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. -17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the - __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to - validate your RCU code. These can help find problems as follows: +17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the + __rcu sparse checks to validate your RCU code. These can help + find problems as follows: - CONFIG_PROVE_RCU: check that accesses to RCU-protected data + CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data structures are carried out under the proper RCU read-side critical section, while holding the right combination of locks, or whatever other conditions diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt deleted file mode 100644 index 6549012033f9..000000000000 --- a/Documentation/RCU/trace.txt +++ /dev/null @@ -1,535 +0,0 @@ -CONFIG_RCU_TRACE debugfs Files and Formats - - -The rcutree and rcutiny implementations of RCU provide debugfs trace -output that summarizes counters and state. This information is useful for -debugging RCU itself, and can sometimes also help to debug abuses of RCU. -The following sections describe the debugfs files and formats, first -for rcutree and next for rcutiny. - - -CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats - -These implementations of RCU provide several debugfs directories under the -top-level directory "rcu": - -rcu/rcu_bh -rcu/rcu_preempt -rcu/rcu_sched - -Each directory contains files for the corresponding flavor of RCU. -Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU. -For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor, -so that activity for both appears in rcu/rcu_sched. - -In addition, the following file appears in the top-level directory: -rcu/rcutorture. This file displays rcutorture test progress. The output -of "cat rcu/rcutorture" looks as follows: - -rcutorture test sequence: 0 (test in progress) -rcutorture update version number: 615 - -The first line shows the number of rcutorture tests that have completed -since boot. If a test is currently running, the "(test in progress)" -string will appear as shown above. The second line shows the number of -update cycles that the current test has started, or zero if there is -no test in progress. - - -Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly -also rcu/rcu_preempt) the following files will be present: - -rcudata: - Displays fields in struct rcu_data. -rcuexp: - Displays statistics for expedited grace periods. -rcugp: - Displays grace-period counters. -rcuhier: - Displays the struct rcu_node hierarchy. -rcu_pending: - Displays counts of the reasons rcu_pending() decided that RCU had - work to do. -rcuboost: - Displays RCU boosting statistics. Only present if - CONFIG_RCU_BOOST=y. - -The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - - 0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... 
b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 - -This file has one line per CPU, or eight for this 8-CPU system. -The fields are as follows: - -o The number at the beginning of each line is the CPU number. - CPUs numbers followed by an exclamation mark are offline, - but have been online at least once since boot. There will be - no output for CPUs that have never been online, which can be - a good thing in the surprisingly common case where NR_CPUS is - substantially larger than the number of actual CPUs. - -o "c" is the count of grace periods that this CPU believes have - completed. Offlined CPUs and CPUs in dynticks idle mode may lag - quite a ways behind, for example, CPU 4 under "rcu_sched" above, - which has been offline through 16 RCU grace periods. It is not - unusual to see offline CPUs lagging by thousands of grace periods. - Note that although the grace-period number is an unsigned long, - it is printed out as a signed long to allow more human-friendly - representation near boot time. - -o "g" is the count of grace periods that this CPU believes have - started. Again, offlined CPUs and CPUs in dynticks idle mode - may lag behind. If the "c" and "g" values are equal, this CPU - has already reported a quiescent state for the last RCU grace - period that it is aware of, otherwise, the CPU believes that it - owes RCU a quiescent state. - -o "pq" indicates that this CPU has passed through a quiescent state - for the current grace period. It is possible for "pq" to be - "1" and "c" different than "g", which indicates that although - the CPU has passed through a quiescent state, either (1) this - CPU has not yet reported that fact, (2) some other CPU has not - yet reported for this grace period, or (3) both. - -o "qp" indicates that RCU still expects a quiescent state from - this CPU. Offlined CPUs and CPUs in dyntick idle mode might - well have qp=1, which is OK: RCU is still ignoring them. - -o "dt" is the current value of the dyntick counter that is incremented - when entering or leaving idle, either due to a context switch or - due to an interrupt. This number is even if the CPU is in idle - from RCU's viewpoint and odd otherwise. The number after the - first "/" is the interrupt nesting depth when in idle state, - or a large number added to the interrupt-nesting depth when - running a non-idle task. Some architectures do not accurately - count interrupt nesting when running in non-idle kernel context, - which can result in interesting anomalies such as negative - interrupt-nesting levels. The number after the second "/" - is the NMI nesting depth. - -o "df" is the number of times that some other CPU has forced a - quiescent state on behalf of this CPU due to this CPU being in - idle state. 
- -o "of" is the number of times that some other CPU has forced a - quiescent state on behalf of this CPU due to this CPU being - offline. In a perfect world, this might never happen, but it - turns out that offlining and onlining a CPU can take several grace - periods, and so there is likely to be an extended period of time - when RCU believes that the CPU is online when it really is not. - Please note that erring in the other direction (RCU believing a - CPU is offline when it is really alive and kicking) is a fatal - error, so it makes sense to err conservatively. - -o "ql" is the number of RCU callbacks currently residing on - this CPU. The first number is the number of "lazy" callbacks - that are known to RCU to only be freeing memory, and the number - after the "/" is the total number of callbacks, lazy or not. - These counters count callbacks regardless of what phase of - grace-period processing that they are in (new, waiting for - grace period to start, waiting for grace period to end, ready - to invoke). - -o "qs" gives an indication of the state of the callback queue - with four characters: - - "N" Indicates that there are callbacks queued that are not - ready to be handled by the next grace period, and thus - will be handled by the grace period following the next - one. - - "R" Indicates that there are callbacks queued that are - ready to be handled by the next grace period. - - "W" Indicates that there are callbacks queued that are - waiting on the current grace period. - - "D" Indicates that there are callbacks queued that have - already been handled by a prior grace period, and are - thus waiting to be invoked. Note that callbacks in - the process of being invoked are not counted here. - Callbacks in the process of being invoked are those - that have been removed from the rcu_data structures - queues by rcu_do_batch(), but which have not yet been - invoked. - - If there are no callbacks in a given one of the above states, - the corresponding character is replaced by ".". - -o "b" is the batch limit for this CPU. If more than this number - of RCU callbacks is ready to invoke, then the remainder will - be deferred. - -o "ci" is the number of RCU callbacks that have been invoked for - this CPU. Note that ci+nci+ql is the number of callbacks that have - been registered in absence of CPU-hotplug activity. - -o "nci" is the number of RCU callbacks that have been offloaded from - this CPU. This will always be zero unless the kernel was built - with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot - parameter was specified. - -o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. These orphaned callbacks have been moved - to an arbitrarily chosen online CPU. - -o "ca" is the number of RCU callbacks that have been adopted by this - CPU due to other CPUs going offline. Note that ci+co-ca+ql is - the number of RCU callbacks registered on this CPU. - - -Kernels compiled with CONFIG_RCU_BOOST=y display the following from -/debug/rcu/rcu_preempt/rcudata: - - 0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. 
kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 - -This is similar to the output discussed above, but contains the following -additional fields: - -o "kt" is the per-CPU kernel-thread state. The digit preceding - the first slash is zero if there is no work pending and 1 - otherwise. The character between the first pair of slashes is - as follows: - - "S" The kernel thread is stopped, in other words, all - CPUs corresponding to this rcu_node structure are - offline. - - "R" The kernel thread is running. - - "W" The kernel thread is waiting because there is no work - for it to do. - - "O" The kernel thread is waiting because it has been - forced off of its designated CPU or because its - ->cpus_allowed mask permits it to run on other than - its designated CPU. - - "Y" The kernel thread is yielding to avoid hogging CPU. - - "?" Unknown value, indicates a bug. - - The number after the final slash is the CPU that the kthread - is actually running on. - - This field is displayed only for CONFIG_RCU_BOOST kernels. - -o "ktl" is the low-order 16 bits (in hexadecimal) of the count of - the number of times that this CPU's per-CPU kthread has gone - through its loop servicing invoke_rcu_cpu_kthread() requests. - - This field is displayed only for CONFIG_RCU_BOOST kernels. - - -The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: - -s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872 - -These fields are as follows: - -o "s" is the sequence number, with an odd number indicating that - an expedited grace period is in progress. - -o "wd1", "wd2", and "wd3" are the number of times that an attempt - to start an expedited grace period found that someone else had - completed an expedited grace period that satisfies the attempted - request. "Our work is done." - -o "enq" is the number of quiescent states still outstanding. - -o "sc" is the number of times that the attempt to start a - new expedited grace period succeeded. - - -The output of "cat rcu/rcu_preempt/rcugp" looks as follows: - -completed=31249 gpnum=31250 age=1 max=18 - -These fields are taken from the rcu_state structure, and are as follows: - -o "completed" is the number of grace periods that have completed. - It is comparable to the "c" field from rcu/rcudata in that a - CPU whose "c" field matches the value of "completed" is aware - that the corresponding RCU grace period has completed. - -o "gpnum" is the number of grace periods that have started. It is - similarly comparable to the "g" field from rcu/rcudata in that - a CPU whose "g" field matches the value of "gpnum" is aware that - the corresponding RCU grace period has started. - - If these two fields are equal, then there is no grace period - in progress, in other words, RCU is idle. On the other hand, - if the two fields differ (as they are above), then an RCU grace - period is in progress. - -o "age" is the number of jiffies that the current grace period - has extended for, or zero if there is no grace period currently - in effect. 
- -o "max" is the age in jiffies of the longest-duration grace period - thus far. - -The output of "cat rcu/rcu_preempt/rcuhier" looks as follows: - -c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0 -3/3 ..>. 0:7 ^0 -e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1 - -The fields are as follows: - -o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp. - -o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp. - -o "s" is the current state of the force_quiescent_state() - state machine. - -o "jfq" is the number of jiffies remaining for this grace period - before force_quiescent_state() is invoked to help push things - along. Note that CPUs in idle mode throughout the grace period - will not report on their own, but rather must be check by some - other CPU via force_quiescent_state(). - -o "j" is the low-order four hex digits of the jiffies counter. - Yes, Paul did run into a number of problems that turned out to - be due to the jiffies counter no longer counting. Why do you ask? - -o "nfqs" is the number of calls to force_quiescent_state() since - boot. - -o "nfqsng" is the number of useless calls to force_quiescent_state(), - where there wasn't actually a grace period active. This can - no longer happen due to grace-period processing being pushed - into a kthread. The number in parentheses is the difference - between "nfqs" and "nfqsng", or the number of times that - force_quiescent_state() actually did some real work. - -o "fqlh" is the number of calls to force_quiescent_state() that - exited immediately (without even being counted in nfqs above) - due to contention on ->fqslock. - -o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node - structure. Each line represents one level of the hierarchy, - from root to leaves. It is best to think of the rcu_data - structures as forming yet another level after the leaves. - Note that there might be either one, two, three, or even four - levels of rcu_node structures, depending on the relationship - between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly - adjusted using the rcu_fanout_leaf kernel boot parameter), and - CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of - possible CPUs for the booting hardware). - - o The numbers separated by the "/" are the qsmask followed - by the qsmaskinit. The qsmask will have one bit - set for each entity in the next lower level that has - not yet checked in for the current grace period ("e" - indicating CPUs 5, 6, and 7 in the example above). - The qsmaskinit will have one bit for each entity that is - currently expected to check in during each grace period. - The value of qsmaskinit is assigned to that of qsmask - at the beginning of each grace period. - - o The characters separated by the ">" indicate the state - of the blocked-tasks lists. A "G" preceding the ">" - indicates that at least one task blocked in an RCU - read-side critical section blocks the current grace - period, while a "E" preceding the ">" indicates that - at least one task blocked in an RCU read-side critical - section blocks the current expedited grace period. - A "T" character following the ">" indicates that at - least one task is blocked within an RCU read-side - critical section, regardless of whether any current - grace period (expedited or normal) is inconvenienced. - A "." character appears if the corresponding condition - does not hold, so that "..>." indicates that no tasks - are blocked. 
In contrast, "GE>T" indicates maximal - inconvenience from blocked tasks. CONFIG_TREE_RCU - builds of the kernel will always show "..>.". - - o The numbers separated by the ":" are the range of CPUs - served by this struct rcu_node. This can be helpful - in working out how the hierarchy is wired together. - - For example, the example rcu_node structure shown above - has "0:7", indicating that it covers CPUs 0 through 7. - - o The number after the "^" indicates the bit in the - next higher level rcu_node structure that this rcu_node - structure corresponds to. For example, the "d/d ..>. 4:7 - ^1" has a "1" in this position, indicating that it - corresponds to the "1" bit in the "3" shown in the - "3/3 ..>. 0:7 ^0" entry on the next level up. - - -The output of "cat rcu/rcu_sched/rcu_pending" looks as follows: - - 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0 - 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0 - 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0 - 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0 - 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0 - 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0 - 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0 - 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0 - -The fields are as follows: - -o The leading number is the CPU number, with "!" indicating - an offline CPU. - -o "np" is the number of times that __rcu_pending() has been invoked - for the corresponding flavor of RCU. - -o "qsp" is the number of times that the RCU was waiting for a - quiescent state from this CPU. - -o "rpq" is the number of times that the CPU had passed through - a quiescent state, but not yet reported it to RCU. - -o "cbr" is the number of times that this CPU had RCU callbacks - that had passed through a grace period, and were thus ready - to be invoked. - -o "cng" is the number of times that this CPU needed another - grace period while RCU was idle. - -o "gpc" is the number of times that an old grace period had - completed, but this CPU was not yet aware of it. - -o "gps" is the number of times that a new grace period had started, - but this CPU was not yet aware of it. - -o "ndw" is the number of times that a wakeup of an rcuo - callback-offload kthread had to be deferred in order to avoid - deadlock. - -o "nn" is the number of times that this CPU needed nothing. - - -The output of "cat rcu/rcuboost" looks as follows: - -0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 - balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0 -4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 - balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0 - -This information is output only for rcu_preempt. Each two-line entry -corresponds to a leaf rcu_node structure. The fields are as follows: - -o "n:m" is the CPU-number range for the corresponding two-line - entry. In the sample output above, the first entry covers - CPUs zero through three and the second entry covers CPUs four - through seven. - -o "tasks=TNEB" gives the state of the various segments of the - rnp->blocked_tasks list: - - "T" This indicates that there are some tasks that blocked - while running on one of the corresponding CPUs while - in an RCU read-side critical section. - - "N" This indicates that some of the blocked tasks are preventing - the current normal (non-expedited) grace period from - completing. 
- - "E" This indicates that some of the blocked tasks are preventing - the current expedited grace period from completing. - - "B" This indicates that some of the blocked tasks are in - need of RCU priority boosting. - - Each character is replaced with "." if the corresponding - condition does not hold. - -o "kt" is the state of the RCU priority-boosting kernel - thread associated with the corresponding rcu_node structure. - The state can be one of the following: - - "S" The kernel thread is stopped, in other words, all - CPUs corresponding to this rcu_node structure are - offline. - - "R" The kernel thread is running. - - "W" The kernel thread is waiting because there is no work - for it to do. - - "Y" The kernel thread is yielding to avoid hogging CPU. - - "?" Unknown value, indicates a bug. - -o "ntb" is the number of tasks boosted. - -o "neb" is the number of tasks boosted in order to complete an - expedited grace period. - -o "nnb" is the number of tasks boosted in order to complete a - normal (non-expedited) grace period. When boosting a task - that was blocking both an expedited and a normal grace period, - it is counted against the expedited total above. - -o "j" is the low-order 16 bits of the jiffies counter in - hexadecimal. - -o "bt" is the low-order 16 bits of the value that the jiffies - counter will have when we next start boosting, assuming that - the current grace period does not end beforehand. This is - also in hexadecimal. - -o "balk: nt" counts the number of times we didn't boost (in - other words, we balked) even though it was time to boost because - there were no blocked tasks to boost. This situation occurs - when there is one blocked task on one rcu_node structure and - none on some other rcu_node structure. - -o "egt" counts the number of times we balked because although - there were blocked tasks, none of them were blocking the - current grace period, whether expedited or otherwise. - -o "bt" counts the number of times we balked because boosting - had already been initiated for the current grace period. - -o "nb" counts the number of times we balked because there - was at least one task blocking the current non-expedited grace - period that never had blocked. If it is already running, it - just won't help to boost its priority! - -o "ny" counts the number of times we balked because it was - not yet time to start boosting. - -o "nos" counts the number of times we balked for other - reasons, e.g., the grace period ended first. - - -CONFIG_TINY_RCU debugfs Files and Formats - -These implementations of RCU provides a single debugfs file under the -top-level directory RCU, namely rcu/rcudata, which displays fields in -rcu_bh_ctrlblk and rcu_sched_ctrlblk. - -The output of "cat rcu/rcudata" is as follows: - -rcu_sched: qlen: 0 -rcu_bh: qlen: 0 - -This is split into rcu_sched and rcu_bh sections. The field is as -follows: - -o "qlen" is the number of RCU callbacks currently waiting either - for an RCU grace period or waiting to be invoked. This is the - only field present for rcu_sched and rcu_bh, due to the - short-circuiting of grace period in those two cases. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b0a6cf0a7e14..2a0b7986a5dc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2147,6 +2147,12 @@ memmap=nn[KMG]@ss[KMG] [KNL] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. 
+ If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], + which limits max address to nn[KMG]. + Multiple different regions can be specified, + comma delimited. + Example: + memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] [KNL,ACPI] Mark specific memory as ACPI data. @@ -2159,6 +2165,9 @@ memmap=64K$0x18690000 or memmap=0x10000$0x18690000 + Some bootloaders may need an escape character before '$', + like Grub2, otherwise '$' and the following number + will be eaten. memmap=nn[KMG]!ss[KMG] [KNL,X86] Mark specific memory as protected. @@ -3249,21 +3258,17 @@ rcutree.gp_cleanup_delay= [KNL] Set the number of jiffies to delay each step of - RCU grace-period cleanup. This only has effect - when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set. + RCU grace-period cleanup. rcutree.gp_init_delay= [KNL] Set the number of jiffies to delay each step of - RCU grace-period initialization. This only has - effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT - is set. + RCU grace-period initialization. rcutree.gp_preinit_delay= [KNL] Set the number of jiffies to delay each step of RCU grace-period pre-initialization, that is, the propagation of recent CPU-hotplug changes up - the rcu_node combining tree. This only has effect - when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set. + the rcu_node combining tree. rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining @@ -3339,6 +3344,17 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcuperf.gp_async= [KNL] + Measure performance of asynchronous + grace-period primitives such as call_rcu(). + + rcuperf.gp_async_max= [KNL] + Specify the maximum number of outstanding + callbacks per writer thread. When a writer + thread exceeds this limit, it invokes the + corresponding flavor of rcu_barrier() to allow + previously posted callbacks to drain. + rcuperf.gp_exp= [KNL] Measure performance of expedited synchronous grace-period primitives. @@ -3366,17 +3382,22 @@ rcuperf.perf_runnable= [BOOT] Start rcuperf running at boot time. + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + rcuperf.shutdown= [KNL] Shut the system down after performance tests complete. This is useful for hands-off automated testing. - rcuperf.perf_type= [KNL] - Specify the RCU implementation to test. - rcuperf.verbose= [KNL] Enable additional printk() statements. + rcuperf.writer_holdoff= [KNL] + Write-side holdoff between grace periods, + in microseconds. The default of zero says + no holdoff. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. @@ -3814,6 +3835,15 @@ spia_pedr= spia_peddr= + srcutree.counter_wrap_check [KNL] + Specifies how frequently to check for + grace-period sequence counter wrap for the + srcu_data structure's ->srcu_gp_seq_needed field. + The greater the number of bits set in this kernel + parameter, the less frequently counter wrap will + be checked for. Note that the bottom two bits + are ignored. 
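The rcuperf.gp_async and rcuperf.gp_async_max parameters above exercise the asynchronous grace-period pattern in which callbacks are posted with call_rcu() and drained with rcu_barrier(). Roughly the following shape, shown only as an illustration (struct item and the helper names are assumptions, not kernel code):

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
        struct rcu_head rh;
        int payload;
    };

    static void item_free_cb(struct rcu_head *rh)
    {
        kfree(container_of(rh, struct item, rh));
    }

    /* Post a callback instead of blocking in synchronize_rcu(). */
    static void item_retire(struct item *p)
    {
        call_rcu(&p->rh, item_free_cb);
    }

    /*
     * Wait for all previously posted callbacks to drain, as an rcuperf
     * writer thread does when it exceeds rcuperf.gp_async_max, and as
     * module-unload paths must do before their callback code goes away.
     */
    static void item_drain(void)
    {
        rcu_barrier();
    }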
+ srcutree.exp_holdoff [KNL] Specifies how many nanoseconds must elapse since the end of the last SRCU grace period for diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst index 55e43f1c80de..fce929144ccd 100644 --- a/Documentation/core-api/atomic_ops.rst +++ b/Documentation/core-api/atomic_ops.rst @@ -303,6 +303,11 @@ defined which accomplish this:: void smp_mb__before_atomic(void); void smp_mb__after_atomic(void); +Preceding a non-value-returning read-modify-write atomic operation with +smp_mb__before_atomic() and following it with smp_mb__after_atomic() +provides the same full ordering that is provided by value-returning +read-modify-write atomic operations. + For example, smp_mb__before_atomic() can be used like so:: obj->dead = 1; diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst index ffdcc97f6f5a..78aa00a604a0 100644 --- a/Documentation/dev-tools/sparse.rst +++ b/Documentation/dev-tools/sparse.rst @@ -103,9 +103,3 @@ have already built it. The optional make variable CF can be used to pass arguments to sparse. The build system passes -Wbitwise to sparse automatically. - -Checking RCU annotations -~~~~~~~~~~~~~~~~~~~~~~~~ - -RCU annotations are not checked by default. To enable RCU annotation -checks, include -DCONFIG_SPARSE_RCU_POINTER in your CF flags. diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt index b73ca6cd07f8..195792270414 100644 --- a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt +++ b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt @@ -7,7 +7,11 @@ Required properties: - compatible : Must be one of "faraday,fttmr010" - "cortina,gemini-timer" + "cortina,gemini-timer", "faraday,fttmr010" + "moxa,moxart-timer", "faraday,fttmr010" + "aspeed,ast2400-timer" + "aspeed,ast2500-timer" + - reg : Should contain registers location and length - interrupts : Should contain the three timer interrupts usually with flags for falling edge diff --git a/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt b/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt deleted file mode 100644 index e207c11630af..000000000000 --- a/Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt +++ /dev/null @@ -1,19 +0,0 @@ -MOXA ART timer - -Required properties: - -- compatible : Must be one of: - - "moxa,moxart-timer" - - "aspeed,ast2400-timer" -- reg : Should contain registers location and length -- interrupts : Should contain the timer interrupt number -- clocks : Should contain phandle for the clock that drives the counter - -Example: - - timer: timer@98400000 { - compatible = "moxa,moxart-timer"; - reg = <0x98400000 0x42>; - interrupts = <19 1>; - clocks = <&coreclk>; - }; diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index ffbc8913cc76..5fe5759b8128 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -311,6 +311,8 @@ IRQ devm_irq_alloc_desc_at() devm_irq_alloc_desc_from() devm_irq_alloc_descs_from() + devm_irq_alloc_generic_chip() + devm_irq_setup_generic_chip() LED devm_led_classdev_register() diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index f10dd590f69f..8444dc3d57e8 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -316,7 +316,7 @@ For version 5, the format of the message is: struct autofs_v5_packet { 
int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; @@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing `O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at most one packet, and any unread portion of a packet will be discarded. -The `wait_queue_token` is a unique number which can identify a +The `wait_queue_entry_token` is a unique number which can identify a particular request to be acknowledged. When a message is sent over the pipe the affected dentry is marked as either "active" or "expiring" and other accesses to it block until the message is acknowledged using one of the ioctls below and the relevant -`wait_queue_token`. +`wait_queue_entry_token`. Communicating with autofs: root directory ioctls ------------------------------------------------ @@ -358,7 +358,7 @@ capability, or must be the automount daemon. The available ioctl commands are: - **AUTOFS_IOC_READY**: a notification has been handled. The argument - to the ioctl command is the "wait_queue_token" number + to the ioctl command is the "wait_queue_entry_token" number corresponding to the notification being acknowledged. - **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with the error code `ENOENT`. @@ -382,14 +382,14 @@ The available ioctl commands are: struct autofs_packet_expire_multi { int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; is required. This is filled in with the name of something that can be unmounted or removed. If nothing can be expired, - `errno` is set to `EAGAIN`. Even though a `wait_queue_token` + `errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token` is present in the structure, no "wait queue" is established and no acknowledgment is needed. - **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst index 1a456b60a7cf..daf3883b2694 100644 --- a/Documentation/kernel-hacking/hacking.rst +++ b/Documentation/kernel-hacking/hacking.rst @@ -498,7 +498,7 @@ Wait Queues ``include/linux/wait.h`` A wait queue is used to wait for someone to wake you up when a certain condition is true. They must be used carefully to ensure there is no race condition. You declare a :c:type:`wait_queue_head_t`, and then processes -which want to wait for that condition declare a :c:type:`wait_queue_t` +which want to wait for that condition declare a :c:type:`wait_queue_entry_t` referring to themselves, and place that in the queue. Declaring diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index df31e30b6a02..2cb7dc5c0e0d 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -109,13 +109,12 @@ SCHED_SOFTIRQ: Do all of the following: on that CPU. If a thread that expects to run on the de-jittered CPU awakens, the scheduler will send an IPI that can result in a subsequent SCHED_SOFTIRQ. -2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, - CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU - to be de-jittered is marked as an adaptive-ticks CPU using the - "nohz_full=" boot parameter. 
This reduces the number of - scheduler-clock interrupts that the de-jittered CPU receives, - minimizing its chances of being selected to do the load balancing - work that runs in SCHED_SOFTIRQ context. +2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered + is marked as an adaptive-ticks CPU using the "nohz_full=" + boot parameter. This reduces the number of scheduler-clock + interrupts that the de-jittered CPU receives, minimizing its + chances of being selected to do the load balancing work that + runs in SCHED_SOFTIRQ context. 3. To the extent possible, keep the CPU out of the kernel when it is non-idle, for example, by avoiding system calls and by forcing both kernel threads and interrupts to execute elsewhere. @@ -135,11 +134,10 @@ HRTIMER_SOFTIRQ: Do all of the following: RCU_SOFTIRQ: Do at least one of the following: 1. Offload callbacks and keep the CPU in either dyntick-idle or adaptive-ticks state by doing all of the following: - a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, - CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU - to be de-jittered is marked as an adaptive-ticks CPU using - the "nohz_full=" boot parameter. Bind the rcuo kthreads - to housekeeping CPUs, which can tolerate OS jitter. + a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be + de-jittered is marked as an adaptive-ticks CPU using the + "nohz_full=" boot parameter. Bind the rcuo kthreads to + housekeeping CPUs, which can tolerate OS jitter. b. To the extent possible, keep the CPU out of the kernel when it is non-idle, for example, by avoiding system calls and by forcing both kernel threads and interrupts @@ -236,11 +234,10 @@ To reduce its OS jitter, do at least one of the following: is feasible only if your workload never requires RCU priority boosting, for example, if you ensure frequent idle time on all CPUs that might execute within the kernel. -3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y, - which offloads all RCU callbacks to kthreads that can be moved - off of CPUs susceptible to OS jitter. This approach prevents the - rcuc/%u kthreads from having any work to do, so that they are - never awakened. +3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= + boot parameter offloading RCU callbacks from all CPUs susceptible + to OS jitter. This approach prevents the rcuc/%u kthreads from + having any work to do, so that they are never awakened. 4. Ensure that the CPU never enters the kernel, and, in particular, avoid initiating any CPU hotplug operations on this CPU. This is another way of preventing any callbacks from being queued on the diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 732f10ea382e..9d5e0f853f08 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -27,7 +27,7 @@ The purpose of this document is twofold: (2) to provide a guide as to how to use the barriers that are available. Note that an architecture can provide more than the minimum requirement -for any particular barrier, but if the architecure provides less than +for any particular barrier, but if the architecture provides less than that, that architecture is incorrect. 
Note also that it is possible that a barrier may be a no-op for an diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index cbc1b46cbf70..e89e36ec15a5 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -7,6 +7,8 @@ CONTENTS 0. WARNING 1. Overview 2. Scheduling algorithm + 2.1 Main algorithm + 2.2 Bandwidth reclaiming 3. Scheduling Real-Time Tasks 3.1 Definitions 3.2 Schedulability Analysis for Uniprocessor Systems @@ -44,6 +46,9 @@ CONTENTS 2. Scheduling algorithm ================== +2.1 Main algorithm +------------------ + SCHED_DEADLINE uses three parameters, named "runtime", "period", and "deadline", to schedule tasks. A SCHED_DEADLINE task should receive "runtime" microseconds of execution time every "period" microseconds, and @@ -113,6 +118,160 @@ CONTENTS remaining runtime = remaining runtime + runtime +2.2 Bandwidth reclaiming +------------------------ + + Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy + Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled + when flag SCHED_FLAG_RECLAIM is set. + + The following diagram illustrates the state names for tasks handled by GRUB: + + ------------ + (d) | Active | + ------------->| | + | | Contending | + | ------------ + | A | + ---------- | | + | | | | + | Inactive | |(b) | (a) + | | | | + ---------- | | + A | V + | ------------ + | | Active | + --------------| Non | + (c) | Contending | + ------------ + + A task can be in one of the following states: + + - ActiveContending: if it is ready for execution (or executing); + + - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag + time; + + - Inactive: if it is blocked and has surpassed the 0-lag time. + + State transitions: + + (a) When a task blocks, it does not become immediately inactive since its + bandwidth cannot be immediately reclaimed without breaking the + real-time guarantees. It therefore enters a transitional state called + ActiveNonContending. The scheduler arms the "inactive timer" to fire at + the 0-lag time, when the task's bandwidth can be reclaimed without + breaking the real-time guarantees. + + The 0-lag time for a task entering the ActiveNonContending state is + computed as + + (runtime * dl_period) + deadline - --------------------- + dl_runtime + + where runtime is the remaining runtime, while dl_runtime and dl_period + are the reservation parameters. + + (b) If the task wakes up before the inactive timer fires, the task re-enters + the ActiveContending state and the "inactive timer" is canceled. + In addition, if the task wakes up on a different runqueue, then + the task's utilization must be removed from the previous runqueue's active + utilization and must be added to the new runqueue's active utilization. + In order to avoid races between a task waking up on a runqueue while the + "inactive timer" is running on a different CPU, the "dl_non_contending" + flag is used to indicate that a task is not on a runqueue but is active + (so, the flag is set when the task blocks and is cleared when the + "inactive timer" fires or when the task wakes up). + + (c) When the "inactive timer" fires, the task enters the Inactive state and + its utilization is removed from the runqueue's active utilization. + + (d) When an inactive task wakes up, it enters the ActiveContending state and + its utilization is added to the active utilization of the runqueue where + it has been enqueued. 
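Bandwidth reclaiming is requested per task by setting SCHED_FLAG_RECLAIM in sched_attr.sched_flags when calling sched_setattr(). A minimal userspace sketch follows; the locally defined struct layout, the SCHED_FLAG_RECLAIM fallback value, and the runtime/deadline/period numbers are illustrative assumptions (glibc of this era provides no sched_setattr() wrapper, so the raw syscall is used and SYS_sched_setattr is assumed to be exposed by the libc headers):

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sched.h>        /* SCHED_DEADLINE */

    #ifndef SCHED_FLAG_RECLAIM
    #define SCHED_FLAG_RECLAIM 0x02 /* assumed uapi value; check linux/sched.h in your tree */
    #endif

    struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        /* SCHED_DEADLINE parameters, in nanoseconds */
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
    };

    int main(void)
    {
        struct sched_attr attr = {
            .size           = sizeof(attr),
            .sched_policy   = SCHED_DEADLINE,
            .sched_flags    = SCHED_FLAG_RECLAIM,   /* opt in to GRUB reclaiming */
            .sched_runtime  = 10 * 1000 * 1000,     /* 10 ms */
            .sched_deadline = 30 * 1000 * 1000,     /* 30 ms */
            .sched_period   = 30 * 1000 * 1000,     /* 30 ms */
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
            perror("sched_setattr");
            return 1;
        }

        /* ... periodic real-time work would run here ... */
        return 0;
    }

With the flag clear, the task's runtime is depleted strictly at dq = -dt; with it set, depletion follows the GRUB rule given below, so the task can consume bandwidth left unused by Inactive tasks.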
+ + For each runqueue, the algorithm GRUB keeps track of two different bandwidths: + + - Active bandwidth (running_bw): this is the sum of the bandwidths of all + tasks in active state (i.e., ActiveContending or ActiveNonContending); + + - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the + runqueue, including the tasks in Inactive state. + + + The algorithm reclaims the bandwidth of the tasks in Inactive state. + It does so by decrementing the runtime of the executing task Ti at a pace equal + to + + dq = -max{ Ui, (1 - Uinact) } dt + + where Uinact is the inactive utilization, computed as (this_bq - running_bw), + and Ui is the bandwidth of task Ti. + + + Let's now see a trivial example of two deadline tasks with runtime equal + to 4 and period equal to 8 (i.e., bandwidth equal to 0.5): + + A Task T1 + | + | | + | | + |-------- |---- + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A Task T2 + | + | | + | | + | ------------------------| + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A running_bw + | + 1 ----------------- ------ + | | | + 0.5- ----------------- + | | + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + - Time t = 0: + + Both tasks are ready for execution and therefore in ActiveContending state. + Suppose Task T1 is the first task to start execution. + Since there are no inactive tasks, its runtime is decreased as dq = -1 dt. + + - Time t = 2: + + Suppose that task T1 blocks + Task T1 therefore enters the ActiveNonContending state. Since its remaining + runtime is equal to 2, its 0-lag time is equal to t = 4. + Task T2 start execution, with runtime still decreased as dq = -1 dt since + there are no inactive tasks. + + - Time t = 4: + + This is the 0-lag time for Task T1. Since it didn't woken up in the + meantime, it enters the Inactive state. Its bandwidth is removed from + running_bw. + Task T2 continues its execution. However, its runtime is now decreased as + dq = - 0.5 dt because Uinact = 0.5. + Task T2 therefore reclaims the bandwidth unused by Task T1. + + - Time t = 8: + + Task T1 wakes up. It enters the ActiveContending state again, and the + running_bw is incremented. + + 3. Scheduling Real-Time Tasks ============================= @@ -330,6 +489,15 @@ CONTENTS 14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for Global EDF. Proceedings of the 22nd Euromicro Conference on Real-Time Systems, 2010. + 15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in + constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time + Systems, 2000. + 16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for + SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS), + Dusseldorf, Germany, 2014. + 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel + or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied + Computing, 2016. 4. Bandwidth management diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt index 6eaf576294f3..2dcaf9adb7a7 100644 --- a/Documentation/timers/NO_HZ.txt +++ b/Documentation/timers/NO_HZ.txt @@ -194,32 +194,9 @@ that the RCU callbacks are processed in a timely fashion. Another approach is to offload RCU callback processing to "rcuo" kthreads using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to -offload may be selected via several methods: - -1. 
One of three mutually exclusive Kconfig options specify a - build-time default for the CPUs to offload: - - a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in - no CPUs being offloaded. - - b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes - CPU 0 to be offloaded. - - c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all - CPUs to be offloaded. Note that the callbacks will be - offloaded to "rcuo" kthreads, and that those kthreads - will in fact run on some CPU. However, this approach - gives fine-grained control on exactly which CPUs the - callbacks run on, along with their scheduling priority - (including the default of SCHED_OTHER), and it further - allows this control to be varied dynamically at runtime. - -2. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated - list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1, - 3, 4, and 5. The specified CPUs will be offloaded in addition to - any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or - CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot - parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y. +offload may be selected using The "rcu_nocbs=" kernel boot parameter, +which takes a comma-separated list of CPUs and CPU ranges, for example, +"1,3-5" selects CPUs 1, 3, 4, and 5. The offloaded CPUs will never queue RCU callbacks, and therefore RCU never prevents offloaded CPUs from entering either dyntick-idle mode diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 94a987bd2bc5..fff8ff6d4893 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set. <idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update <idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz <idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock - <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit + <idle>-0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 61b611e9eeaf..b297c48389b9 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -36,7 +36,8 @@ Machine check to broadcast MCEs. mce=bootlog Enable logging of machine checks left over from booting. - Disabled by default on AMD because some BIOS leave bogus ones. + Disabled by default on AMD Fam10h and older because some BIOS + leave bogus ones. If your BIOS doesn't do that it's a good idea to enable though to make sure you log even machine check events that result in a reboot. On Intel systems it is enabled by default. 
diff --git a/MAINTAINERS b/MAINTAINERS index 1436b17320ad..316232203df4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2347,6 +2347,15 @@ F: Documentation/devicetree/bindings/input/atmel,maxtouch.txt F: drivers/input/touchscreen/atmel_mxt_ts.c F: include/linux/platform_data/atmel_mxt_ts.h +ATOMIC INFRASTRUCTURE +M: Will Deacon <will.deacon@arm.com> +M: Peter Zijlstra <peterz@infradead.org> +R: Boqun Feng <boqun.feng@gmail.com> +L: linux-kernel@vger.kernel.org +S: Maintained +F: arch/*/include/asm/atomic*.h +F: include/*/atomic*.h + ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER M: Bradley Grove <linuxdrivers@attotech.com> L: linux-scsi@vger.kernel.org @@ -7654,7 +7663,7 @@ S: Maintained F: drivers/ata/sata_promise.* LIBLOCKDEP -M: Sasha Levin <sasha.levin@oracle.com> +M: Sasha Levin <alexander.levin@verizon.com> S: Maintained F: tools/lib/lockdep/ diff --git a/arch/Kconfig b/arch/Kconfig index 3abe581878d6..0e01f3a34cb4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -867,4 +867,13 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked + implementation, which can be (slightly) slower but provides protections + against various use-after-free conditions that can be used in + security flaw exploits. + source "kernel/gcov/Kconfig" diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 7bee4e4799fd..3e74ca5e6402 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -36,7 +36,6 @@ generic-y += preempt.h generic-y += resource.h generic-y += sembuf.h generic-y += shmbuf.h -generic-y += siginfo.h generic-y += socket.h generic-y += sockios.h generic-y += stat.h diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/arc/include/uapi/asm/Kbuild +++ b/arch/arc/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index de29ea9b6408..c4ffb441716c 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -470,7 +470,7 @@ void __init setup_arch(char **cmdline_p) void __init time_init(void) { of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } static int __init customize_machine(void) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4f4738934b12..6fb8f39c98d5 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -338,7 +338,7 @@ config ARCH_MULTIPLATFORM select ARM_HAS_SG_CHAIN select ARM_PATCH_PHYS_VIRT select AUTO_ZRELADDR - select CLKSRC_OF + select TIMER_OF select COMMON_CLK select GENERIC_CLOCKEVENTS select MIGHT_HAVE_PCI @@ -352,7 +352,7 @@ config ARM_SINGLE_ARMV7M depends on !MMU select ARM_NVIC select AUTO_ZRELADDR - select CLKSRC_OF + select TIMER_OF select COMMON_CLK select CPU_V7M select GENERIC_CLOCKEVENTS @@ -533,7 +533,7 @@ config ARCH_PXA select CLKDEV_LOOKUP select CLKSRC_PXA select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF select CPU_XSCALE if !CPU_XSC3 select GENERIC_CLOCKEVENTS select GPIO_PXA @@ -572,7 +572,7 @@ config ARCH_SA1100 select CLKDEV_LOOKUP select CLKSRC_MMIO select CLKSRC_PXA - select CLKSRC_OF if OF + select TIMER_OF if OF select CPU_FREQ select CPU_SA1100 select GENERIC_CLOCKEVENTS @@ -1360,7 +1360,7 @@ config HAVE_ARM_ARCH_TIMER config 
HAVE_ARM_TWD bool - select CLKSRC_OF if OF + select TIMER_OF if OF help This options enables support for the ARM timer and watchdog unit @@ -1642,7 +1642,7 @@ config ARCH_SELECT_MEMORY_MODEL config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y depends on ARM_LPAE @@ -2066,6 +2066,23 @@ config EFI is only useful for kernels that may run on systems that have UEFI firmware. +config DMI + bool "Enable support for SMBIOS (DMI) tables" + depends on EFI + default y + help + This enables SMBIOS/DMI feature for systems. + + This option is only useful on systems that have UEFI firmware. + However, even with this option, the resultant kernel should + continue to boot on existing non-UEFI platforms. + + NOTE: This does *NOT* enable or encourage the use of DMI quirks, + i.e., the the practice of identifying the platform via DMI to + decide whether certain workarounds for buggy hardware and/or + firmware need to be enabled. This would require the DMI subsystem + to be enabled much earlier than we do on ARM, which is non-trivial. + endmenu menu "CPU Power Management" diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 3a36d99ff836..d8360501c082 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -28,7 +28,6 @@ generic-y += segment.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h -generic-y += siginfo.h generic-y += simd.h generic-y += sizes.h generic-y += socket.h diff --git a/arch/arm/include/asm/dmi.h b/arch/arm/include/asm/dmi.h new file mode 100644 index 000000000000..df2d2ff06f5b --- /dev/null +++ b/arch/arm/include/asm/dmi.h @@ -0,0 +1,19 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_DMI_H +#define __ASM_DMI_H + +#include <linux/io.h> +#include <linux/slab.h> + +#define dmi_early_remap(x, l) memremap(x, l, MEMREMAP_WB) +#define dmi_early_unmap(x, l) memunmap(x) +#define dmi_remap(x, l) memremap(x, l, MEMREMAP_WB) +#define dmi_unmap(x) memunmap(x) +#define dmi_alloc(l) kzalloc(l, GFP_KERNEL) + +#endif diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index 424935e4515d..5fb3368e70cb 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -4,3 +4,5 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd-common.h generated-y += unistd-oabi.h generated-y += unistd-eabi.h + +generic-y += siginfo.h diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index be3b3fbd382f..af2a7f1e3103 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1090,7 +1090,7 @@ static int __init arch_hw_breakpoint_init(void) * driven low on this core and there isn't an architected way to * determine that. */ - get_online_cpus(); + cpus_read_lock(); register_undef_hook(&debug_reg_hook); /* @@ -1098,15 +1098,16 @@ static int __init arch_hw_breakpoint_init(void) * assume that a halting debugger will leave the world in a nice state * for us. 
*/ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm/hw_breakpoint:online", - dbg_reset_online, NULL); + ret = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/hw_breakpoint:online", + dbg_reset_online, NULL); unregister_undef_hook(&debug_reg_hook); if (WARN_ON(ret < 0) || !cpumask_empty(&debug_err_mask)) { core_num_brps = 0; core_num_wrps = 0; if (ret > 0) - cpuhp_remove_state_nocalls(ret); - put_online_cpus(); + cpuhp_remove_state_nocalls_cpuslocked(ret); + cpus_read_unlock(); return 0; } @@ -1124,7 +1125,7 @@ static int __init arch_hw_breakpoint_init(void) TRAP_HWBKPT, "watchpoint debug exception"); hook_ifault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP, TRAP_HWBKPT, "breakpoint debug exception"); - put_online_cpus(); + cpus_read_unlock(); /* Register PM notifiers. */ pm_init(); diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index 020560b2dcb7..a1a34722c655 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -124,5 +124,5 @@ void __kprobes patch_text(void *addr, unsigned int insn) .insn = insn, }; - stop_machine(patch_text_stop_machine, &patch, NULL); + stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 572a8df1b766..c9a0a5299827 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock); */ static void ipi_cpu_stop(unsigned int cpu) { - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { raw_spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index 895ae5197159..b30eafeef096 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -403,7 +403,7 @@ out: WARN(err, "twd_local_timer_of_register failed (%d)\n", err); return err; } -CLOCKSOURCE_OF_DECLARE(arm_twd_a9, "arm,cortex-a9-twd-timer", twd_local_timer_of_register); -CLOCKSOURCE_OF_DECLARE(arm_twd_a5, "arm,cortex-a5-twd-timer", twd_local_timer_of_register); -CLOCKSOURCE_OF_DECLARE(arm_twd_11mp, "arm,arm11mp-twd-timer", twd_local_timer_of_register); +TIMER_OF_DECLARE(arm_twd_a9, "arm,cortex-a9-twd-timer", twd_local_timer_of_register); +TIMER_OF_DECLARE(arm_twd_a5, "arm,cortex-a5-twd-timer", twd_local_timer_of_register); +TIMER_OF_DECLARE(arm_twd_11mp, "arm,arm11mp-twd-timer", twd_local_timer_of_register); #endif diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 97b22fa7cb3a..629f8e9981f1 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -120,6 +120,6 @@ void __init time_init(void) #ifdef CONFIG_COMMON_CLK of_clk_init(NULL); #endif - clocksource_probe(); + timer_probe(); } } diff --git a/arch/arm/mach-aspeed/Kconfig b/arch/arm/mach-aspeed/Kconfig index f3f8c5c658db..2d5570e6e186 100644 --- a/arch/arm/mach-aspeed/Kconfig +++ b/arch/arm/mach-aspeed/Kconfig @@ -4,7 +4,7 @@ menuconfig ARCH_ASPEED select SRAM select WATCHDOG select ASPEED_WATCHDOG - select MOXART_TIMER + select FTTMR010_TIMER select MFD_SYSCON select PINCTRL help diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig index abfce8f29ca1..73be3d578851 100644 --- a/arch/arm/mach-bcm/Kconfig +++ b/arch/arm/mach-bcm/Kconfig @@ -152,7 +152,7 @@ config ARCH_BCM2835 select ARM_ERRATA_411920 if ARCH_MULTI_V6 select ARM_TIMER_SP804 select HAVE_ARM_ARCH_TIMER if ARCH_MULTI_V7 - select CLKSRC_OF + select TIMER_OF select BCM2835_TIMER select PINCTRL select PINCTRL_BCM2835 diff --git 
a/arch/arm/mach-clps711x/Kconfig b/arch/arm/mach-clps711x/Kconfig index 61284b9389cf..f385b1fcafef 100644 --- a/arch/arm/mach-clps711x/Kconfig +++ b/arch/arm/mach-clps711x/Kconfig @@ -2,7 +2,7 @@ menuconfig ARCH_CLPS711X bool "Cirrus Logic EP721x/EP731x-based" depends on ARCH_MULTI_V4T select AUTO_ZRELADDR - select CLKSRC_OF + select TIMER_OF select CLPS711X_TIMER select COMMON_CLK select CPU_ARM720T diff --git a/arch/arm/mach-mediatek/mediatek.c b/arch/arm/mach-mediatek/mediatek.c index a6e3c98b95ed..c3cf215773b2 100644 --- a/arch/arm/mach-mediatek/mediatek.c +++ b/arch/arm/mach-mediatek/mediatek.c @@ -41,7 +41,7 @@ static void __init mediatek_timer_init(void) } of_clk_init(NULL); - clocksource_probe(); + timer_probe(); }; static const char * const mediatek_board_dt_compat[] = { diff --git a/arch/arm/mach-moxart/Kconfig b/arch/arm/mach-moxart/Kconfig index 70db2abf6163..a4a91f9a3301 100644 --- a/arch/arm/mach-moxart/Kconfig +++ b/arch/arm/mach-moxart/Kconfig @@ -4,7 +4,7 @@ menuconfig ARCH_MOXART select CPU_FA526 select ARM_DMA_MEM_BUFFERABLE select FARADAY_FTINTC010 - select MOXART_TIMER + select FTTMR010_TIMER select GPIOLIB select PHYLIB if NETDEVICES help diff --git a/arch/arm/mach-omap2/timer.c b/arch/arm/mach-omap2/timer.c index 6a861c0ac343..ece09c9461f7 100644 --- a/arch/arm/mach-omap2/timer.c +++ b/arch/arm/mach-omap2/timer.c @@ -487,7 +487,7 @@ void __init omap_init_time(void) __omap_sync32k_timer_init(1, "timer_32k_ck", "ti,timer-alwon", 2, "timer_sys_ck", NULL, false); - clocksource_probe(); + timer_probe(); } #if defined(CONFIG_ARCH_OMAP3) || defined(CONFIG_SOC_AM43XX) @@ -496,7 +496,7 @@ void __init omap3_secure_sync32k_timer_init(void) __omap_sync32k_timer_init(12, "secure_32k_fck", "ti,timer-secure", 2, "timer_sys_ck", NULL, false); - clocksource_probe(); + timer_probe(); } #endif /* CONFIG_ARCH_OMAP3 */ @@ -507,7 +507,7 @@ void __init omap3_gptimer_timer_init(void) __omap_sync32k_timer_init(2, "timer_sys_ck", NULL, 1, "timer_sys_ck", "ti,timer-alwon", true); if (of_have_populated_dt()) - clocksource_probe(); + timer_probe(); } #endif @@ -522,7 +522,7 @@ static void __init omap4_sync32k_timer_init(void) void __init omap4_local_timer_init(void) { omap4_sync32k_timer_init(); - clocksource_probe(); + timer_probe(); } #endif @@ -646,7 +646,7 @@ void __init omap5_realtime_timer_init(void) omap4_sync32k_timer_init(); realtime_counter_init(); - clocksource_probe(); + timer_probe(); } #endif /* CONFIG_SOC_OMAP5 || CONFIG_SOC_DRA7XX */ diff --git a/arch/arm/mach-rockchip/rockchip.c b/arch/arm/mach-rockchip/rockchip.c index 927cf563ea47..e41cabc4dc2b 100644 --- a/arch/arm/mach-rockchip/rockchip.c +++ b/arch/arm/mach-rockchip/rockchip.c @@ -55,7 +55,7 @@ static void __init rockchip_timer_init(void) } of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } static void __init rockchip_dt_init(void) diff --git a/arch/arm/mach-s3c24xx/Kconfig b/arch/arm/mach-s3c24xx/Kconfig index 4b1690acb6a5..f07da82ebfea 100644 --- a/arch/arm/mach-s3c24xx/Kconfig +++ b/arch/arm/mach-s3c24xx/Kconfig @@ -394,7 +394,7 @@ config MACH_SMDK2416 config MACH_S3C2416_DT bool "Samsung S3C2416 machine using devicetree" - select CLKSRC_OF + select TIMER_OF select USE_OF select PINCTRL select PINCTRL_S3C24XX diff --git a/arch/arm/mach-s3c64xx/Kconfig b/arch/arm/mach-s3c64xx/Kconfig index 5ee5ad74a3d6..afd1f20be49e 100644 --- a/arch/arm/mach-s3c64xx/Kconfig +++ b/arch/arm/mach-s3c64xx/Kconfig @@ -335,7 +335,7 @@ config MACH_WLF_CRAGG_6410 config MACH_S3C64XX_DT bool "Samsung S3C6400/S3C6410 machine using Device 
Tree" - select CLKSRC_OF + select TIMER_OF select CPU_S3C6400 select CPU_S3C6410 select PINCTRL diff --git a/arch/arm/mach-shmobile/setup-rcar-gen2.c b/arch/arm/mach-shmobile/setup-rcar-gen2.c index 52d466b75973..a6e74f481dea 100644 --- a/arch/arm/mach-shmobile/setup-rcar-gen2.c +++ b/arch/arm/mach-shmobile/setup-rcar-gen2.c @@ -113,7 +113,7 @@ void __init rcar_gen2_timer_init(void) #endif /* CONFIG_ARM_ARCH_TIMER */ of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } struct memory_reserve_config { diff --git a/arch/arm/mach-spear/spear13xx.c b/arch/arm/mach-spear/spear13xx.c index ca2f6a82a414..31c43cabf362 100644 --- a/arch/arm/mach-spear/spear13xx.c +++ b/arch/arm/mach-spear/spear13xx.c @@ -124,5 +124,5 @@ void __init spear13xx_timer_init(void) clk_put(pclk); spear_setup_of_timer(); - clocksource_probe(); + timer_probe(); } diff --git a/arch/arm/mach-sunxi/sunxi.c b/arch/arm/mach-sunxi/sunxi.c index f44e3acb5c90..7ab353fb25f2 100644 --- a/arch/arm/mach-sunxi/sunxi.c +++ b/arch/arm/mach-sunxi/sunxi.c @@ -42,7 +42,7 @@ static void __init sun6i_timer_init(void) of_clk_init(NULL); if (IS_ENABLED(CONFIG_RESET_CONTROLLER)) sun6i_reset_init(); - clocksource_probe(); + timer_probe(); } DT_MACHINE_START(SUN6I_DT, "Allwinner sun6i (A31) Family") diff --git a/arch/arm/mach-u300/core.c b/arch/arm/mach-u300/core.c index a4910ea6811a..048f15e8c669 100644 --- a/arch/arm/mach-u300/core.c +++ b/arch/arm/mach-u300/core.c @@ -407,7 +407,7 @@ static const char * u300_board_compat[] = { DT_MACHINE_START(U300_DT, "U300 S335/B335 (Device Tree)") .map_io = u300_map_io, .init_irq = u300_init_irq_dt, - .init_time = clocksource_probe, + .init_time = timer_probe, .init_machine = u300_init_machine_dt, .restart = u300_restart, .dt_compat = u300_board_compat, diff --git a/arch/arm/mach-zynq/common.c b/arch/arm/mach-zynq/common.c index ed118648313f..6aba9ebf8041 100644 --- a/arch/arm/mach-zynq/common.c +++ b/arch/arm/mach-zynq/common.c @@ -150,7 +150,7 @@ static void __init zynq_timer_init(void) { zynq_clock_init(); of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } static struct map_desc zynq_cortex_a9_scu_map __initdata = { diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index ad1f4e6a9e33..52d1cd14fda4 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -182,7 +182,8 @@ void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn) .addr = addr, .insn = insn, }; - stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask); + stop_machine_cpuslocked(__kprobes_remove_breakpoint, &p, + cpu_online_mask); } void __kprobes arch_disarm_kprobe(struct kprobe *p) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d2636b252699..3cfb508b37f0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -207,7 +207,7 @@ config GENERIC_CALIBRATE_DELAY config ZONE_DMA def_bool y -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y config ARCH_DMA_ADDR_T_64BIT diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 319956fa3908..f5f0c813dfec 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -24,7 +24,7 @@ config ARCH_ALPINE config ARCH_BCM2835 bool "Broadcom BCM2835 family" - select CLKSRC_OF + select TIMER_OF select GPIOLIB select PINCTRL select PINCTRL_BCM2835 @@ -194,7 +194,7 @@ config ARCH_TEGRA select ARCH_HAS_RESET_CONTROLLER select CLKDEV_LOOKUP select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF select GENERIC_CLOCKEVENTS select GPIOLIB select PINCTRL diff 
--git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 29cb2ca756f6..4214c38d016b 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -433,7 +433,6 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset); bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); s32 aarch64_insn_adrp_get_offset(u32 insn); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 5d17f377d905..82cd07592519 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,7 +11,6 @@ * */ -#include <linux/dmi.h> #include <linux/efi.h> #include <linux/init.h> @@ -117,20 +116,6 @@ int __init efi_set_mapping_permissions(struct mm_struct *mm, set_permissions, md); } -static int __init arm64_dmi_init(void) -{ - /* - * On arm64, DMI depends on UEFI, and dmi_scan_machine() needs to - * be called early because dmi_id_init(), which is an arch_initcall - * itself, depends on dmi_scan_machine() having been called already. - */ - dmi_scan_machine(); - if (dmi_available) - dmi_set_dump_stack_arch_desc(); - return 0; -} -core_initcall(arm64_dmi_init); - /* * UpdateCapsule() depends on the system being shutdown via * ResetSystem(). diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index b884a926a632..cd872133e88e 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -255,6 +255,7 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg) return ret; } +static int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) { struct aarch64_insn_patch patch = { @@ -267,8 +268,8 @@ int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) if (cnt <= 0) return -EINVAL; - return stop_machine(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); } int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6e0e16a3a7d4..321119881abf 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -961,8 +961,7 @@ void smp_send_stop(void) cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) + if (system_state <= SYSTEM_RUNNING) pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_STOP); } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 59779699a1a4..da33c90248e9 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -70,7 +70,7 @@ void __init time_init(void) u32 arch_timer_rate; of_clk_init(NULL); - clocksource_probe(); + timer_probe(); tick_setup_hrtimer_broadcast(); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index ef3bdfd6be18..e8f759f764f2 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -220,10 +220,8 @@ void update_vsyscall(struct timekeeper *tk) if (!use_syscall) { /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_time.tv_sec; - vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec << - tk->tkr_raw.shift) + - tk->tkr_raw.xtime_nsec; + vdso_data->raw_time_sec = tk->raw_sec; + vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; 
vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; vdso_data->cs_mono_mult = tk->tkr_mono.mult; diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index f0eaf0475e7e..a3c8d05c4cc7 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -45,7 +45,6 @@ generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += signal.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild index 13a97aa2285f..1c44d3b3eba0 100644 --- a/arch/c6x/include/uapi/asm/Kbuild +++ b/arch/c6x/include/uapi/asm/Kbuild @@ -2,3 +2,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h +generic-y += siginfo.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 2890099992a9..acc5781100c2 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -36,7 +36,6 @@ generic-y += resource.h generic-y += sections.h generic-y += sembuf.h generic-y += shmbuf.h -generic-y += siginfo.h generic-y += socket.h generic-y += sockios.h generic-y += statfs.h diff --git a/arch/cris/include/uapi/asm/Kbuild b/arch/cris/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/cris/include/uapi/asm/Kbuild +++ b/arch/cris/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 3ae852507e57..6e3d36f37a02 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -15,7 +15,7 @@ config H8300 select OF_IRQ select OF_EARLY_FLATTREE select HAVE_MEMBLOCK - select CLKSRC_OF + select TIMER_OF select H8300_TMR8 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 757cdeb24e6e..99c824608a31 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -54,7 +54,6 @@ generic-y += serial.h generic-y += setup.h generic-y += shmbuf.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += sizes.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/h8300/include/uapi/asm/Kbuild +++ b/arch/h8300/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index c8c25a4e9e48..6be15d634650 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -246,5 +246,5 @@ void __init calibrate_delay(void) void __init time_init(void) { of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 6b45ef79eb8f..0fc9cb04e6ad 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -41,7 +41,6 @@ generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += sizes.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/hexagon/include/uapi/asm/Kbuild +++ b/arch/hexagon/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include 
include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h deleted file mode 100644 index 6f2e2dd0f28f..000000000000 --- a/arch/ia64/include/asm/siginfo.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Based on <asm-i386/siginfo.h>. - * - * Modified 1998-2002 - * David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co - */ -#ifndef _ASM_IA64_SIGINFO_H -#define _ASM_IA64_SIGINFO_H - -#include <linux/string.h> -#include <uapi/asm/siginfo.h> - -static inline void -copy_siginfo (siginfo_t *to, siginfo_t *from) -{ - if (from->si_code < 0) - memcpy(to, from, sizeof(siginfo_t)); - else - /* _sigchld is currently the largest know union member */ - memcpy(to, from, 4*sizeof(int) + sizeof(from->_sifields._sigchld)); -} - -#endif /* _ASM_IA64_SIGINFO_H */ diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h index f72bf0172bb2..4694c64252d6 100644 --- a/arch/ia64/include/uapi/asm/siginfo.h +++ b/arch/ia64/include/uapi/asm/siginfo.h @@ -11,7 +11,6 @@ #define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) #define HAVE_ARCH_SIGINFO_T -#define HAVE_ARCH_COPY_SIGINFO #define HAVE_ARCH_COPY_SIGINFO_TO_USER #include <asm-generic/siginfo.h> diff --git a/arch/m32r/include/uapi/asm/Kbuild b/arch/m32r/include/uapi/asm/Kbuild index b15bf6bc0e94..c94ee54210bc 100644 --- a/arch/m32r/include/uapi/asm/Kbuild +++ b/arch/m32r/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/m32r/include/uapi/asm/siginfo.h b/arch/m32r/include/uapi/asm/siginfo.h deleted file mode 100644 index 7d9cd9ebfd0e..000000000000 --- a/arch/m32r/include/uapi/asm/siginfo.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _M32R_SIGINFO_H -#define _M32R_SIGINFO_H - -#include <asm-generic/siginfo.h> - -#endif /* _M32R_SIGINFO_H */ diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 82005d2ff717..5ecf4e47b2e2 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -25,7 +25,6 @@ generic-y += preempt.h generic-y += resource.h generic-y += sections.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += spinlock.h generic-y += statfs.h generic-y += termios.h diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild index 64368077235a..68b45cc87e2c 100644 --- a/arch/m68k/include/uapi/asm/Kbuild +++ b/arch/m68k/include/uapi/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += auxvec.h generic-y += msgbuf.h generic-y += sembuf.h generic-y += shmbuf.h +generic-y += siginfo.h generic-y += socket.h generic-y += sockios.h generic-y += termbits.h diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index 232a12bf3f99..2dbbb7c66043 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -567,8 +567,7 @@ static void stop_this_cpu(void *data) { unsigned int cpu = smp_processor_id(); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 85885a501dce..8e47121b8b8b 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -4,7 +4,7 @@ config MICROBLAZE select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT - select CLKSRC_OF + select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK select GENERIC_ATOMIC64 diff --git 
a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index 2178c78c7c1a..cb6784f4a427 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -2,3 +2,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += types.h +generic-y += siginfo.h diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index f31ebb5dc26c..be98ffe28ca8 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -192,7 +192,7 @@ void __init time_init(void) { of_clk_init(NULL); setup_cpuinfo_clk(); - clocksource_probe(); + timer_probe(); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index 545ccd46edb3..ea2d83f1f4bb 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -335,5 +335,5 @@ static int __init xilinx_timer_init(struct device_node *timer) return 0; } -CLOCKSOURCE_OF_DECLARE(xilinx_timer, "xlnx,xps-timer-1.00.a", +TIMER_OF_DECLARE(xilinx_timer, "xlnx,xps-timer-1.00.a", xilinx_timer_init); diff --git a/arch/mips/generic/init.c b/arch/mips/generic/init.c index 4a9a1edbfb29..3f32b376d30e 100644 --- a/arch/mips/generic/init.c +++ b/arch/mips/generic/init.c @@ -188,7 +188,7 @@ void __init plat_time_init(void) } } - clocksource_probe(); + timer_probe(); } void __init arch_init_irq(void) diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index 3e586daa3a32..32e3168316cd 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e, insn.word = 0; /* nop */ } - get_online_cpus(); mutex_lock(&text_mutex); if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) { insn_p->halfword[0] = insn.word >> 16; @@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e, (unsigned long)insn_p + sizeof(*insn_p)); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif /* HAVE_JUMP_LABEL */ diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 289edcfadd7c..cea4ec909806 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -265,7 +265,7 @@ void __init plat_time_init(void) (freq%1000000)*100/1000000); #ifdef CONFIG_CLKSRC_MIPS_GIC update_gic_frequency_dt(); - clocksource_probe(); + timer_probe(); #endif } #endif diff --git a/arch/mips/pic32/pic32mzda/time.c b/arch/mips/pic32/pic32mzda/time.c index 62a0a78b6c64..1894e50939b5 100644 --- a/arch/mips/pic32/pic32mzda/time.c +++ b/arch/mips/pic32/pic32mzda/time.c @@ -64,5 +64,5 @@ void __init plat_time_init(void) pr_info("CPU Clock: %ldMHz\n", rate / 1000000); mips_hpt_frequency = rate / 2; - clocksource_probe(); + timer_probe(); } diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c index 1022201b2beb..17a0f1dec05b 100644 --- a/arch/mips/pistachio/time.c +++ b/arch/mips/pistachio/time.c @@ -39,7 +39,7 @@ void __init plat_time_init(void) struct clk *clk; of_clk_init(NULL); - clocksource_probe(); + timer_probe(); np = of_get_cpu_node(0, NULL); if (!np) { diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig index 9825dee10bc1..710b04cf4851 100644 --- a/arch/mips/ralink/Kconfig +++ b/arch/mips/ralink/Kconfig @@ -4,7 +4,7 @@ config CLKEVT_RT3352 bool depends on SOC_RT305X || SOC_MT7620 default y - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO config RALINK_ILL_ACC diff --git a/arch/mips/ralink/cevt-rt3352.c b/arch/mips/ralink/cevt-rt3352.c index b8a1376165b0..92f284d2b802 100644 
--- a/arch/mips/ralink/cevt-rt3352.c +++ b/arch/mips/ralink/cevt-rt3352.c @@ -152,4 +152,4 @@ static int __init ralink_systick_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(systick, "ralink,cevt-systick", ralink_systick_init); +TIMER_OF_DECLARE(systick, "ralink,cevt-systick", ralink_systick_init); diff --git a/arch/mips/ralink/clk.c b/arch/mips/ralink/clk.c index df795885eace..eb1c61917eb7 100644 --- a/arch/mips/ralink/clk.c +++ b/arch/mips/ralink/clk.c @@ -82,5 +82,5 @@ void __init plat_time_init(void) pr_info("CPU Clock: %ldMHz\n", clk_get_rate(clk) / 1000000); mips_hpt_frequency = clk_get_rate(clk) / 2; clk_put(clk); - clocksource_probe(); + timer_probe(); } diff --git a/arch/mips/ralink/timer-gic.c b/arch/mips/ralink/timer-gic.c index 069771dbec42..b5f07d21fcf2 100644 --- a/arch/mips/ralink/timer-gic.c +++ b/arch/mips/ralink/timer-gic.c @@ -20,5 +20,5 @@ void __init plat_time_init(void) ralink_of_remap(); of_clk_init(NULL); - clocksource_probe(); + timer_probe(); } diff --git a/arch/mips/xilfpga/time.c b/arch/mips/xilfpga/time.c index cbb3fca7b6fa..36f3f1870ee2 100644 --- a/arch/mips/xilfpga/time.c +++ b/arch/mips/xilfpga/time.c @@ -22,7 +22,7 @@ void __init plat_time_init(void) struct clk *clk; of_clk_init(NULL); - clocksource_probe(); + timer_probe(); np = of_get_cpu_node(0, NULL); if (!np) { diff --git a/arch/mn10300/include/uapi/asm/Kbuild b/arch/mn10300/include/uapi/asm/Kbuild index b15bf6bc0e94..c94ee54210bc 100644 --- a/arch/mn10300/include/uapi/asm/Kbuild +++ b/arch/mn10300/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/mn10300/include/uapi/asm/siginfo.h b/arch/mn10300/include/uapi/asm/siginfo.h deleted file mode 100644 index 0815d29d82e5..000000000000 --- a/arch/mn10300/include/uapi/asm/siginfo.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/siginfo.h> diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index a72d5f0de692..c587764b9c5a 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -1,6 +1,6 @@ config NIOS2 def_bool y - select CLKSRC_OF + select TIMER_OF select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 727dbb333f60..e1a843def56f 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -47,7 +47,6 @@ generic-y += segment.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h -generic-y += siginfo.h generic-y += signal.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild index 374bd123329f..51eff5bc2eb4 100644 --- a/arch/nios2/include/uapi/asm/Kbuild +++ b/arch/nios2/include/uapi/asm/Kbuild @@ -2,4 +2,5 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += setup.h +generic-y += siginfo.h generic-y += ucontext.h diff --git a/arch/nios2/kernel/time.c b/arch/nios2/kernel/time.c index 6e2bdc9b8530..645129aaa9a0 100644 --- a/arch/nios2/kernel/time.c +++ b/arch/nios2/kernel/time.c @@ -350,7 +350,7 @@ void __init time_init(void) if (count < 2) panic("%d timer is found, it needs 2 timers in system\n", count); - clocksource_probe(); + timer_probe(); } -CLOCKSOURCE_OF_DECLARE(nios2_timer, ALTR_TIMER_COMPATIBLE, nios2_time_init); +TIMER_OF_DECLARE(nios2_timer, ALTR_TIMER_COMPATIBLE, nios2_time_init); diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index fdbcf0bf44a4..091585addb91 100644 
--- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -46,7 +46,6 @@ generic-y += sembuf.h generic-y += setup.h generic-y += shmbuf.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += signal.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/openrisc/include/uapi/asm/Kbuild +++ b/arch/openrisc/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f4e0755e4ae5..d0ee448700de 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -176,7 +176,7 @@ config PPC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_RCU_GUP + select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..1069f74fca47 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d1a365b8edc..773b35d16a0b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3368,7 +3368,7 @@ void kvmppc_alloc_host_rm_ops(void) return; } - get_online_cpus(); + cpus_read_lock(); for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) @@ -3390,17 +3390,17 @@ void kvmppc_alloc_host_rm_ops(void) l_ops = (unsigned long) ops; if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { - put_online_cpus(); + cpus_read_unlock(); kfree(ops->rm_core); kfree(ops); return; } - cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE, - "ppc/kvm_book3s:prepare", - kvmppc_set_host_core, - kvmppc_clear_host_core); - put_online_cpus(); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE, + "ppc/kvm_book3s:prepare", + kvmppc_set_host_core, + kvmppc_clear_host_core); + cpus_read_unlock(); } void kvmppc_free_host_rm_ops(void) diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 3a4f85c0188f..596ae2e98040 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -349,7 +349,7 @@ static int set_subcores_per_core(int new_mode) state->master = 0; } - get_online_cpus(); + cpus_read_lock(); /* This cpu will update the globals before exiting stop machine */ this_cpu_ptr(&split_state)->master = 1; @@ -357,9 +357,10 @@ static int set_subcores_per_core(int new_mode) /* Ensure state is consistent before we call the other cpus */ mb(); - stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + stop_machine_cpuslocked(cpu_update_split_mode, &new_mode, + cpu_online_mask); - put_online_cpus(); + cpus_read_unlock(); return 0; } diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 6aa630a8d24f..262506cee4c3 100644 --- a/arch/s390/kernel/jump_label.c +++ 
b/arch/s390/kernel/jump_label.c @@ -93,7 +93,7 @@ void arch_jump_label_transform(struct jump_entry *entry, args.entry = entry; args.type = type; - stop_machine(__sm_arch_jump_label_transform, &args, NULL); + stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); } void arch_jump_label_transform_static(struct jump_entry *entry, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 3d6a99746454..6842e4501e2e 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -196,7 +196,7 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -204,7 +204,7 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_disarm_kprobe); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c3a52f9a69a0..192efdfac918 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -636,10 +636,10 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; memset(&stp_sync, 0, sizeof(stp_sync)); - get_online_cpus(); + cpus_read_lock(); atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine(stp_sync_clock, &stp_sync, cpu_online_mask); - put_online_cpus(); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); if (!check_sync_clock()) /* diff --git a/arch/score/include/uapi/asm/Kbuild b/arch/score/include/uapi/asm/Kbuild index b15bf6bc0e94..c94ee54210bc 100644 --- a/arch/score/include/uapi/asm/Kbuild +++ b/arch/score/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/score/include/uapi/asm/siginfo.h b/arch/score/include/uapi/asm/siginfo.h deleted file mode 100644 index 87ca35607a28..000000000000 --- a/arch/score/include/uapi/asm/siginfo.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCORE_SIGINFO_H -#define _ASM_SCORE_SIGINFO_H - -#include <asm-generic/siginfo.h> - -#endif /* _ASM_SCORE_SIGINFO_H */ diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig index 4e21949593cf..3554fcaa023b 100644 --- a/arch/sh/boards/Kconfig +++ b/arch/sh/boards/Kconfig @@ -10,7 +10,7 @@ config SH_DEVICE_TREE bool "Board Described by Device Tree" select OF select OF_EARLY_FLATTREE - select CLKSRC_OF + select TIMER_OF select COMMON_CLK select GENERIC_CALIBRATE_DELAY help diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c index 1fb6d5714bae..4feb7c86f4ac 100644 --- a/arch/sh/boards/of-generic.c +++ b/arch/sh/boards/of-generic.c @@ -119,7 +119,7 @@ static void __init sh_of_mem_reserve(void) static void __init sh_of_time_init(void) { pr_info("SH generic board support: scanning for clocksource devices\n"); - clocksource_probe(); + timer_probe(); } static void __init sh_of_setup(char **cmdline_p) diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index cf2a75063b53..590c91ae7541 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -29,7 +29,6 @@ generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h -generic-y += siginfo.h generic-y += sizes.h generic-y += socket.h generic-y += statfs.h diff --git a/arch/sh/include/uapi/asm/Kbuild 
b/arch/sh/include/uapi/asm/Kbuild index b15bf6bc0e94..b55fc2ae1e8c 100644 --- a/arch/sh/include/uapi/asm/Kbuild +++ b/arch/sh/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += siginfo.h diff --git a/arch/sparc/include/asm/siginfo.h b/arch/sparc/include/asm/siginfo.h deleted file mode 100644 index 48c34c19f810..000000000000 --- a/arch/sparc/include/asm/siginfo.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SPARC_SIGINFO_H -#define __SPARC_SIGINFO_H - -#include <uapi/asm/siginfo.h> - - -#ifdef CONFIG_COMPAT - -struct compat_siginfo; - -#endif /* CONFIG_COMPAT */ - -#endif /* !(__SPARC_SIGINFO_H) */ diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c index 07933b9e9ce0..93adde1ac166 100644 --- a/arch/sparc/kernel/jump_label.c +++ b/arch/sparc/kernel/jump_label.c @@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry, val = 0x01000000; } - get_online_cpus(); mutex_lock(&text_mutex); *insn = val; flushi(insn); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c index 07802d586988..93931a46625b 100644 --- a/arch/tile/kernel/jump_label.c +++ b/arch/tile/kernel/jump_label.c @@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e, void arch_jump_label_transform(struct jump_entry *e, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(e, type); flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits)); mutex_unlock(&text_mutex); - put_online_cpus(); } __init_or_module void arch_jump_label_transform_static(struct jump_entry *e, diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 1a70e6c0f259..94709ab41ed8 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -24,8 +24,7 @@ * has an opportunity to return -EFAULT to the user if needed. * The 64-bit routines just return a "long long" with the value, * since they are only used from kernel space and don't expect to fault. - * Support for 16-bit ops is included in the framework but we don't provide - * any (x86_64 has an atomic_inc_short(), so we might want to some day). + * Support for 16-bit ops is included in the framework but we don't provide any. * * Note that the caller is advised to issue a suitable L1 or L2 * prefetch on the address being manipulated to avoid extra stalls. 
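A pattern repeated throughout the hunks above (ARM hw_breakpoint and kprobes, s390 jump_label, kprobes and STP clock sync, powernv subcores) is the conversion of CPU-hotplug protected callers: get_online_cpus()/put_online_cpus() becomes cpus_read_lock()/cpus_read_unlock(), and stop_machine() calls made with that lock held become stop_machine_cpuslocked(), so that stop_machine() never tries to re-take a hotplug lock the caller already holds. The sketch below shows the shape of a converted caller; patch_one_site() and patch_text_example() are placeholder names invented for this illustration, not kernel symbols.

    /*
     * Sketch of a converted caller: take the CPU hotplug read lock once,
     * then use the _cpuslocked variant so stop_machine() does not attempt
     * to acquire the same lock again.
     */
    #include <linux/cpu.h>
    #include <linux/stop_machine.h>

    static int patch_one_site(void *arg)
    {
            /* Invoked by the stopper threads while all CPUs are quiesced. */
            return 0;
    }

    static void patch_text_example(void *site)
    {
            cpus_read_lock();                 /* was: get_online_cpus() */
            stop_machine_cpuslocked(patch_one_site, site, cpu_online_mask);
            cpus_read_unlock();               /* was: put_online_cpus() */
    }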
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index ea47c4bf3aa7..239eced619e6 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -44,7 +44,6 @@ generic-y += serial.h generic-y += setup.h generic-y += shmbuf.h generic-y += shmparam.h -generic-y += siginfo.h generic-y += sizes.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild index 13a97aa2285f..1c44d3b3eba0 100644 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ b/arch/unicore32/include/uapi/asm/Kbuild @@ -2,3 +2,4 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h +generic-y += siginfo.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0efb4c9497bc..7c991d0e9983 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,7 @@ config X86 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select BUILDTIME_EXTABLE_SORT @@ -1082,7 +1082,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY + depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. @@ -2793,6 +2793,9 @@ config X86_DMA_REMAP bool depends on STA2X11 +config HAVE_GENERIC_GUP + def_bool y + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index e645cf0d632a..1e902f926be3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -274,8 +274,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ -drivers-$(CONFIG_RAS) += arch/x86/ras/ - #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 73ccf63b0f48..9dc1ce6ba3c0 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" -static unsigned long get_cmd_line_ptr(void) +unsigned long get_cmd_line_ptr(void) { unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cbf4b87f55b9..c3e869eaef0c 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c, memset((char *)gdt->address, 0x0, gdt->size); desc = (struct desc_struct *)gdt->address; - /* The first GDT is a dummy and the second is unused. */ - desc += 2; + /* The first GDT is a dummy. 
*/ + desc++; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* __KERNEL32_CS */ + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; + } else { + /* Second entry is unused on 32-bit */ + desc++; + } + /* __KERNEL_CS */ desc->limit0 = 0xffff; desc->base0 = 0x0000; desc->base1 = 0x0000; @@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c, desc->p = 1; desc->limit = 0xf; desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; + if (IS_ENABLED(CONFIG_X86_64)) { + desc->l = 1; + desc->d = 0; + } else { + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + } desc->g = SEG_GRANULARITY_4KB; desc->base2 = 0x00; - desc++; + + /* __KERNEL_DS */ desc->limit0 = 0xffff; desc->base0 = 0x0000; desc->base1 = 0x0000; @@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c, desc->d = SEG_OP_SIZE_32BIT; desc->g = SEG_GRANULARITY_4KB; desc->base2 = 0x00; - -#ifdef CONFIG_X86_64 - /* Task segment value */ desc++; - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; -#endif /* CONFIG_X86_64 */ + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Task segment value */ + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; + } asm volatile("cli"); asm volatile ("lgdt %0" : : "m" (*gdt)); diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d2ae1f821e0c..fbf4c32d0b62 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -346,6 +346,48 @@ preferred_addr: /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp +#ifdef CONFIG_X86_5LEVEL + /* Check if 5-level paging has already enabled */ + movq %cr4, %rax + testl $X86_CR4_LA57, %eax + jnz lvl5 + + /* + * At this point we are in long mode with 4-level paging enabled, + * but we want to enable 5-level paging. + * + * The problem is that we cannot do it directly. Setting LA57 in + * long mode would trigger #GP. So we need to switch off long mode + * first. + * + * NOTE: This is not going to work if bootloader put us above 4G + * limit. + * + * The first step is go into compatibility mode. + */ + + /* Clear additional page table */ + leaq lvl5_pgtable(%rbx), %rdi + xorq %rax, %rax + movq $(PAGE_SIZE/8), %rcx + rep stosq + + /* + * Setup current CR3 as the first and only entry in a new top level + * page table. 
+ */ + movq %cr3, %rdi + leaq 0x7 (%rdi), %rax + movq %rax, lvl5_pgtable(%rbx) + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq compatible_mode(%rip), %rax + pushq %rax + lretq +lvl5: +#endif + /* Zero EFLAGS */ pushq $0 popfq @@ -429,6 +471,44 @@ relocated: jmp *%rax .code32 +#ifdef CONFIG_X86_5LEVEL +compatible_mode: + /* Setup data and stack segments */ + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %ss + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Point CR3 to 5-level paging */ + leal lvl5_pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable PAE and LA57 mode */ + movl %cr4, %eax + orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + movl %eax, %cr4 + + /* Calculate address we are running at */ + call 1f +1: popl %edi + subl $1b, %edi + + /* Prepare stack for far return to Long Mode */ + pushl $__KERNEL_CS + leal lvl5(%edi), %eax + push %eax + + /* Enable paging back */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + lret +#endif + no_longmode: /* This isn't an x86-64 CPU so hang */ 1: @@ -442,7 +522,7 @@ gdt: .word gdt_end - gdt .long gdt .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0x00af9a000000ffff /* __KERNEL_CS */ .quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x0080890000000000 /* TS descriptor */ @@ -486,3 +566,7 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 +#ifdef CONFIG_X86_5LEVEL +lvl5_pgtable: + .fill PAGE_SIZE, 1, 0 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 54c24f0a43d3..fe318b44f7b8 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -9,16 +9,42 @@ * contain the entire properly aligned running kernel image. * */ + +/* + * isspace() in linux/ctype.h is expected by next_args() to filter + * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, + * since isdigit() is implemented in both of them. Hence disable it + * here. + */ +#define BOOT_CTYPE_H + +/* + * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. + * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL + * which is meaningless and will cause compiling error in some cases. + * So do not include linux/export.h and define EXPORT_SYMBOL(sym) + * as empty. + */ +#define _LINUX_EXPORT_H +#define EXPORT_SYMBOL(sym) + #include "misc.h" #include "error.h" -#include "../boot.h" +#include "../string.h" #include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <generated/utsrelease.h> +/* Macros used by the included decompressor code below. */ +#define STATIC +#include <linux/decompress/mm.h> + +extern unsigned long get_cmd_line_ptr(void); + /* Simplified build-specific string for starting entropy. 
*/ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -62,6 +88,11 @@ struct mem_vector { static bool memmap_too_large; + +/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ +unsigned long long mem_limit = ULLONG_MAX; + + enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, MEM_AVOID_INITRD, @@ -85,49 +116,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } -/** - * _memparse - Parse a string with mem suffixes into a number - * @ptr: Where parse begins - * @retptr: (output) Optional pointer to next char after parse completes - * - * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with K, M, G, T, P, E. - */ -static unsigned long long _memparse(const char *ptr, char **retptr) +char *skip_spaces(const char *str) { - char *endptr; /* Local pointer to end of parsed string */ - - unsigned long long ret = simple_strtoull(ptr, &endptr, 0); - - switch (*endptr) { - case 'E': - case 'e': - ret <<= 10; - case 'P': - case 'p': - ret <<= 10; - case 'T': - case 't': - ret <<= 10; - case 'G': - case 'g': - ret <<= 10; - case 'M': - case 'm': - ret <<= 10; - case 'K': - case 'k': - ret <<= 10; - endptr++; - default: - break; - } - - if (retptr) - *retptr = endptr; - - return ret; + while (isspace(*str)) + ++str; + return (char *)str; } +#include "../../../../lib/ctype.c" +#include "../../../../lib/cmdline.c" static int parse_memmap(char *p, unsigned long long *start, unsigned long long *size) @@ -142,40 +138,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; oldp = p; - *size = _memparse(p, &p); + *size = memparse(p, &p); if (p == oldp) return -EINVAL; switch (*p) { - case '@': - /* Skip this region, usable */ - *start = 0; - *size = 0; - return 0; case '#': case '$': case '!': - *start = _memparse(p + 1, &p); + *start = memparse(p + 1, &p); + return 0; + case '@': + /* memmap=nn@ss specifies usable region, should be skipped */ + *size = 0; + /* Fall through */ + default: + /* + * If w/o offset, only size specified, memmap=nn[KMG] has the + * same behaviour as mem=nn[KMG]. It limits the max address + * system can use. Region above the limit should be avoided. 
+ */ + *start = 0; return 0; } return -EINVAL; } -static void mem_avoid_memmap(void) +static void mem_avoid_memmap(char *str) { - char arg[128]; + static int i; int rc; - int i; - char *str; - /* See if we have any memmap areas */ - rc = cmdline_find_option("memmap", arg, sizeof(arg)); - if (rc <= 0) + if (i >= MAX_MEMMAP_REGIONS) return; - i = 0; - str = arg; while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; unsigned long long start, size; @@ -188,9 +185,14 @@ static void mem_avoid_memmap(void) if (rc < 0) break; str = k; - /* A usable region that should not be skipped */ - if (size == 0) + + if (start == 0) { + /* Store the specified memory limit if size > 0 */ + if (size > 0) + mem_limit = size; + continue; + } mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; @@ -202,6 +204,57 @@ static void mem_avoid_memmap(void) memmap_too_large = true; } +static int handle_mem_memmap(void) +{ + char *args = (char *)get_cmd_line_ptr(); + size_t len = strlen((char *)args); + char *tmp_cmdline; + char *param, *val; + u64 mem_size; + + if (!strstr(args, "memmap=") && !strstr(args, "mem=")) + return 0; + + tmp_cmdline = malloc(len + 1); + if (!tmp_cmdline ) + error("Failed to allocate space for tmp_cmdline"); + + memcpy(tmp_cmdline, args, len); + tmp_cmdline[len] = 0; + args = tmp_cmdline; + + /* Chew leading spaces */ + args = skip_spaces(args); + + while (*args) { + args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) { + warn("Only '--' specified in cmdline"); + free(tmp_cmdline); + return -1; + } + + if (!strcmp(param, "memmap")) { + mem_avoid_memmap(val); + } else if (!strcmp(param, "mem")) { + char *p = val; + + if (!strcmp(p, "nopentium")) + continue; + mem_size = memparse(p, &p); + if (mem_size == 0) { + free(tmp_cmdline); + return -EINVAL; + } + mem_limit = mem_size; + } + } + + free(tmp_cmdline); + return 0; +} + /* * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). * The mem_avoid array is used to store the ranges that need to be avoided @@ -323,7 +376,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - mem_avoid_memmap(); + handle_mem_memmap(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -432,7 +485,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, { struct mem_vector region, overlap; struct slot_area slot_area; - unsigned long start_orig; + unsigned long start_orig, end; + struct boot_e820_entry cur_entry; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) @@ -446,8 +500,15 @@ static void process_e820_entry(struct boot_e820_entry *entry, if (entry->addr + entry->size < minimum) return; - region.start = entry->addr; - region.size = entry->size; + /* Ignore entries above memory limit */ + end = min(entry->size + entry->addr, mem_limit); + if (entry->addr >= end) + return; + cur_entry.addr = entry->addr; + cur_entry.size = end - entry->addr; + + region.start = cur_entry.addr; + region.size = cur_entry.size; /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { @@ -461,7 +522,7 @@ static void process_e820_entry(struct boot_e820_entry *entry, region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above this e820 region? 
*/ - if (region.start > entry->addr + entry->size) + if (region.start > cur_entry.addr + cur_entry.size) return; /* Reduce size by any delta from the original address. */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 1d78f1739087..8e69df96492e 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -92,7 +92,7 @@ void initialize_identity_maps(void) * and we must append to the existing area instead of entirely * overwriting it. */ - level4p = read_cr3(); + level4p = read_cr3_pa(); if (level4p == (unsigned long)_pgtable) { debug_putstr("booted via startup_32()\n"); pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 1eb7d298b47d..15d9f74b0008 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -65,23 +65,3 @@ GLOBAL(copy_to_fs) popw %es retl ENDPROC(copy_to_fs) - -#if 0 /* Not currently used, but can be enabled as needed */ -GLOBAL(copy_from_gs) - pushw %ds - pushw %gs - popw %ds - calll memcpy - popw %ds - retl -ENDPROC(copy_from_gs) - -GLOBAL(copy_to_gs) - pushw %es - pushw %gs - popw %es - calll memcpy - popw %es - retl -ENDPROC(copy_to_gs) -#endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 5457b02fc050..630e3664906b 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } +long simple_strtol(const char *cp, char **endp, unsigned int base) +{ + if (*cp == '-') + return -simple_strtoull(cp + 1, endp, base); + + return simple_strtoull(cp, endp, base); +} + /** * strlen - Find the length of a string * @s: The string to be sized diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4a4c0834f965..a9a8027a6c0e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -265,7 +265,8 @@ return_from_SYSCALL_64: * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. * - * Change top 16 bits to be the sign-extension of 47th bit + * Change top bits to match most significant bit (47th or 56th bit + * depending on paging mode) in the address. */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 580b60f5ac83..ff1ea2fb9705 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) return ret; } +static struct attribute_group x86_pmu_attr_group; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void) x86_pmu_events_group.attrs = tmp; } + if (x86_pmu.attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs); + if (!WARN_ON(!tmp)) + x86_pmu_attr_group.attrs = tmp; + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... 
generic registers: %d\n", x86_pmu.num_counters); @@ -2101,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->active_mm) - load_mm_cr4(current->active_mm); + load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } static void x86_pmu_event_mapped(struct perf_event *event) @@ -2224,7 +2233,6 @@ void perf_check_microcode(void) if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } -EXPORT_SYMBOL_GPL(perf_check_microcode); static struct pmu pmu = { .pmu_enable = x86_pmu_enable, @@ -2255,7 +2263,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - struct cyc2ns_data *data; + struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; @@ -2267,17 +2275,17 @@ void arch_perf_update_userpage(struct perf_event *event, if (!using_native_sched_clock() || !sched_clock_stable()) return; - data = cyc2ns_read_begin(); + cyc2ns_read_begin(&data); - offset = data->cyc2ns_offset + __sched_clock_offset; + offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; + userpg->time_mult = data.cyc2ns_mul; + userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* @@ -2289,7 +2297,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_zero = offset; } - cyc2ns_read_end(data); + cyc2ns_read_end(); } void @@ -2334,7 +2342,7 @@ static unsigned long get_segment_base(unsigned int segment) /* IRQs are off, so this synchronizes with smp_store_release */ ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->size) + if (!ldt || idx > ldt->nr_entries) return 0; desc = &ldt->entries[idx]; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 110ce8238466..aa62437d1aa1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3160,6 +3160,19 @@ err: return -ENOMEM; } +static void flip_smm_bit(void *data) +{ + unsigned long set = *(unsigned long *)data; + + if (set > 0) { + msr_set_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } else { + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } +} + static void intel_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (!cpuc->shared_regs) return; @@ -3410,12 +3425,10 @@ static void intel_snb_check_microcode(void) int pebs_broken = 0; int cpu; - get_online_cpus(); for_each_online_cpu(cpu) { if ((pebs_broken = intel_snb_pebs_broken(cpu))) break; } - put_online_cpus(); if (pebs_broken == x86_pmu.pebs_broken) return; @@ -3488,7 +3501,9 @@ static bool check_msr(unsigned long msr, u64 mask) static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; + cpus_read_lock(); intel_snb_check_microcode(); + cpus_read_unlock(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = { NULL }; +static ssize_t freeze_on_smi_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi); +} + +static 
DEFINE_MUTEX(freeze_on_smi_mutex); + +static ssize_t freeze_on_smi_store(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val > 1) + return -EINVAL; + + mutex_lock(&freeze_on_smi_mutex); + + if (x86_pmu.attr_freeze_on_smi == val) + goto done; + + x86_pmu.attr_freeze_on_smi = val; + + get_online_cpus(); + on_each_cpu(flip_smm_bit, &val, 1); + put_online_cpus(); +done: + mutex_unlock(&freeze_on_smi_mutex); + + return count; +} + +static DEVICE_ATTR_RW(freeze_on_smi); + +static struct attribute *intel_pmu_attrs[] = { + &dev_attr_freeze_on_smi.attr, + NULL, +}; + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void) x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + x86_pmu.attrs = intel_pmu_attrs; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: @@ -4112,13 +4175,12 @@ static __init int fixup_ht_bug(void) lockup_detector_resume(); - get_online_cpus(); + cpus_read_lock(); - for_each_online_cpu(c) { + for_each_online_cpu(c) free_excl_cntrs(c); - } - put_online_cpus(); + cpus_read_unlock(); pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); return 0; } diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8c00dc09a5d2..2521f771f2f5 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -1682,7 +1682,7 @@ static int __init intel_cqm_init(void) * * Also, check that the scales match on all cpus. */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1746,14 +1746,14 @@ static int __init intel_cqm_init(void) * Setup the hot cpu notifier once we are sure cqm * is enabled to avoid notifier leak. 
*/ - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); - + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, + "perf/x86/cqm:starting", + intel_cqm_cpu_starting, NULL); + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, + "perf/x86/cqm:online", + NULL, intel_cqm_cpu_exit); out: - put_online_cpus(); + cpus_read_unlock(); if (ret) { kfree(str); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be3d36254040..53728eea1bed 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -562,6 +562,9 @@ struct x86_pmu { ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; + unsigned long attr_freeze_on_smi; + struct attribute **attrs; + /* * CPU Hotplug hooks */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00c88a01301d..da181ad1d5f8 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,6 +3,7 @@ #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/refcount.h> struct amd_nb_bus_dev_range { u8 bus; @@ -55,7 +56,7 @@ struct threshold_bank { struct threshold_block *blocks; /* initialized to the number of CPUs on the node sharing this bank */ - atomic_t cpus; + refcount_t cpus; }; struct amd_northbridge { diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index caa5798c92f4..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); + + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,24 +260,13 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, 
&c, c + a)); - return c; -} -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static __always_inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; + return c; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) 
return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2f77bcefe6b4..d2ff779f347e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -74,7 +74,7 @@ struct efi_scratch { __kernel_fpu_begin(); \ \ if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = read_cr3(); \ + efi_scratch.prev_cr3 = __read_cr3(); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 59405a248fc2..9b76cd331990 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,8 +22,8 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - unsigned int irq_tlb_count; #endif + unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3f9a3d2a5209..181264989db5 100644 --- a/arch/x86/include/asm/mce.h +++ 
b/arch/x86/include/asm/mce.h @@ -285,10 +285,6 @@ int mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); -extern void register_mce_write_callback(ssize_t (*)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)); - /* Disable CMCI/polling for MCA bank claimed by firmware */ extern void mce_disable_bank(int bank); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index f9813b6d8b80..79b647a7ebd0 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -37,12 +37,6 @@ typedef struct { #endif } mm_context_t; -#ifdef CONFIG_SMP void leave_mm(int cpu); -#else -static inline void leave_mm(int cpu) -{ -} -#endif #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 68b329d77b3a..ecfcb6643c9b 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -47,7 +47,7 @@ struct ldt_struct { * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; - unsigned int size; + unsigned int nr_entries; }; /* @@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm) */ if (unlikely(ldt)) - set_ldt(ldt->entries, ldt->size); + set_ldt(ldt->entries, ldt->nr_entries); else clear_LDT(); #else clear_LDT(); #endif +} + +static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) +{ +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT if either the old or new mm had an LDT. + * + * An mm will never go from having an LDT to not having an LDT. Two + * mms never share an LDT, so we don't gain anything by checking to + * see whether the LDT changed. There's also no guarantee that + * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, + * then prev->context.ldt will also be non-NULL. + * + * If we really cared, we could optimize the case where prev == next + * and we're exiting lazy mode. Most of the time, if this happens, + * we don't actually need to reload LDTR, but modify_ldt() is mostly + * used by legacy code and emulators where we don't need this level of + * performance. + * + * This uses | instead of || because it generates better code. + */ + if (unlikely((unsigned long)prev->context.ldt | + (unsigned long)next->context.ldt)) + load_mm_ldt(next); +#endif DEBUG_LOCKS_WARN_ON(preemptible()); } static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#ifdef CONFIG_SMP if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -#endif } static inline int init_new_context(struct task_struct *tsk, @@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) } #endif -static inline bool __pkru_allows_pkey(u16 pkey, bool write) -{ - u32 pkru = read_pkru(); - - if (!__pkru_allows_read(pkru, pkey)) - return false; - if (write && !__pkru_allows_write(pkru, pkey)) - return false; - - return true; -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } + +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. 
+ */ +static inline unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || !in_atomic()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8638e2419cc..d406894cd9a2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -137,6 +137,8 @@ #define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 +#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) #define MSR_PEBS_FRONTEND 0x000003f7 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 55fa56fe4e45..a63e77f8eb41 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr2, x); } -static inline unsigned long read_cr3(void) +static inline unsigned long __read_cr3(void) { return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); } @@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr) } static inline void flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) + const struct flush_tlb_info *info) { - PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); + PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7465d6fe336f..cb976bab6299 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -51,6 +51,7 @@ struct mm_struct; struct desc_struct; struct task_struct; struct cpumask; +struct flush_tlb_info; /* * Wrapper type for pointers to code which uses the non-standard @@ -223,9 +224,7 @@ struct pv_mmu_ops { void (*flush_tlb_kernel)(void); void (*flush_tlb_single)(unsigned long addr); void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + const struct flush_tlb_info *info); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 50d35e3185f5..c8821bab938f 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) +#define gup_get_pte gup_get_pte +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. 
What + * we do have is the guarantee that a PTE will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' iff pte_high + * sees 'h'. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have gotten rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. Because get_user_pages_fast() only operates on present ptes + * we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f5af95a0c6b8..77037b6f1caa 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud) return 0; } #endif + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry; static inline void __meminit init_trampoline_default(void) { /* Default trampoline pgd value */ - trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; + trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; } # ifdef CONFIG_RANDOMIZE_MEMORY void __meminit init_trampoline(void); @@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) #endif } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * 'pteval' can come from a PTE, PMD or PUD. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. 
+ */ +static inline bool __pte_access_permitted(unsigned long pteval, bool write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + return __pkru_allows_pkey(pte_flags_pkey(pteval), write); +} + +#define pte_access_permitted pte_access_permitted +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + return __pte_access_permitted(pte_val(pte), write); +} + +#define pmd_access_permitted pmd_access_permitted +static inline bool pmd_access_permitted(pmd_t pmd, bool write) +{ + return __pte_access_permitted(pmd_val(pmd), write); +} + +#define pud_access_permitted pud_access_permitted +static inline bool pud_access_permitted(pud_t pud, bool write) +{ + return __pte_access_permitted(pud_val(pud), write); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 9991224f6238..2160c1fee920 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,15 +14,17 @@ #include <linux/bitops.h> #include <linux/threads.h> +extern p4d_t level4_kernel_pgt[512]; +extern p4d_t level4_ident_pgt[512]; extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; extern pte_t level1_fixmap_pgt[512]; -extern pgd_t init_level4_pgt[]; +extern pgd_t init_top_pgt[]; -#define swapper_pg_dir init_level4_pgt +#define swapper_pg_dir init_top_pgt extern void paging_init(void); @@ -227,6 +229,20 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); -#endif /* !__ASSEMBLY__ */ +#define gup_fast_permitted gup_fast_permitted +static inline bool gup_fast_permitted(unsigned long start, int nr_pages, + int write) +{ + unsigned long len, end; + + len = (unsigned long)nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + if (end >> __VIRTUAL_MASK_SHIFT) + return false; + return true; +} +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 39fb618e2211..79aa2f98398d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -8,4 +8,40 @@ #else #define X86_VM_MASK 0 /* No VM86 support */ #endif + +/* + * CR3's layout varies depending on several things. + * + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID. + * If PAE is enabled, then CR3[11:5] is part of the PDPT address + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored. + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and + * CR3[2:0] and CR3[11:5] are ignored. + * + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD. + * + * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be + * written as 1 to prevent the write to CR3 from flushing the TLB. + * + * On systems with SME, one bit (in a variable position!) is stolen to indicate + * that the top-level paging structure is encrypted. + * + * All of the remaining bits indicate the physical address of the top-level + * paging structure. + * + * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 
+ */ +#ifdef CONFIG_X86_64 +/* Mask off the address space ID bits. */ +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull +#define CR3_PCID_MASK 0xFFFull +#else +/* + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save + * a tiny bit of code size by setting all the bits. + */ +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..f3b1b27f1c0a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -231,6 +231,14 @@ native_cpuid_reg(ebx) native_cpuid_reg(ecx) native_cpuid_reg(edx) +/* + * Friendlier CR3 helpers. + */ +static inline unsigned long read_cr3_pa(void) +{ + return __read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); @@ -901,8 +909,13 @@ static inline int mpx_disable_management(void) } #endif /* CONFIG_X86_INTEL_MPX */ +#ifdef CONFIG_CPU_SUP_AMD extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); +#else +static inline u16 amd_get_nb_id(int cpu) { return 0; } +static inline u32 amd_get_nodes_per_socket(void) { return 0; } +#endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ac1d5da14734..e4585a393965 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -44,7 +44,6 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); -extern void setup_default_timer_irq(void); #ifdef CONFIG_X86_INTEL_MID extern void x86_intel_mid_early_setup(void); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 12af3e35edfa..9efaabf5b54b 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); } -static inline unsigned long native_read_cr3(void) +static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); @@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x) native_write_cr2(x); } -static inline unsigned long read_cr3(void) +/* + * Careful! CR3 contains more than just an address. You probably want + * read_cr3_pa() instead. 
+ */ +static inline unsigned long __read_cr3(void) { - return native_read_cr3(); + return __native_read_cr3(); } static inline void write_cr3(unsigned long x) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 27e9f9d769b8..2016962103df 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -29,11 +29,9 @@ struct cyc2ns_data { u32 cyc2ns_mul; u32 cyc2ns_shift; u64 cyc2ns_offset; - u32 __count; - /* u32 hole */ -}; /* 24 bytes -- do not grow */ +}; /* 16 bytes */ -extern struct cyc2ns_data *cyc2ns_read_begin(void); -extern void cyc2ns_read_end(struct cyc2ns_data *); +extern void cyc2ns_read_begin(struct cyc2ns_data *); +extern void cyc2ns_read_end(void); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h new file mode 100644 index 000000000000..f4a6ff352a0e --- /dev/null +++ b/arch/x86/include/asm/tlbbatch.h @@ -0,0 +1,14 @@ +#ifndef _ARCH_X86_TLBBATCH_H +#define _ARCH_X86_TLBBATCH_H + +#include <linux/cpumask.h> + +struct arch_tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed.. + */ + struct cpumask cpumask; +}; + +#endif /* _ARCH_X86_TLBBATCH_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6ed9ea469b48..50ea3482e1d1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,7 @@ #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> +#include <asm/smp.h> static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) @@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void) #endif struct tlb_state { -#ifdef CONFIG_SMP - struct mm_struct *active_mm; + /* + * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts + * are on. This means that it may not match current->active_mm, + * which will contain the previous user mm when we're in lazy TLB + * mode even if we've already switched back to swapper_pg_dir. + */ + struct mm_struct *loaded_mm; int state; -#endif /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - native_write_cr3(native_read_cr3()); + native_write_cr3(__native_read_cr3()); preempt_enable(); } @@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr) * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus + * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. */ - -#ifndef CONFIG_SMP - -/* "_up" is for UniProcessor. - * - * This is a helper for other header functions. *Not* intended to be called - * directly. All global TLB flushes need to either call this, or to bump the - * vm statistics themselves. 
- */ -static inline void __flush_tlb_up(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); -} - -static inline void flush_tlb_all(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb_all(); -} - -static inline void local_flush_tlb(void) -{ - __flush_tlb_up(); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_mm_range(struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned long vmflag) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ -} - -static inline void reset_lazy_tlbstate(void) -{ -} - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#else /* SMP */ - -#include <asm/smp.h> +struct flush_tlb_info { + struct mm_struct *mm; + unsigned long start; + unsigned long end; +}; #define local_flush_tlb() __flush_tlb() @@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) +{ + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); +} + void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, unsigned long end); + const struct flush_tlb_info *info); #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -static inline void reset_lazy_tlbstate(void) +static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) { - this_cpu_write(cpu_tlbstate.state, 0); - this_cpu_write(cpu_tlbstate.active_mm, &init_mm); + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } -#endif /* SMP */ +extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, mm, start, end) \ - native_flush_tlb_others(mask, mm, start, end) +#define flush_tlb_others(mask, info) \ + native_flush_tlb_others(mask, info) #endif #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 6686820feae9..b5a32231abd8 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_UV_UV_H #define _ASM_X86_UV_UV_H +#include <asm/tlbflush.h> + enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; struct cpumask; @@ -15,10 +17,7 @@ extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - unsigned int cpu); + const struct 
flush_tlb_info *info); #else /* X86_UV */ @@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * -uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned int cpu) +uv_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { return cpumask; } #endif /* X86_UV */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 567de50a4c2a..185f3d10c194 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -104,6 +104,8 @@ #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ +#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ #define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) #define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2d75faf743f2..3b215ff36016 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -54,6 +54,8 @@ #include <asm/mce.h> #include <asm/tsc.h> #include <asm/hypervisor.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> unsigned int num_processors; @@ -545,6 +547,81 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } + +#define DEADLINE_MODEL_MATCH_REV(model, rev) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } + +static u32 hsx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x3a; /* EP */ + case 0x04: return 0x0f; /* EX */ + } + + return ~0U; +} + +static u32 bdx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x00000011; + case 0x03: return 0x0700000e; + case 0x04: return 0x0f00000c; + case 0x05: return 0x0e000003; + } + + return ~0U; +} + +static const struct x86_cpu_id deadline_match[] = { + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + + {}, +}; + +static void apic_check_deadline_errata(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + u32 rev; + + if (!m) + return; + + /* + * Function pointers will have the MSB set due to address layout, + * immediate revisions will not. 
+ */ + if ((long)m->driver_data < 0) + rev = ((u32 (*)(void))(m->driver_data))(); + else + rev = (u32)m->driver_data; + + if (boot_cpu_data.microcode >= rev) + return; + + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " + "please update microcode to version: 0x%x (or later)\n", rev); +} + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -563,6 +640,7 @@ static void setup_APIC_timer(void) levt->cpumask = cpumask_of(smp_processor_id()); if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->name = "lapic-deadline"; levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; @@ -1779,6 +1857,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; + apic_check_deadline_errata(); + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index ae50d3454d78..81ff48905623 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -150,7 +150,7 @@ static const struct irq_domain_ops htirq_domain_ops = { .deactivate = htirq_domain_deactivate, }; -void arch_init_htirq_domain(struct irq_domain *parent) +void __init arch_init_htirq_domain(struct irq_domain *parent) { if (disable_apic) return; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 347bb9f65737..247880fc29f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); static struct irq_chip ioapic_chip, ioapic_ir_chip; -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for_each_ioapic_pin(apic, pin) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) - return irq_trigger(idx); - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c61aec7e65f4..4c5d1882bec5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -136,7 +136,7 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void arch_init_msi_domain(struct irq_domain *parent) +void __init arch_init_msi_domain(struct irq_domain *parent) { if (disable_apic) return; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f3557a1eb562..e66d8e48e456 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -405,7 +405,7 @@ int __init arch_probe_nr_irqs(void) } #ifdef CONFIG_X86_IO_APIC -static void init_legacy_irqs(void) +static void __init init_legacy_irqs(void) { int i, node = cpu_to_node(0); struct apic_chip_data *data; @@ -424,7 +424,7 @@ static void init_legacy_irqs(void) } } #else -static void init_legacy_irqs(void) { } +static inline void init_legacy_irqs(void) { } #endif int __init arch_early_irq_init(void) diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 9c632cb88546..10cec43aac38 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -17,6 +17,8 @@ #include "mce-internal.h" +static 
BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + static DEFINE_MUTEX(mce_chrdev_read_mutex); static char mce_helper[128]; @@ -345,24 +347,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) +void mce_unregister_injector_chain(struct notifier_block *nb) { - mce_write = fn; + blocking_notifier_chain_unregister(&mce_injector_chain, nb); } -EXPORT_SYMBOL_GPL(register_mce_write_callback); +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; } static const struct file_operations mce_chrdev_ops = { @@ -388,9 +415,15 @@ static __init int dev_mcelog_init_device(void) /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; } + mce_register_decode_chain(&dev_mcelog_nb); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 99165b206df3..231ad23b24a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -10,23 +10,105 @@ * Authors: * Andi Kleen * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de> + * Advanced Micro Devices Inc. 
*/ -#include <linux/uaccess.h> -#include <linux/module.h> -#include <linux/timer.h> + +#include <linux/cpu.h> +#include <linux/debugfs.h> #include <linux/kernel.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/preempt.h> -#include <linux/smp.h> +#include <linux/module.h> #include <linux/notifier.h> -#include <linux/kdebug.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/gfp.h> -#include <asm/mce.h> +#include <linux/pci.h> +#include <linux/uaccess.h> + +#include <asm/amd_nb.h> #include <asm/apic.h> +#include <asm/irq_vectors.h> +#include <asm/mce.h> #include <asm/nmi.h> +#include <asm/smp.h> + +#include "mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; +} /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -143,7 +225,7 @@ static int raise_local(void) return ret; } -static void raise_mce(struct mce *m) +static void __maybe_unused raise_mce(struct mce *m) { int context = MCJ_CTX(m->inject_flags); @@ -198,55 +280,454 @@ static void raise_mce(struct mce *m) } } -/* Error injection interface */ -static ssize_t mce_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) { - struct mce m; + struct mce *m = (struct mce *)data; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* - * There are some cases where real MSR reads could slip - * through. - */ - if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) - return -EIO; + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu.
+ */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); - if ((unsigned long)usize > sizeof(struct mce)) - usize = sizeof(struct mce); - if (copy_from_user(&m, ubuf, usize)) + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; - if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + 
wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + rdtscll(i_mce.tsc); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; /* - * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC */ - schedule_timeout(2); + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } - mutex_lock(&mce_inject_mutex); - raise_mce(&m); - mutex_unlock(&mce_inject_mutex); - return usize; + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. 
Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init debugfs_init(void) +{ + unsigned int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (i-- > 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENODEV; } -static int inject_init(void) +static int __init inject_init(void) { + int err; + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + + err = debugfs_init(); + if (err) { + free_cpumask_var(mce_inject_cpumask); + return err; + } + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + pr_info("Machine check injector initialized\n"); - 
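
The debugfs_init() routine above is table-driven: it walks dfs_fls[], creates one file per entry, and on any failure tears down only what it already created, in reverse order. The standalone C sketch below shows that same create-or-unwind shape; it is not kernel code, and create_res()/destroy_res() are invented stand-ins for debugfs_create_file()/debugfs_remove().

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct node {
            const char *name;
            void *handle;                   /* filled in on successful creation */
    };

    static void *create_res(const char *name)
    {
            /* Pretend creating "bank" fails, like a debugfs_create_file() error. */
            if (!strcmp(name, "bank"))
                    return NULL;
            return malloc(1);
    }

    static void destroy_res(void *h) { free(h); }

    int main(void)
    {
            struct node tbl[] = {
                    { "status" }, { "misc" }, { "addr" }, { "bank" }, { "flags" },
            };
            size_t i, n = sizeof(tbl) / sizeof(tbl[0]);

            for (i = 0; i < n; i++) {
                    tbl[i].handle = create_res(tbl[i].name);
                    if (!tbl[i].handle)
                            goto unwind;
            }
            puts("all resources created");
            return 0;

    unwind:
            fprintf(stderr, "failed at '%s', rolling back\n", tbl[i].name);
            /* Tear down only what was created, newest first. */
            while (i-- > 0)
                    destroy_res(tbl[i].handle);
            return 1;
    }
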
register_mce_write_callback(mce_write); - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, - "mce_notify"); + return 0; } +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + module_init(inject_init); -/* - * Cannot tolerate unloading currently because we cannot - * guarantee all openers of mce_chrdev will get a reference to us. - */ +module_exit(inject_exit); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 654ad0668d72..098530a93bb7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -100,7 +100,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) extern struct device_attribute dev_attr_trigger; #ifdef CONFIG_X86_MCELOG_LEGACY -extern void mce_work_trigger(void); +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); #else static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5cfbaeb6529a..b58b77808ce4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1550,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 < 17 && cfg->bootlog < 0) { + if (c->x86 < 0x11 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1832,7 +1832,8 @@ void mce_disable_bank(int bank) * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold * mce=recovery force enable memcpy_mcsafe() @@ -1912,12 +1913,13 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide. * Disabling them for just a single offlined CPU is bad, since it will * inhibit reporting for all shared resources on the socket like the * last level cache (LLC), the integrated memory controller (iMC), etc. 
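
The bootlog quirk change in the mce.c hunk above (17 -> 0x11) selects exactly the same families, since 0x11 equals 17; writing it in hex presumably guards against misreading the old constant as family 0x17. A quick check, using the usual hex family identifiers as example inputs:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int families[] = { 0xf, 0x10, 0x11, 0x15, 0x16, 0x17 };

            for (unsigned int i = 0; i < sizeof(families) / sizeof(families[0]); i++) {
                    unsigned int f = families[i];

                    /* "<17" and "<0x11" always agree; "<0x17" would not. */
                    printf("family 0x%-2x: <17 %d  <0x11 %d  <0x17 %d\n",
                           f, f < 17, f < 0x11, f < 0x17);
            }
            return 0;
    }
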
*/ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 6e4a047e4b68..9e314bcf67cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void get_smca_bank_info(unsigned int bank) +static void smca_configure(unsigned int bank, unsigned int cpu) { - unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + unsigned int i, hwid_mcatype; struct smca_hwid *s_hwid; - u32 high, instance_id; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank) smca_get_name(s_hwid->bank_type)); smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = instance_id; + smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; break; } @@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high, smca_addr; + u32 smca_low, smca_high; struct threshold_block b; int new; @@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, goto set_offset; } - smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - smca_high |= BIT(0); - - /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. 
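
The smca_configure() code above manipulates MCA_CONFIG through rdmsr/wrmsr as two 32-bit halves, which is why architectural bit 32 (MCAX) shows up as BIT(0) of the high word and bits 38:37 (DeferredIntType) as bits 6:5 of it. A small userspace sketch of that split, using plain uint32_t halves rather than real MSR accessors:

    #include <stdio.h>
    #include <stdint.h>

    static void set_msr_bit(uint32_t *low, uint32_t *high, unsigned int bit)
    {
            if (bit < 32)
                    *low |= (uint32_t)1 << bit;
            else
                    *high |= (uint32_t)1 << (bit - 32);
    }

    int main(void)
    {
            uint32_t low = 0, high = 0;

            set_msr_bit(&low, &high, 32);   /* MCA_CONFIG[MCAX]            */
            set_msr_bit(&low, &high, 37);   /* MCA_CONFIG[DeferredIntType] */

            printf("low = 0x%08x high = 0x%08x (64-bit view 0x%016llx)\n",
                   low, high,
                   ((unsigned long long)high << 32) | low);
            return 0;
    }
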
- * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. - */ - if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) - smca_high |= BIT(5); - - wrmsr(smca_addr, smca_low, smca_high); - } - /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) goto out; @@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) - get_smca_bank_info(bank); + smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); @@ -755,37 +741,19 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); -static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; + m.misc = misc; m.bank = bank; m.tsc = rdtsc(); - if (threshold_err) - m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:<lsb>] where lsb is the least significant @@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -832,87 +798,126 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) exiting_ack_irq(); } -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - unsigned int bank; - u32 msr_status; - u64 status; + u64 status, addr = 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; - rdmsrl(msr_status, status); + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; + __log_error(bank, status, addr, misc); - __log_error(bank, true, false, 0); - break; - } + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; } /* - * APIC Interrupt Handler + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. 
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; -/* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. - */ + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); -static void amd_threshold_interrupt(void) + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) { - u32 low = 0, high = 0, address = 0; - unsigned int bank, block, cpu = smp_processor_id(); - struct thresh_restart tr; + unsigned int bank; - /* assume first bank caused it */ - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); - if (!address) - break; + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} - if (rdmsr_safe(address, &low, &high)) - break; +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; + if (!block) + return; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } - } - return; + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; -log: - __log_error(bank, false, true, ((u64)high << 32) | low); + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); /* Reset threshold block after logging error. */ memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + tr.b = block; threshold_restart_bank(&tr); } /* + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. + */ +static void amd_threshold_interrupt(void) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + unsigned int bank, cpu = smp_processor_id(); + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + + first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + if (!first_block) + continue; + + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. 
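
The reworked threshold handler in this hunk walks each bank's blocks by treating the first block as both a block to service and the head of the sibling list. A simplified model of that walk, using a plain singly linked list instead of the kernel's list_head machinery:

    #include <stdio.h>

    struct block {
            int id;
            struct block *next;             /* sibling blocks of the same bank */
    };

    static void log_and_reset(struct block *b)
    {
            printf("servicing block %d\n", b->id);
    }

    int main(void)
    {
            struct block b2    = { 2, NULL };
            struct block b1    = { 1, &b2 };
            struct block first = { 0, &b1 };        /* head block of the bank */

            log_and_reset(&first);                  /* the head itself first  */
            for (struct block *b = first.next; b; b = b->next)
                    log_and_reset(b);               /* ...then its siblings   */
            return 0;
    }
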
+ */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); + } +} + +/* * Sysfs Interface */ @@ -1202,7 +1207,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; per_cpu(threshold_banks, cpu)[bank] = b; - atomic_inc(&b->cpus); + refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1225,7 +1230,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; if (is_shared_bank(bank)) { - atomic_set(&b->cpus, 1); + refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ if (nb) { @@ -1289,7 +1294,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) goto free_out; if (is_shared_bank(bank)) { - if (!atomic_dec_and_test(&b->cpus)) { + if (!refcount_dec_and_test(&b->cpus)) { __threshold_remove_blocks(b); per_cpu(threshold_banks, cpu)[bank] = NULL; return; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index e53d3c909840..9cb98ee103db 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -290,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) return (struct cpio_data){ NULL, 0, "" }; if (initrd_start) start = initrd_start; + } else { + /* + * The picture with physical addresses is a bit different: we + * need to get the *physical* address to which the ramdisk was + * relocated, i.e., relocated_ramdisk (not initrd_start) and + * since we're running from physical addresses, we need to access + * relocated_ramdisk through its *physical* address too. + */ + u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); + if (*rr) + start = *rr; } return find_cpio_data(path, (void *)start, size, NULL); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f522415bf9e5..d525a0bd7d28 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) static void save_microcode_patch(void *data, unsigned int size) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - struct ucode_patch *iter, *tmp, *p; + struct ucode_patch *iter, *tmp, *p = NULL; bool prev_found = false; unsigned int sig, pf; @@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size) else list_add_tail(&p->plist, µcode_cache); } + + /* + * Save for early loading. On 32-bit, that needs to be a physical + * address as the APs are running from physical addresses, before + * paging has been enabled. + */ + if (p) { + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; + } } static int microcode_sanity_check(void *mc, int print_err) @@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; + /* + * initrd is going away, clear patch ptr. We will scan the microcode one + * last time before jettisoning and save a patch, if found. Then we will + * update that pointer too, with a stable patch address to use when + * resuming the cores. + */ + intel_ucode_patch = NULL; + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); @@ -619,9 +639,6 @@ int __init save_microcode_in_initrd_intel(void) show_saved_mc(); - /* initrd is going away, clear patch ptr. 
*/ - intel_ucode_patch = NULL; - return 0; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 2bce84d91c2b..c5bb63be4ba1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -807,10 +807,8 @@ void mtrr_save_state(void) if (!mtrr_enabled()) return; - get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); - put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 8e598a1ad986..6b91e2eb8d3f 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -125,7 +125,7 @@ void __init init_espfix_bsp(void) p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); p4d_populate(&init_mm, p4d, espfix_pud_page); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 43b7002f44fb..46c3c73e7f43 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -33,17 +33,120 @@ /* * Manage page tables very early on. */ -extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt = 2; +static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#define __head __section(.head.text) + +static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +{ + return ptr - (void *)_text + (void *)physaddr; +} + +void __head __startup_64(unsigned long physaddr) +{ + unsigned long load_delta, *p; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + int i; + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_PAGE_MASK) + for (;;); + + /* Fixup the physical addresses in the page table */ + + pgd = fixup_pointer(&early_top_pgt, physaddr); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d[511] += load_delta; + } + + pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud[510] += load_delta; + pud[511] += load_delta; + + pmd = fixup_pointer(level2_fixmap_pgt, physaddr); + pmd[506] += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. 
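
__startup_64() above runs before the kernel is reachable at its link-time virtual addresses, so fixup_pointer() rebases every global it touches by keeping the offset from _text and swapping in the physical load address. The sketch below shows the same arithmetic on made-up base addresses; it is an illustration only, not the kernel helper.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    static uint64_t fixup(uint64_t addr, uint64_t link_base, uint64_t load_base)
    {
            /* Keep the offset into the image, swap the base it is relative to. */
            return addr - link_base + load_base;
    }

    int main(void)
    {
            uint64_t link_base  = 0xffffffff81000000ull;    /* made-up link-time base  */
            uint64_t load_base  = 0x0000000001000000ull;    /* made-up physical base   */
            uint64_t linked_sym = link_base + 0x2340;       /* a symbol's link address */

            printf("linked at %#" PRIx64 ", usable early at %#" PRIx64 "\n",
                   linked_sym, fixup(linked_sym, link_base, load_base));
            return 0;
    }
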
+ */ + + pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + + i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; + p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + } + + i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; + pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; + pmd[idx] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + */ + + pmd = fixup_pointer(level2_kernel_pgt, physaddr); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + } + + /* Fixup phys_base */ + p = fixup_pointer(&phys_base, physaddr); + *p += load_delta; +} + /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); + memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); + write_cr3(__pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; + p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) return -1; again: - pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd_p = &early_top_pgt[pgd_index(address)].pgd; pgd = *pgd_p; /* @@ -67,8 +171,25 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... 
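
The identity-map setup in this hunk picks a slot at each paging level with a shift-and-mask of the physical address. The sketch below hard-codes the usual 4-level x86-64 constants (4K pages, 512 entries per table) as an assumption and prints the indices for one example address:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    #define PTRS_PER_TABLE 512
    #define PMD_SHIFT      21
    #define PUD_SHIFT      30
    #define PGDIR_SHIFT    39

    static unsigned int idx(uint64_t addr, unsigned int shift)
    {
            return (addr >> shift) % PTRS_PER_TABLE;
    }

    int main(void)
    {
            uint64_t physaddr = 0x0000000004200000ull;      /* example load address */

            printf("pgd index %u, pud index %u, pmd index %u for %#" PRIx64 "\n",
                   idx(physaddr, PGDIR_SHIFT),
                   idx(physaddr, PUD_SHIFT),
                   idx(physaddr, PMD_SHIFT),
                   physaddr);
            return 0;
    }
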
*/ - if (pgd) - pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = pgd_p; + else if (pgd) + p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + p4d_p += p4d_index(address); + p4d = *p4d_p; + + if (p4d) + pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); @@ -77,7 +198,7 @@ again: pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); pud = *pud_p; @@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); - clear_page(init_level4_pgt); + clear_page(init_top_pgt); kasan_early_init(); @@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - /* set init_level4_pgt kernel high mapping*/ - init_level4_pgt[511] = early_level4_pgt[511]; + /* set init_top_pgt kernel high mapping*/ + init_top_pgt[511] = early_top_pgt[511]; x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ac9d327d2e42..6225550883df 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -37,10 +37,11 @@ * */ +#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) -L4_START_KERNEL = pgd_index(__START_KERNEL_map) +PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) +PGD_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -72,101 +73,12 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu - /* - * Compute the delta between the address I am compiled to run at and the - * address I am actually running at. - */ - leaq _text(%rip), %rbp - subq $_text - __START_KERNEL_map, %rbp - - /* Is the address not 2M aligned? */ - testl $~PMD_PAGE_MASK, %ebp - jnz bad_address - - /* - * Is the address too large? - */ - leaq _text(%rip), %rax - shrq $MAX_PHYSMEM_BITS, %rax - jnz bad_address - - /* - * Fixup the physical addresses in the page table - */ - addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) - - addq %rbp, level3_kernel_pgt + (510*8)(%rip) - addq %rbp, level3_kernel_pgt + (511*8)(%rip) - - addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. 
- */ leaq _text(%rip), %rdi - leaq early_level4_pgt(%rip), %rbx - - movq %rdi, %rax - shrq $PGDIR_SHIFT, %rax - - leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx - movq %rdx, 0(%rbx,%rax,8) - movq %rdx, 8(%rbx,%rax,8) - - addq $PAGE_SIZE, %rdx - movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - incl %eax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - - addq $PAGE_SIZE * 2, %rbx - movq %rdi, %rax - shrq $PMD_SHIFT, %rdi - addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax - leaq (_end - 1)(%rip), %rcx - shrq $PMD_SHIFT, %rcx - subq %rdi, %rcx - incl %ecx - -1: - andq $(PTRS_PER_PMD - 1), %rdi - movq %rax, (%rbx,%rdi,8) - incq %rdi - addq $PMD_SIZE, %rax - decl %ecx - jnz 1b - - test %rbp, %rbp - jz .Lskip_fixup + pushq %rsi + call __startup_64 + popq %rsi - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - */ - leaq level2_kernel_pgt(%rip), %rdi - leaq PAGE_SIZE(%rdi), %r8 - /* See if it is a valid page table entry */ -1: testb $_PAGE_PRESENT, 0(%rdi) - jz 2f - addq %rbp, 0(%rdi) - /* Go to the next page */ -2: addq $8, %rdi - cmp %r8, %rdi - jne 1b - - /* Fixup phys_base */ - addq %rbp, phys_base(%rip) - -.Lskip_fixup: - movq $(early_level4_pgt - __START_KERNEL_map), %rax + movq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -186,14 +98,17 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq $(init_top_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ + /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx +#ifdef CONFIG_X86_5LEVEL + orl $X86_CR4_LA57, %ecx +#endif movq %rcx, %cr4 - /* Setup early boot stage 4 level pagetables. */ + /* Setup early boot stage 4-/5-level pagetables. 
*/ addq phys_base(%rip), %rax movq %rax, %cr3 @@ -417,9 +332,13 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PAGE(early_top_pgt) .fill 511,8,0 +#ifdef CONFIG_X86_5LEVEL + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#endif NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) +NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif +#ifdef CONFIG_X86_5LEVEL +NEXT_PAGE(level4_kernel_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#endif + NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 89ff7af2de50..16f82a3aaec7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -285,7 +285,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index); clockevents_config_and_register(&hpet_clockevent, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c37bd0f39c70..ab4f491da2a9 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d4a15831ac58..a870910c8565 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -22,24 +22,25 @@ #include <asm/syscalls.h> /* context.lock is held for us, so we don't need any locking. */ -static void flush_ldt(void *current_mm) +static void flush_ldt(void *__mm) { + struct mm_struct *mm = __mm; mm_context_t *pc; - if (current->active_mm != current_mm) + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = ¤t->active_mm->context; - set_ldt(pc->ldt->entries, pc->ldt->size); + pc = &mm->context; + set_ldt(pc->ldt->entries, pc->ldt->nr_entries); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
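
The ldt.c hunk here sizes the descriptor table from an entry count and, on the free side, picks vfree or free_page depending on whether the table exceeds one page. A userspace sketch of that sizing decision follows; the constants and the two "allocators" are simplified stand-ins, not the kernel APIs.

    #include <stdio.h>
    #include <stdlib.h>

    #define ENTRY_SIZE  8
    #define PAGE_SIZE   4096
    #define MAX_ENTRIES 8192

    static void *alloc_table(unsigned int nr_entries, const char **how)
    {
            size_t bytes;

            if (nr_entries > MAX_ENTRIES)
                    return NULL;

            bytes = (size_t)nr_entries * ENTRY_SIZE;
            *how  = bytes > PAGE_SIZE ? "big (vmalloc-like)" : "single page";
            return calloc(nr_entries, ENTRY_SIZE);  /* zeroed, like the LDT */
    }

    int main(void)
    {
            const char *how;
            void *t;

            t = alloc_table(16, &how);
            printf("16 entries   -> %s\n", how);
            free(t);

            t = alloc_table(1024, &how);
            printf("1024 entries -> %s\n", how);
            free(t);
            return 0;
    }
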
*/ -static struct ldt_struct *alloc_ldt_struct(unsigned int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; - if (size > LDT_ENTRIES) + if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); @@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); - alloc_size = size * LDT_ENTRY_SIZE; + alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no @@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; } - new_ldt->size = size; + new_ldt->nr_entries = num_entries; return new_ldt; } /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { - paravirt_alloc_ldt(ldt->entries, ldt->size); + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } /* context.lock is held */ @@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; - paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + paravirt_free_ldt(ldt->entries, ldt->nr_entries); + if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); @@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) goto out_unlock; } - new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, - new_ldt->size * LDT_ENTRY_SIZE); + new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); mm->context.ldt = new_ldt; @@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm) static int read_ldt(void __user *ptr, unsigned long bytecount) { - int retval; - unsigned long size; struct mm_struct *mm = current->mm; + unsigned long entries_size; + int retval; mutex_lock(&mm->context.lock); @@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - size = mm->context.ldt->size * LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; + entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; + if (entries_size > bytecount) + entries_size = bytecount; - if (copy_to_user(ptr, mm->context.ldt->entries, size)) { + if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } - if (size != bytecount) { + if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ - if (clear_user(ptr + size, bytecount - size)) { + if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } @@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; - unsigned int oldsize, newsize; + unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; @@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); - old_ldt = mm->context.ldt; - oldsize = old_ldt ? 
old_ldt->size : 0; - newsize = max(ldt_info.entry_number + 1, oldsize); + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; + new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; - new_ldt = alloc_ldt_struct(newsize); + new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) - memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); + memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); + new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6f5ca4ebe6e5..cb0a30473c23 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); - VMCOREINFO_SYMBOL(init_level4_pgt); + VMCOREINFO_SYMBOL(init_top_pgt); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 6d9582ec0324..d27f8d84c4ff 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* Don't wait longer than a second */ timeout = USEC_PER_SEC; - while (!cpumask_empty(mask) && timeout--) + while (!cpumask_empty(mask) && --timeout) udelay(1); /* What happens if we timeout, do we still unregister?? */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3586996fc50d..bc0a849589bb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, + .read_cr3 = __native_read_cr3, .write_cr3 = native_write_cr3, .flush_tlb_user = native_flush_tlb, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ffeae818aa7a..c6d6dc5f8bb2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b6840bf3940b..c3169be4c596 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", @@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task) pr_warn("WARNING: dead process %s still has LDT? 
<%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt->entries, - dead_task->mm->context.ldt->size); + dead_task->mm->context.ldt->nr_entries); BUG(); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f81823695014..65622f07e633 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f04479a8f74f..b474c8de7fba 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid) if (cpu == 1) printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state == SYSTEM_BOOTING) { + if (system_state < SYSTEM_RUNNING) { if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); @@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu) void play_dead_common(void) { idle_task_exit(); - reset_lazy_tlbstate(); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index f07f83b3611b..5f25cfbd952e 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re mutex_lock(&child->mm->context.lock); if (unlikely(!child->mm->context.ldt || - seg >= child->mm->context.ldt->size)) + seg >= child->mm->context.ldt->nr_entries)) addr = -1L; /* bogus selector, access would fault */ else { desc = &child->mm->context.ldt->entries[seg]; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d39c09119db6..e0754cdbad37 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -66,7 +66,7 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init setup_default_timer_irq(void) +static void __init setup_default_timer_irq(void) { if (!nr_legacy_irqs()) return; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 714dfba6a1e7..5270fc0c2df6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; struct clocksource *art_related_clocksource; -/* - * Use a ring-buffer like data structure, where a writer advances the head by - * writing a new data entry and a reader advances the tail when it observes a - * new entry. - * - * Writers are made to wait on readers until there's space to write a new - * entry. - * - * This means that we can always use an {offset, mul} pair to compute a ns - * value that is 'roughly' in the right direction, even if we're writing a new - * {offset, mul} pair during the clock read. - * - * The down-side is that we can no longer guarantee strict monotonicity anymore - * (assuming the TSC was that to begin with), because while we compute the - * intersection point of the two clock slopes and make sure the time is - * continuous at the point of switching; we can no longer guarantee a reader is - * strictly before or after the switch point. - * - * It does mean a reader no longer needs to disable IRQs in order to avoid - * CPU-Freq updates messing with his times, and similarly an NMI reader will - * no longer run the risk of hitting half-written state. 
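
The nmi_selftest tweak above swaps "timeout--" for "--timeout" in the wait loop. The difference matters when the counter is examined afterwards: the post-decrement form runs one extra iteration and underflows past zero on expiry, while the pre-decrement form ends exactly at zero. A minimal demonstration:

    #include <stdio.h>

    int main(void)
    {
            unsigned int a = 3, b = 3;
            int busy = 1;                   /* condition that never clears */

            while (busy && a--)
                    ;                       /* post-decrement: body runs 3 times */
            while (busy && --b)
                    ;                       /* pre-decrement: body runs 2 times  */

            printf("post-decrement leaves %u, pre-decrement leaves %u\n", a, b);
            return 0;
    }
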
- */ - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ - struct cyc2ns_data *head; /* 48 + 8 = 56 */ - struct cyc2ns_data *tail; /* 56 + 8 = 64 */ -}; /* exactly fits one cacheline */ - -static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); - -struct cyc2ns_data *cyc2ns_read_begin(void) -{ - struct cyc2ns_data *head; - - preempt_disable(); - - head = this_cpu_read(cyc2ns.head); - /* - * Ensure we observe the entry when we observe the pointer to it. - * matches the wmb from cyc2ns_write_end(). - */ - smp_read_barrier_depends(); - head->__count++; - barrier(); + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_t seq; /* 32 + 4 = 36 */ - return head; -} +}; /* fits one cacheline */ -void cyc2ns_read_end(struct cyc2ns_data *head) -{ - barrier(); - /* - * If we're the outer most nested read; update the tail pointer - * when we're done. This notifies possible pending writers - * that we've observed the head pointer and that the other - * entry is now free. - */ - if (!--head->__count) { - /* - * x86-TSO does not reorder writes with older reads; - * therefore once this write becomes visible to another - * cpu, we must be finished reading the cyc2ns_data. - * - * matches with cyc2ns_write_begin(). - */ - this_cpu_write(cyc2ns.tail, head); - } - preempt_enable(); -} +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -/* - * Begin writing a new @data entry for @cpu. - * - * Assumes some sort of write side lock; currently 'provided' by the assumption - * that cpufreq will call its notifiers sequentially. - */ -static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +void cyc2ns_read_begin(struct cyc2ns_data *data) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - struct cyc2ns_data *data = c2n->data; + int seq, idx; - if (data == c2n->head) - data++; + preempt_disable_notrace(); - /* XXX send an IPI to @cpu in order to guarantee a read? */ + do { + seq = this_cpu_read(cyc2ns.seq.sequence); + idx = seq & 1; - /* - * When we observe the tail write from cyc2ns_read_end(), - * the cpu must be done with that entry and its safe - * to start writing to it. - */ - while (c2n->tail == data) - cpu_relax(); + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - return data; + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +void cyc2ns_read_end(void) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - /* - * Ensure the @data writes are visible before we publish the - * entry. Matches the data-depencency in cyc2ns_read_begin(). - */ - smp_wmb(); - - ACCESS_ONCE(c2n->head) = data; + preempt_enable_notrace(); } /* @@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_mul = 0; data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; - data->__count = 0; } static void cyc2ns_init(int cpu) @@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu) cyc2ns_data_init(&c2n->data[0]); cyc2ns_data_init(&c2n->data[1]); - c2n->head = c2n->data; - c2n->tail = c2n->data; + seqcount_init(&c2n->seq); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data, *tail; + struct cyc2ns_data data; unsigned long long ns; - /* - * See cyc2ns_read_*() for details; replicated in order to avoid - * an extra few instructions that came with the abstraction. 
- * Notable, it allows us to only do the __count and tail update - * dance when its actually needed. - */ - - preempt_disable_notrace(); - data = this_cpu_read(cyc2ns.head); - tail = this_cpu_read(cyc2ns.tail); - - if (likely(data == tail)) { - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - } else { - data->__count++; - - barrier(); - - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); - barrier(); + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - if (!--data->__count) - this_cpu_write(cyc2ns.tail, data); - } - preempt_enable_notrace(); + cyc2ns_read_end(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { - unsigned long long tsc_now, ns_now; - struct cyc2ns_data *data; + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; unsigned long flags; local_irq_save(flags); @@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) if (!khz) goto done; - data = cyc2ns_write_begin(cpu); - - tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ - if (data->cyc2ns_shift == 32) { - data->cyc2ns_shift = 31; - data->cyc2ns_mul >>= 1; + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; } - data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); + + c2n = per_cpu_ptr(&cyc2ns, cpu); - cyc2ns_write_end(cpu, data); + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } + /* * Scheduler clock - returns current time in nanosec units. */ @@ -374,6 +273,8 @@ static int __init tsc_setup(char *str) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); return 1; } @@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ - /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency * changes. 
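
The cyc2ns rework above replaces the hand-rolled head/tail ring with a latch: the writer bumps a sequence count around updates of two data slots, and readers pick slot (seq & 1) and retry if the count moved under them. Below is a userspace sketch of that access pattern with fields loosely modeled on cyc2ns_data; it is a single-threaded illustration only, and a real concurrent version needs the memory ordering the kernel's seqcount_latch primitives provide.

    #include <stdio.h>
    #include <stdatomic.h>

    struct scale { unsigned long long offset; unsigned int mul, shift; };

    static _Atomic unsigned int seq;
    static struct scale slot[2];

    static void latch_write(struct scale s)
    {
            atomic_fetch_add(&seq, 1);      /* seq odd: readers now use slot[1] */
            slot[0] = s;
            atomic_fetch_add(&seq, 1);      /* seq even: readers now use slot[0] */
            slot[1] = s;
    }

    static struct scale latch_read(void)
    {
            struct scale s;
            unsigned int start;

            do {
                    start = atomic_load(&seq);
                    s = slot[start & 1];
            } while (start != atomic_load(&seq));   /* retry if a write raced us */

            return s;
    }

    int main(void)
    {
            latch_write((struct scale){ .offset = 1000, .mul = 3, .shift = 1 });

            struct scale s = latch_read();
            printf("offset %llu mul %u shift %u\n", s.offset, s.mul, s.shift);
            return 0;
    }
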
* @@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); } return 0; @@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) pr_info("Marking TSC unstable due to clocksource watchdog\n"); } +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = { .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) @@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; + int cpu; /* Don't bother refining TSC on unstable systems */ if (check_tsc_unstable()) @@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource); void __init tsc_init(void) { - u64 lpj; + u64 lpj, cyc; int cpu; if (!boot_cpu_has(X86_FEATURE_TSC)) { @@ -1390,9 +1305,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ + cyc = rdtsc(); for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu, cyc); } if (tsc_disabled > 0) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 728f75378475..7842371bc9e4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -71,13 +71,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. - * - * But we always force positive ADJUST values. Otherwise the TSC - * deadline timer creates an interrupt storm. We also have to - * prevent values > 0x7FFFFFFF as those wreckage the timer as well. */ - if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || - (bootval > 0x7FFFFFFF)) { + if (bootcpu && bootval != 0) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); wrmsrl(MSR_IA32_TSC_ADJUST, 0); @@ -451,20 +446,6 @@ retry: */ cur->adjusted += cur_max_warp; - /* - * TSC deadline timer stops working or creates an interrupt storm - * with adjust values < 0 and > x07ffffff. - * - * To allow adjust values > 0x7FFFFFFF we need to disable the - * deadline timer and use the local APIC timer, but that requires - * more intrusive changes and we do not have any useful information - * from Intel about the underlying HW wreckage yet. - */ - if (cur->adjusted < 0) - cur->adjusted = 0; - if (cur->adjusted > 0x7FFFFFFF) - cur->adjusted = 0x7FFFFFFF; - pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. 
Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ca5d2b93385c..7dd53fbf2f56 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -48,6 +48,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/mmu_context.h> #include "trace.h" #include "pmu.h" @@ -596,6 +597,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { @@ -5012,12 +5014,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr0, cr4; + unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->host_state.vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); @@ -8820,7 +8829,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + unsigned long debugctlmsr, cr3, cr4; /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ @@ -8842,6 +8851,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->host_state.vmcs_host_cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 5e044d506b7a..a179254a5122 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) #ifdef CONFIG_MODIFY_LDT_SYSCALL seg >>= 3; mutex_lock(¤t->mm->context.lock); - if (current->mm->context.ldt && seg < current->mm->context.ldt->size) + if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries) ret = current->mm->context.ldt->entries[seg]; mutex_unlock(¤t->mm->context.lock); #endif diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..0fbdcb64f9f8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index bce6990b1d81..0470826d2bdc 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) { #ifdef CONFIG_X86_64 - 
pgd_t *start = (pgd_t *) &init_level4_pgt; + pgd_t *start = (pgd_t *) &init_top_pgt; #else pgd_t *start = swapper_pg_dir; #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8ad91a01cbc8..2a1fa10c6a98 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address) * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ - pgd_paddr = read_cr3(); + pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; @@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; @@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); + pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -555,7 +555,7 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; @@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, pgd_t *pgd; pte_t *pte; - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd(pgd, address, &level); diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 456dfdfd2249..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Lockless get_user_pages_fast for x86 - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/memremap.h> - -#include <asm/mmu_context.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X86_PAE - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a pte will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. 
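/*
 * A minimal, self-contained C11 model of the lockless PAE pte read that the
 * deleted comment above describes (load low half, barrier, load high half,
 * barrier, recheck the low half and retry). struct pte64, read_pte64() and
 * the acquire fences are stand-ins for illustration only; the kernel code
 * below uses plain loads separated by smp_rmb().
 */
#include <stdatomic.h>
#include <stdint.h>

struct pte64 {
	_Atomic uint32_t pte_low;
	_Atomic uint32_t pte_high;
};

static uint64_t read_pte64(struct pte64 *p)
{
	uint32_t lo, hi;

	do {
		lo = atomic_load_explicit(&p->pte_low, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() analogue */
		hi = atomic_load_explicit(&p->pte_high, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() analogue */
	} while (atomic_load_explicit(&p->pte_low, memory_order_relaxed) != lo);

	return ((uint64_t)hi << 32) | lo;
}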
- * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - -/* - * 'pteval' can come from a pte, pmd, pud or p4d. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 4 types. - */ -static inline int pte_allows_gup(unsigned long pteval, int write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - /* Check memory protection keys permissions. */ - if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) - return 0; - - return 1; -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; - pte_t *ptep, *ptem; - - /* - * Keep the original mapped PTE value (ptem) around since we - * might increment ptep off the end of the page when finishing - * our loop iteration. - */ - ptem = ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - break; - - if (!pte_allows_gup(pte_val(pte), write)) - break; - - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - break; - } - } else if (pte_special(pte)) - break; - - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - put_dev_pagemap(pgmap); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - if (addr == end) - ret = 1; - pte_unmap(ptem); - - return ret; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON_PAGE(page != compound_head(page), page); - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - return 0; - } - SetPageReferenced(page); - pages[*nr] = page; - get_page(page); - put_dev_pagemap(pgmap); - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - return 1; -} - -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, 
- unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pmd_val(pmd), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); - if (pmd_devmap(pmd)) - return __gup_device_huge_pmd(pmd, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static noinline int gup_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pud_val(pud), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - if (pud_devmap(pud)) - return __gup_device_huge_pud(pud, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&p4d, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp; - - p4dp = p4d_offset(&pgd, addr); - do { - p4d_t p4d = *p4dp; - - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - BUILD_BUG_ON(p4d_large(p4d)); - if (!gup_pud_range(p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular 
GUP. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - -#ifdef CONFIG_X86_64 - if (end >> __VIRTUAL_MASK_SHIFT) - goto slow_irqon; -#endif - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. 
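/*
 * Sketch of the return-value rule used at the end of the (deleted)
 * get_user_pages_fast() further below: pages pinned on the fast path are
 * never thrown away, and a slow-path error is only reported when nothing
 * was pinned at all. merge_gup_results() is a stand-in name, not a kernel
 * helper.
 */
static int merge_gup_results(int nr_fast, int slow_ret)
{
	if (nr_fast > 0) {
		if (slow_ret < 0)
			return nr_fast;		/* report what was pinned */
		return nr_fast + slow_ret;	/* fast-path + slow-path pages */
	}
	return slow_ret;			/* nothing pinned: pass the result through */
}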
- */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9b3f9fa5b283..673541eb3b3f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -811,10 +811,8 @@ void __init zone_sizes_init(void) } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { -#ifdef CONFIG_SMP - .active_mm = &init_mm, + .loaded_mm = &init_mm, .state = 0, -#endif .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 95651dc58e09..b863d14e452a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup); * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ +#ifdef CONFIG_X86_5LEVEL +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long addr; + + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + const pgd_t *pgd_ref = pgd_offset_k(addr); + struct page *page; + + /* Check for overflow */ + if (addr < start) + break; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} +#else void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long addr; @@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } +#endif /* * NOTE: This function is marked __ref because it calls __init function @@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, return paddr_last; } +static unsigned long __meminit +phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) +{ + unsigned long paddr_next, paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = p4d_index(vaddr); + + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + + for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d; + pud_t *pud; + + vaddr = (unsigned long)__va(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RAM) && + 
!e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_p4d(p4d, __p4d(0)); + continue; + } + + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, 0); + paddr_last = phys_pud_init(pud, paddr, + paddr_end, + page_size_mask); + __flush_tlb_all(); + continue; + } + + pud = alloc_low_page(); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + p4d_populate(&init_mm, p4d, pud); + spin_unlock(&init_mm.page_table_lock); + } + __flush_tlb_all(); + + return paddr_last; +} + /* * Create page table mapping for the physical memory for specific physical * addresses. The virtual and physical addresses have to be aligned on PMD level @@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); p4d_t *p4d; - pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - BUILD_BUG_ON(pgd_none(*pgd)); - p4d = p4d_offset(pgd, vaddr); - if (p4d_val(*p4d)) { - pud = (pud_t *)p4d_page_vaddr(*p4d); - paddr_last = phys_pud_init(pud, __pa(vaddr), + if (pgd_val(*pgd)) { + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask); continue; } - pud = alloc_low_page(); - paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), + p4d = alloc_low_page(); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask); spin_lock(&init_mm.page_table_lock); - p4d_populate(&init_mm, p4d, pud); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + pgd_populate(&init_mm, pgd, p4d); + else + p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bbc558b88a88..4c1b5fd0c7ad 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { /* Don't assume we're using swapper_pg_dir at this point */ - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(addr)]; p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0c7d8129bed6..88215ac16b24 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -12,7 +12,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> -extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -109,8 +109,8 @@ void __init kasan_early_init(void) for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); - kasan_map_early_shadow(early_level4_pgt); - kasan_map_early_shadow(init_level4_pgt); + kasan_map_early_shadow(early_top_pgt); + kasan_map_early_shadow(init_top_pgt); } void __init kasan_init(void) @@ -121,8 +121,8 @@ void __init kasan_init(void) register_die_notifier(&kasan_die_notifier); #endif - memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); - load_cr3(early_level4_pgt); + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + load_cr3(early_top_pgt); __flush_tlb_all(); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -148,7 +148,7 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void 
*)MODULES_END), (void *)KASAN_SHADOW_END); - load_cr3(init_level4_pgt); + load_cr3(init_top_pgt); __flush_tlb_all(); /* diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aed206475aa7..af599167fe3c 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -6,12 +6,12 @@ * * Entropy is generated using the KASLR early boot functions now shared in * the lib directory (originally written by Kees Cook). Randomization is - * done on PGD & PUD page table levels to increase possible addresses. The - * physical memory mapping code was adapted to support PUD level virtual - * addresses. This implementation on the best configuration provides 30,000 - * possible virtual addresses in average for each memory region. An additional - * low memory page is used to ensure each CPU can start with a PGD aligned - * virtual address (for realmode). + * done on PGD & P4D/PUD page table levels to increase possible addresses. + * The physical memory mapping code was adapted to support P4D/PUD level + * virtual addresses. This implementation on the best configuration provides + * 30,000 possible virtual addresses in average for each memory region. + * An additional low memory page is used to ensure each CPU can start with + * a PGD aligned virtual address (for realmode). * * The order of each memory region is not changed. The feature looks at * the available space for the regions based on different configuration @@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 64/* Maximum */ }, + { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, { &vmemmap_base, 1 }, }; @@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - entropy = (rand % (entropy + 1)) & PUD_MASK; + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + entropy = (rand % (entropy + 1)) & P4D_MASK; + else + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - vaddr = round_up(vaddr + 1, PUD_SIZE); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + vaddr = round_up(vaddr + 1, P4D_SIZE); + else + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } -/* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. 
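/*
 * The kaslr.c hunk above and the trampoline rework below both dispatch on
 * IS_ENABLED(CONFIG_X86_5LEVEL) rather than #ifdef, so both branches are
 * compiled and type-checked while the dead one is discarded. A minimal
 * sketch of the pattern (kaslr_round_region() is a stand-in name):
 */
static unsigned long kaslr_round_region(unsigned long vaddr)
{
	if (IS_ENABLED(CONFIG_X86_5LEVEL))
		return round_up(vaddr + 1, P4D_SIZE);

	return round_up(vaddr + 1, PUD_SIZE);
}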
- */ -void __meminit init_trampoline(void) +static void __meminit init_trampoline_pud(void) { unsigned long paddr, paddr_next; pgd_t *pgd; pud_t *pud_page, *pud_page_tramp; int i; - if (!kaslr_memory_enabled()) { - init_trampoline_default(); - return; - } - pud_page_tramp = alloc_low_page(); paddr = 0; @@ -192,3 +189,49 @@ void __meminit init_trampoline(void) set_pgd(&trampoline_pgd_entry, __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); } + +static void __meminit init_trampoline_p4d(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + p4d_t *p4d_page, *p4d_page_tramp; + int i; + + p4d_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); + + for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d, *p4d_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + p4d_tramp = p4d_page_tramp + p4d_index(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + *p4d_tramp = *p4d; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. + */ +void __meminit init_trampoline(void) +{ + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + init_trampoline_p4d(); + else + init_trampoline_pud(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6e7bedf69af7..b2485d69f7c2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,7 +15,7 @@ #include <linux/debugfs.h> /* - * Smarter SMP flushing macros. + * TLB flushing, formerly SMP-only * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -28,39 +28,28 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -#ifdef CONFIG_SMP - -struct flush_tlb_info { - struct mm_struct *flush_mm; - unsigned long flush_start; - unsigned long flush_end; -}; - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ void leave_mm(int cpu) { - struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + + /* + * It's plausible that we're in lazy TLB mode while our mm is init_mm. + * If so, our callers still expect us to flush the TLB, but there + * aren't any user TLB entries in init_mm to worry about. + * + * This needs to happen before any other sanity checks due to + * intel_idle's shenanigans. + */ + if (loaded_mm == &init_mm) + return; + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { - cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); - /* - * This gets called in the idle path where RCU - * functions differently. Tracing normally - * uses RCU, so we have to call the tracepoint - * specially here. 
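/*
 * The tlb.c rework around this point replaces cpu_tlbstate.active_mm with
 * cpu_tlbstate.loaded_mm, which is kept in sync with the page tables that
 * are actually loaded in CR3. A debugging-style sketch of that invariant
 * (assert_loaded_mm_in_sync() is a stand-in, not part of the patch; call
 * it with preemption disabled):
 */
static void assert_loaded_mm_in_sync(void)
{
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	WARN_ON_ONCE(read_cr3_pa() != __pa(mm->pgd));
}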
- */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } + + switch_mm(NULL, &init_mm, NULL); } EXPORT_SYMBOL_GPL(leave_mm); -#endif /* CONFIG_SMP */ - void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -75,116 +64,94 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { unsigned cpu = smp_processor_id(); + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); - if (likely(prev != next)) { - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); - } + /* + * NB: The scheduler will call us with prev == next when + * switching from lazy TLB mode to normal mode if active_mm + * isn't changing. When this happens, there is no guarantee + * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * + * NB: leave_mm() calls us with prev == NULL and tsk == NULL. + */ -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - cpumask_set_cpu(cpu, mm_cpumask(next)); + if (real_prev == next) { + /* + * There's nothing to do: we always keep the per-mm control + * regs in sync with cpu_tlbstate.loaded_mm. Just + * sanity-check mm_cpumask. + */ + if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + return; + } + if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. */ - load_cr3(next->pgd); + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + pgd_t *pgd = next->pgd + stack_pgd_index; - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } - /* Load per-mm CR4 state */ - load_mm_cr4(next); + this_cpu_write(cpu_tlbstate.loaded_mm, next); -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. 
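/*
 * The VMAP_STACK guard kept by the rewritten switch_mm_irqs_off() above,
 * restated as a stand-alone helper for clarity (sync_current_stack_pgd()
 * is a stand-in name): if the vmalloc'ed stack is not mapped in the next
 * mm's pgd, the first stack access after load_cr3() would double-fault.
 */
static void sync_current_stack_pgd(struct mm_struct *next)
{
	unsigned int index = pgd_index(current_stack_pointer());
	pgd_t *pgd = next->pgd + index;

	if (unlikely(pgd_none(*pgd)))
		set_pgd(pgd, init_mm.pgd[index]);
}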
- * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); + WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + cpumask_set_cpu(cpu, mm_cpumask(next)); - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + */ + load_cr3(next->pgd); -#ifdef CONFIG_SMP + /* + * This gets called via leave_mm() in the idle path where RCU + * functions differently. Tracing normally uses RCU, so we have to + * call the tracepoint specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + + /* Load per-mm CR4 and LDTR state */ + load_mm_cr4(next); + switch_ldt(real_prev, next); +} /* * The flush IPI assumes that a thread switch happens in this order: @@ -222,69 +189,75 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * write/read ordering problems. */ -/* - * TLB flush funcation: - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. 
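/*
 * The next hunk splits the deleted flush_tlb_func() into a common core with
 * local and remote entry points. Condensed from the rewritten
 * flush_tlb_mm_range() further down, callers are expected to drive the pair
 * like this (example_mm_flush() is a stand-in name):
 */
static void example_mm_flush(struct mm_struct *mm, struct flush_tlb_info *info)
{
	int cpu = get_cpu();

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm))
		flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), info);

	put_cpu();
}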
- */ -static void flush_tlb_func(void *info) +static void flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) { - struct flush_tlb_info *f = info; + if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { + leave_mm(smp_processor_id()); + return; + } + + if (f->end == TLB_FLUSH_ALL) { + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); + } else { + unsigned long addr; + unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; + while (addr < f->end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); + trace_tlb_flush(reason, nr_pages); + } +} + +static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +{ + const struct flush_tlb_info *f = info; + + flush_tlb_func_common(f, true, reason); +} + +static void flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; inc_irq_stat(irq_tlb_count); - if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) return; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) { - local_flush_tlb(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); - } else { - unsigned long addr; - unsigned long nr_pages = - (f->flush_end - f->flush_start) / PAGE_SIZE; - addr = f->flush_start; - while (addr < f->flush_end) { - __flush_tlb_single(addr); - addr += PAGE_SIZE; - } - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); - } - } else - leave_mm(smp_processor_id()); - + flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); } void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end) + const struct flush_tlb_info *info) { - struct flush_tlb_info info; - - info.flush_mm = mm; - info.flush_start = start; - info.flush_end = end; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - if (end == TLB_FLUSH_ALL) + if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); else trace_tlb_flush(TLB_REMOTE_SEND_IPI, - (end - start) >> PAGE_SHIFT); + (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { unsigned int cpu; cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); + cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func, - &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func, &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); } /* @@ -302,85 +275,36 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { - unsigned long addr; - /* do a global flush by default */ - unsigned long base_pages_to_flush = TLB_FLUSH_ALL; - - preempt_disable(); - - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - if (base_pages_to_flush > tlb_single_page_flush_ceiling) - base_pages_to_flush = TLB_FLUSH_ALL; - - if (current->active_mm != mm) { - /* Synchronize with switch_mm. 
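/*
 * The rewritten flush_tlb_mm_range() below picks between a ranged and a
 * full flush up front. With the default tlb_single_page_flush_ceiling of
 * 33 pages, a 128 KiB (32 page) unmap is flushed page by page while a
 * 256 KiB one falls back to a full TLB flush. Sketch of that decision
 * (example_fill_info() is a stand-in name):
 */
static void example_fill_info(struct flush_tlb_info *info, unsigned long start,
			      unsigned long end, unsigned long vmflag)
{
	if (end != TLB_FLUSH_ALL && !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info->start = start;
		info->end = end;
	} else {
		info->start = 0UL;
		info->end = TLB_FLUSH_ALL;
	}
}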
*/ - smp_mb(); - - goto out; - } + int cpu; - if (!current->mm) { - leave_mm(smp_processor_id()); + struct flush_tlb_info info = { + .mm = mm, + }; - /* Synchronize with switch_mm. */ - smp_mb(); + cpu = get_cpu(); - goto out; - } + /* Synchronize with switch_mm. */ + smp_mb(); - /* - * Both branches below are implicit full barriers (MOV to CR or - * INVLPG) that synchronize with switch_mm. - */ - if (base_pages_to_flush == TLB_FLUSH_ALL) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); + /* Should we flush just the requested range? */ + if ((end != TLB_FLUSH_ALL) && + !(vmflag & VM_HUGETLB) && + ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + info.start = start; + info.end = end; } else { - /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - __flush_tlb_single(addr); - } - } - trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); -out: - if (base_pages_to_flush == TLB_FLUSH_ALL) { - start = 0UL; - end = TLB_FLUSH_ALL; + info.start = 0UL; + info.end = TLB_FLUSH_ALL; } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, end); - preempt_enable(); -} -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - } - } - - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); - - preempt_enable(); + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) + flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } + static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -401,7 +325,7 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) __flush_tlb_single(addr); } @@ -410,16 +334,35 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { struct flush_tlb_info info; - info.flush_start = start; - info.flush_end = end; + info.start = start; + info.end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + struct flush_tlb_info info = { + .mm = NULL, + .start = 0UL, + .end = TLB_FLUSH_ALL, + }; + + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, &batch->cpumask)) + flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); + + put_cpu(); +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -465,5 +408,3 @@ static int __init 
create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); - -#endif /* CONFIG_SMP */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 43b96f5f78ba..f084d8718ac4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1014,7 +1014,6 @@ static void __init __efi_enter_virtual_mode(void) * necessary relocation fixups for the new virtual addresses. */ efi_runtime_update_mappings(); - efi_dump_pagetable(); /* clean DUMMY object */ efi_delete_dummy_variable(); @@ -1029,6 +1028,8 @@ void __init efi_enter_virtual_mode(void) kexec_enter_virtual_mode(); else __efi_enter_virtual_mode(); + + efi_dump_pagetable(); } /* diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 3481268da3d0..52f7faa1538f 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -44,7 +44,14 @@ int __init efi_alloc_page_tables(void) } void efi_sync_low_kernel_mappings(void) {} -void __init efi_dump_pagetable(void) {} + +void __init efi_dump_pagetable(void) +{ +#ifdef CONFIG_EFI_PGT_DUMP + ptdump_walk_pgd_level(NULL, swapper_pg_dir); +#endif +} + int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index eb8dff15a7f6..9bf72f5bfedb 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)read_cr3(); + save_pgd = (pgd_t *)__read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); goto out; } @@ -589,7 +589,10 @@ void __init efi_runtime_update_mappings(void) void __init efi_dump_pagetable(void) { #ifdef CONFIG_EFI_PGT_DUMP - ptdump_walk_pgd_level(NULL, efi_pgd); + if (efi_enabled(EFI_OLD_MEMMAP)) + ptdump_walk_pgd_level(NULL, swapper_pg_dir); + else + ptdump_walk_pgd_level(NULL, efi_pgd); #endif } @@ -646,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map( efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_scratch.prev_cr3 = read_cr3(); + efi_scratch.prev_cr3 = __read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); __flush_tlb_all(); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index e0cf95a83f3f..8a99a2e96537 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -15,12 +15,66 @@ #include <asm/e820/api.h> #include <asm/efi.h> #include <asm/uv/uv.h> +#include <asm/cpu_device_id.h> #define EFI_MIN_RESERVE 5120 #define EFI_DUMMY_GUID \ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) +#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */ +#define QUARK_SECURITY_HEADER_SIZE 0x400 + +/* + * Header prepended to the standard EFI capsule on Quark systems the are based + * on Intel firmware BSP. + * @csh_signature: Unique identifier to sanity check signed module + * presence ("_CSH"). + * @version: Current version of CSH used. Should be one for Quark A0. + * @modulesize: Size of the entire module including the module header + * and payload. + * @security_version_number_index: Index of SVN to use for validation of signed + * module. + * @security_version_number: Used to prevent against roll back of modules. + * @rsvd_module_id: Currently unused for Clanton (Quark). + * @rsvd_module_vendor: Vendor Identifier. For Intel products value is + * 0x00008086. 
+ * @rsvd_date: BCD representation of build date as yyyymmdd, where + * yyyy=4 digit year, mm=1-12, dd=1-31. + * @headersize: Total length of the header including including any + * padding optionally added by the signing tool. + * @hash_algo: What Hash is used in the module signing. + * @cryp_algo: What Crypto is used in the module signing. + * @keysize: Total length of the key data including including any + * padding optionally added by the signing tool. + * @signaturesize: Total length of the signature including including any + * padding optionally added by the signing tool. + * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the + * chain, if there is a next header. + * @rsvd: Reserved, padding structure to required size. + * + * See also QuartSecurityHeader_t in + * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h + * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP + */ +struct quark_security_header { + u32 csh_signature; + u32 version; + u32 modulesize; + u32 security_version_number_index; + u32 security_version_number; + u32 rsvd_module_id; + u32 rsvd_module_vendor; + u32 rsvd_date; + u32 headersize; + u32 hash_algo; + u32 cryp_algo; + u32 keysize; + u32 signaturesize; + u32 rsvd_next_header; + u32 rsvd[2]; +}; + static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; static bool efi_no_storage_paranoia; @@ -504,3 +558,86 @@ bool efi_poweroff_required(void) { return acpi_gbl_reduced_hardware || acpi_no_s5; } + +#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH + +static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, + size_t hdr_bytes) +{ + struct quark_security_header *csh = *pkbuff; + + /* Only process data block that is larger than the security header */ + if (hdr_bytes < sizeof(struct quark_security_header)) + return 0; + + if (csh->csh_signature != QUARK_CSH_SIGNATURE || + csh->headersize != QUARK_SECURITY_HEADER_SIZE) + return 1; + + /* Only process data block if EFI header is included */ + if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE + + sizeof(efi_capsule_header_t)) + return 0; + + pr_debug("Quark security header detected\n"); + + if (csh->rsvd_next_header != 0) { + pr_err("multiple Quark security headers not supported\n"); + return -EINVAL; + } + + *pkbuff += csh->headersize; + cap_info->total_size = csh->headersize; + + /* + * Update the first page pointer to skip over the CSH header. 
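/*
 * The contract for capsule setup quirks is spelled out in
 * efi_capsule_setup_info() below: return > 0 to continue (after advancing
 * the buffer), 0 if more header bytes are needed, or a negative errno to
 * abort. A minimal handler following that contract would look roughly like
 * this; example_capsule_quirk(), example_header_present() and
 * EXAMPLE_HEADER_SIZE are hypothetical names, not kernel symbols.
 */
static int example_capsule_quirk(struct capsule_info *cap_info, void **pkbuff,
				 size_t hdr_bytes)
{
	if (hdr_bytes < EXAMPLE_HEADER_SIZE)
		return 0;			/* not enough data buffered yet */

	if (!example_header_present(*pkbuff))
		return 1;			/* nothing to strip, continue as-is */

	*pkbuff += EXAMPLE_HEADER_SIZE;		/* skip the vendor header */
	cap_info->total_size = EXAMPLE_HEADER_SIZE;

	return 1;				/* continue with generic setup */
}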
+ */ + cap_info->pages[0] += csh->headersize; + + return 1; +} + +#define ICPU(family, model, quirk_handler) \ + { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \ + (unsigned long)&quirk_handler } + +static const struct x86_cpu_id efi_capsule_quirk_ids[] = { + ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */ + { } +}; + +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes) +{ + int (*quirk_handler)(struct capsule_info *, void **, size_t); + const struct x86_cpu_id *id; + int ret; + + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + cap_info->total_size = 0; + + id = x86_match_cpu(efi_capsule_quirk_ids); + if (id) { + /* + * The quirk handler is supposed to return + * - a value > 0 if the setup should continue, after advancing + * kbuff as needed + * - 0 if not enough hdr_bytes are available yet + * - a negative error code otherwise + */ + quirk_handler = (typeof(quirk_handler))id->driver_data; + ret = quirk_handler(cap_info, &kbuff, hdr_bytes); + if (ret <= 0) + return ret; + } + + memcpy(&cap_info->header, kbuff, sizeof(cap_info->header)); + + cap_info->total_size += cap_info->header.imagesize; + + return __efi_capsule_setup_info(cap_info); +} + +#endif diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index c5350fd27d70..0668aaff8bfe 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { - void *pgd_addr = __va(read_cr3()); + void *pgd_addr = __va(read_cr3_pa()); /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 42e65fee5673..2983faab5b18 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) */ static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long ns; - ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); + ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); + cyc2ns_read_end(); - cyc2ns_read_end(data); return ns; } @@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long cyc; - cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + cyc2ns_read_begin(&data); + cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul; + cyc2ns_read_end(); - cyc2ns_read_end(data); return cyc; } @@ -1121,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. 
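/*
 * Several hunks in this series replace read_cr3() with either __read_cr3()
 * (the raw register value) or read_cr3_pa() (just the page-table base
 * address). A minimal sketch of the common "walk the live page table" use
 * of the latter, mirroring dump_pagetable() earlier in this series
 * (current_pgd_entry() is a stand-in name):
 */
static pgd_t *current_pgd_entry(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());

	return base + pgd_index(address);
}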
*/ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - unsigned int cpu) + const struct flush_tlb_info *info) { + unsigned int cpu = smp_processor_id(); int locals = 0, remotes = 0, hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; @@ -1179,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, record_send_statistics(stat, locals, hubs, remotes, bau_desc); - if (!end || (end - start) <= PAGE_SIZE) - address = start; + if (!info->end || (info->end - info->start) <= PAGE_SIZE) + address = info->start; else address = TLB_FLUSH_ALL; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6b05a9219ea2..78459a6d455a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt) */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); + ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); #ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 2ab1c5059a61..f2598d81cd55 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -150,7 +150,8 @@ static int relocate_restore_code(void) memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); if (p4d_large(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 2a2d89d39af6..bb026699ad19 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,13 +1,3 @@ -config MCE_AMD_INJ - tristate "Simple MCE injection interface for AMD processors" - depends on RAS && X86_MCE && DEBUG_FS && AMD_NB - default n - help - This is a simple debugfs interface to inject MCEs and test different - aspects of the MCE handling code. - - WARNING: Do not even assume this interface is staying stable! - config RAS_CEC bool "Correctable Errors Collector" depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS @@ -20,4 +10,3 @@ config RAS_CEC Bear in mind that this is absolutely useless if your platform doesn't have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. - diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile deleted file mode 100644 index 5f94546db280..000000000000 --- a/arch/x86/ras/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c deleted file mode 100644 index 8730c2882fff..000000000000 --- a/arch/x86/ras/mce_amd_inj.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * A simple MCE injection facility for testing different aspects of the RAS - * code. This driver should be built as module so that it can be loaded - * on production kernels for testing purposes. - * - * This file may be distributed under the terms of the GNU General Public - * License version 2. - * - * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de> - * Advanced Micro Devices Inc. 
- */ - -#include <linux/kobject.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/string.h> -#include <linux/uaccess.h> -#include <linux/pci.h> - -#include <asm/mce.h> -#include <asm/smp.h> -#include <asm/amd_nb.h> -#include <asm/irq_vectors.h> - -#include "../kernel/cpu/mcheck/mce-internal.h" - -/* - * Collect all the MCi_XXX settings - */ -static struct mce i_mce; -static struct dentry *dfs_inj; - -static u8 n_banks; - -#define MAX_FLAG_OPT_SIZE 3 -#define NBCFG 0x44 - -enum injection_type { - SW_INJ = 0, /* SW injection, simply decode the error */ - HW_INJ, /* Trigger a #MC */ - DFR_INT_INJ, /* Trigger Deferred error interrupt */ - THR_INT_INJ, /* Trigger threshold interrupt */ - N_INJ_TYPES, -}; - -static const char * const flags_options[] = { - [SW_INJ] = "sw", - [HW_INJ] = "hw", - [DFR_INT_INJ] = "df", - [THR_INT_INJ] = "th", - NULL -}; - -/* Set default injection to SW_INJ */ -static enum injection_type inj_type = SW_INJ; - -#define MCE_INJECT_SET(reg) \ -static int inj_##reg##_set(void *data, u64 val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - m->reg = val; \ - return 0; \ -} - -MCE_INJECT_SET(status); -MCE_INJECT_SET(misc); -MCE_INJECT_SET(addr); -MCE_INJECT_SET(synd); - -#define MCE_INJECT_GET(reg) \ -static int inj_##reg##_get(void *data, u64 *val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - *val = m->reg; \ - return 0; \ -} - -MCE_INJECT_GET(status); -MCE_INJECT_GET(misc); -MCE_INJECT_GET(addr); -MCE_INJECT_GET(synd); - -DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); - -/* - * Caller needs to be make sure this cpu doesn't disappear - * from under us, i.e.: get_cpu/put_cpu. - */ -static int toggle_hw_mce_inject(unsigned int cpu, bool enable) -{ - u32 l, h; - int err; - - err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); - if (err) { - pr_err("%s: error reading HWCR\n", __func__); - return err; - } - - enable ? (l |= BIT(18)) : (l &= ~BIT(18)); - - err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); - if (err) - pr_err("%s: error writing HWCR\n", __func__); - - return err; -} - -static int __set_inj(const char *buf) -{ - int i; - - for (i = 0; i < N_INJ_TYPES; i++) { - if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { - inj_type = i; - return 0; - } - } - return -EINVAL; -} - -static ssize_t flags_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE]; - int n; - - n = sprintf(buf, "%s\n", flags_options[inj_type]); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); -} - -static ssize_t flags_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE], *__buf; - int err; - - if (cnt > MAX_FLAG_OPT_SIZE) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt - 1] = 0; - - /* strip whitespace */ - __buf = strstrip(buf); - - err = __set_inj(__buf); - if (err) { - pr_err("%s: Invalid flags value: %s\n", __func__, __buf); - return err; - } - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations flags_fops = { - .read = flags_read, - .write = flags_write, - .llseek = generic_file_llseek, -}; - -/* - * On which CPU to inject? 
- */ -MCE_INJECT_GET(extcpu); - -static int inj_extcpu_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= nr_cpu_ids || !cpu_online(val)) { - pr_err("%s: Invalid CPU: %llu\n", __func__, val); - return -EINVAL; - } - m->extcpu = val; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); - -static void trigger_mce(void *info) -{ - asm volatile("int $18"); -} - -static void trigger_dfr_int(void *info) -{ - asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); -} - -static void trigger_thr_int(void *info) -{ - asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); -} - -static u32 get_nbc_for_node(int node_id) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 cores_per_node; - - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - - return cores_per_node * node_id; -} - -static void toggle_nb_mca_mst_cpu(u16 nid) -{ - struct pci_dev *F3 = node_to_amd_nb(nid)->misc; - u32 val; - int err; - - if (!F3) - return; - - err = pci_read_config_dword(F3, NBCFG, &val); - if (err) { - pr_err("%s: Error reading F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); - return; - } - - if (val & BIT(27)) - return; - - pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", - __func__); - - val |= BIT(27); - err = pci_write_config_dword(F3, NBCFG, val); - if (err) - pr_err("%s: Error writing F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); -} - -static void prepare_msrs(void *info) -{ - struct mce m = *(struct mce *)info; - u8 b = m.bank; - - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); - } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); - } - - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); - } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); - } -} - -static void do_inject(void) -{ - u64 mcg_status = 0; - unsigned int cpu = i_mce.extcpu; - u8 b = i_mce.bank; - - rdtscll(i_mce.tsc); - - if (i_mce.misc) - i_mce.status |= MCI_STATUS_MISCV; - - if (i_mce.synd) - i_mce.status |= MCI_STATUS_SYNDV; - - if (inj_type == SW_INJ) { - mce_inject_log(&i_mce); - return; - } - - /* prep MCE global settings for the injection */ - mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; - - if (!(i_mce.status & MCI_STATUS_PCC)) - mcg_status |= MCG_STATUS_RIPV; - - /* - * Ensure necessary status bits for deferred errors: - * - MCx_STATUS[Deferred]: make sure it is a deferred error - * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC - */ - if (inj_type == DFR_INT_INJ) { - i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); - } - - /* - * For multi node CPUs, logging and reporting of bank 4 errors happens - * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for - * Fam10h and later BKDGs. 
- */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && - b == 4 && - boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); - } - - get_online_cpus(); - if (!cpu_online(cpu)) - goto err; - - toggle_hw_mce_inject(cpu, true); - - i_mce.mcgstatus = mcg_status; - i_mce.inject_flags = inj_type; - smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); - - toggle_hw_mce_inject(cpu, false); - - switch (inj_type) { - case DFR_INT_INJ: - smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); - break; - case THR_INT_INJ: - smp_call_function_single(cpu, trigger_thr_int, NULL, 0); - break; - default: - smp_call_function_single(cpu, trigger_mce, NULL, 0); - } - -err: - put_online_cpus(); - -} - -/* - * This denotes into which bank we're injecting and triggers - * the injection, at the same time. - */ -static int inj_bank_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); - return -EINVAL; - } - - m->bank = val; - do_inject(); - - return 0; -} - -MCE_INJECT_GET(bank); - -DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); - -static const char readme_msg[] = -"Description of the files and their usages:\n" -"\n" -"Note1: i refers to the bank number below.\n" -"Note2: See respective BKDGs for the exact bit definitions of the files below\n" -"as they mirror the hardware registers.\n" -"\n" -"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" -"\t attributes of the error which caused the MCE.\n" -"\n" -"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" -"\t used for error thresholding purposes and its validity is indicated by\n" -"\t MCi_STATUS[MiscV].\n" -"\n" -"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" -"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" -"\n" -"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" -"\t associated with the error.\n" -"\n" -"cpu:\t The CPU to inject the error on.\n" -"\n" -"bank:\t Specify the bank you want to inject the error into: the number of\n" -"\t banks in a processor varies and is family/model-specific, therefore, the\n" -"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" -"\t injection.\n" -"\n" -"flags:\t Injection type to be performed. Writing to this file will trigger a\n" -"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" -"\t for AMD processors.\n" -"\n" -"\t Allowed error injection types:\n" -"\t - \"sw\": Software error injection. Decode error to a human-readable \n" -"\t format only. Safe to use.\n" -"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" -"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" -"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" -"\t before injecting.\n" -"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" -"\t error APIC interrupt handler to handle the error if the feature is \n" -"\t is present in hardware. \n" -"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" -"\t APIC interrupt handler to handle the error. 
\n" -"\n"; - -static ssize_t -inj_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static const struct file_operations readme_fops = { - .read = inj_readme_read, -}; - -static struct dfs_node { - char *name; - struct dentry *d; - const struct file_operations *fops; - umode_t perm; -} dfs_fls[] = { - { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, -}; - -static int __init init_mce_inject(void) -{ - unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; - - dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; -} - -static void __exit exit_mce_inject(void) -{ - - debugfs_remove_recursive(dfs_inj); - dfs_inj = NULL; - - memset(&dfs_fls, 0, sizeof(dfs_fls)); -} -module_init(init_mce_inject); -module_exit(exit_mce_inject); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>"); -MODULE_AUTHOR("AMD Inc."); -MODULE_DESCRIPTION("MCE injection facility for RAS testing"); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index a163a90af4aa..cd4be19c36dc 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -102,7 +102,7 @@ static void __init setup_real_mode(void) trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; - trampoline_pgd[511] = init_level4_pgt[511].pgd; + trampoline_pgd[511] = init_top_pgt[511].pgd; #endif } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1f386d7fdf70..1d7a7213a310 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } - -#ifdef CONFIG_SMP -/* Another cpu may still have their %cr3 pointing at the pagetable, so - we need to repoint it somewhere else before we can unpin it. */ -static void drop_other_mm_ref(void *info) +static void drop_mm_ref_this_cpu(void *info) { struct mm_struct *mm = info; - struct mm_struct *active_mm; - - active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) leave_mm(smp_processor_id()); - /* If this cpu still has a stale cr3 reference, then make sure - it has been flushed. */ + /* + * If this cpu still has a stale cr3 reference, then make sure + * it has been flushed. 
+ */ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) - load_cr3(swapper_pg_dir); + xen_mc_flush(); } +#ifdef CONFIG_SMP +/* + * Another cpu may still have their %cr3 pointing at the pagetable, so + * we need to repoint it somewhere else before we can unpin it. + */ static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_var_t mask; unsigned cpu; - if (current->active_mm == mm) { - if (current->mm == mm) - load_cr3(swapper_pg_dir); - else - leave_mm(smp_processor_id()); - } + drop_mm_ref_this_cpu(mm); /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { @@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm) if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; - smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } cpumask_copy(mask, mm_cpumask(mm)); - /* It's possible that a vcpu may have a stale reference to our - cr3, because its in lazy mode, and it hasn't yet flushed - its set of pending hypercalls yet. In this case, we can - look at its actual current cr3 value, and force it to flush - if needed. */ + /* + * It's possible that a vcpu may have a stale reference to our + * cr3, because its in lazy mode, and it hasn't yet flushed + * its set of pending hypercalls yet. In this case, we can + * look at its actual current cr3 value, and force it to flush + * if needed. + */ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); } - if (!cpumask_empty(mask)) - smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1); free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) { - if (current->active_mm == mm) - load_cr3(swapper_pg_dir); + drop_mm_ref_this_cpu(mm); } #endif @@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr) } static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long start, - unsigned long end) + const struct flush_tlb_info *info) { struct { struct mmuext_op op; @@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); + trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (info->end != TLB_FLUSH_ALL && + (info->end - info->start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = start; + args->op.arg1.linear_addr = info->start; } MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); @@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3) * At the start of the day - when Xen launches a guest, it has already * built pagetables for the guest. We diligently look over them * in xen_setup_kernel_pagetable and graft as appropriate them in the - * init_level4_pgt and its friends. Then when we are happy we load - * the new init_level4_pgt - and continue on. + * init_top_pgt and its friends. Then when we are happy we load + * the new init_top_pgt - and continue on. 
* * The generic code starts (start_kernel) and 'init_mem_mapping' sets * up the rest of the pagetables. When it has completed it loads the cr3. @@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) pt_end = pt_base + xen_start_info->nr_pt_frames; /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); + init_top_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ /* L4[272] -> level3_ident_pgt */ /* L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(init_top_pgt); /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); @@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Copy the initial P->M table mappings if necessary. */ i = pgd_index(xen_start_info->mfn_list); if (i && i < pgd_index(__START_KERNEL_map)) - init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(init_top_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); @@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); + PFN_DOWN(__pa_symbol(init_top_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * attach it to, so make sure we just set kernel pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); + __xen_write_cr3(true, __pa(init_top_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are @@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) pmd_t pmd; pte_t pte; - pa = read_cr3(); + pa = read_cr3_pa(); pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * sizeof(pgd))); if (!pgd_present(pgd)) @@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void) pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - pgd = __va(read_cr3()); + pgd = __va(read_cr3_pa()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); idx_p4d = 0; save_pud = n_pud; @@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); - BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(read_cr3_pa() != __pa(initial_page_table)); BUG_ON(cr3 != __pa(swapper_pg_dir)); /* diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S index 5e246716d58f..e1a5fbeae08d 100644 --- a/arch/x86/xen/xen-pvh.S +++ b/arch/x86/xen/xen-pvh.S @@ -87,7 +87,7 @@ ENTRY(pvh_start_xen) wrmsr /* Enable pre-constructed page tables. 
*/ - mov $_pa(init_level4_pgt), %eax + mov $_pa(init_top_pgt), %eax mov %eax, %cr3 mov $(X86_CR0_PG | X86_CR0_PE), %eax mov %eax, %cr0 diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index cc23e9ecc6bb..30f6290109d4 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -25,7 +25,6 @@ generic-y += preempt.h generic-y += resource.h generic-y += rwsem.h generic-y += sections.h -generic-y += siginfo.h generic-y += statfs.h generic-y += termios.h generic-y += topology.h diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild index b15bf6bc0e94..4cb0d2f8868c 100644 --- a/arch/xtensa/include/uapi/asm/Kbuild +++ b/arch/xtensa/include/uapi/asm/Kbuild @@ -1,2 +1,3 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += siginfo.h diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 668c1056f9e4..fd524a54d2ab 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -187,7 +187,7 @@ void __init time_init(void) local_timer_setup(0); setup_irq(this_cpu_ptr(&ccount_timer)->evt.irq, &timer_irqaction); sched_clock_register(ccount_sched_clock_read, 32, ccount_freq); - clocksource_probe(); + timer_probe(); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 05dfa3f270ae..ced2b000ca02 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -941,14 +941,14 @@ static bool reorder_tags_to_front(struct list_head *list) return first != NULL; } -static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->task_list); + list_del(&wait->entry); clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); blk_mq_run_hw_queue(hctx, true); return 1; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 17676f4d7fd1..6a9a0f03a67b 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) } static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, - wait_queue_t *wait, unsigned long rw) + wait_queue_entry_t *wait, unsigned long rw) { /* * inc it here even if disabled, since we'll dec it at completion. @@ -520,7 +520,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, * in line to be woken up, wait for our turn. 
*/ if (waitqueue_active(&rqw->wait) && - rqw->wait.task_list.next != &wait->task_list) + rqw->wait.head.next != &wait->entry) return false; return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a9f6fd3fab8e..f58cab82105b 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -99,7 +99,7 @@ struct kyber_hctx_data { struct list_head rqs[KYBER_NUM_DOMAINS]; unsigned int cur_domain; unsigned int batching; - wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; + wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; @@ -385,7 +385,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { INIT_LIST_HEAD(&khd->rqs[i]); - INIT_LIST_HEAD(&khd->domain_wait[i].task_list); + INIT_LIST_HEAD(&khd->domain_wait[i].entry); atomic_set(&khd->wait_index[i], 0); } @@ -503,12 +503,12 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, } } -static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } @@ -519,7 +519,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, { unsigned int sched_domain = khd->cur_domain; struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; - wait_queue_t *wait = &khd->domain_wait[sched_domain]; + wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; struct sbq_wait_state *ws; int nr; @@ -532,7 +532,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * run when one becomes available. Note that this is serialized on * khd->lock, but we still need to be careful about the waker. */ - if (list_empty_careful(&wait->task_list)) { + if (list_empty_careful(&wait->entry)) { init_waitqueue_func_entry(wait, kyber_domain_wake); wait->private = hctx; ws = sbq_wait_ptr(domain_tokens, @@ -730,9 +730,9 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ { \ struct blk_mq_hw_ctx *hctx = data; \ struct kyber_hctx_data *khd = hctx->sched_data; \ - wait_queue_t *wait = &khd->domain_wait[domain]; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ \ - seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \ + seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ return 0; \ } KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 980515e029fa..0968816f5755 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -89,14 +89,14 @@ bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); /* - * All error sources notified with SCI shares one notifier function, - * so they need to be linked and checked one by one. This is applied - * to NMI too. + * All error sources notified with HED (Hardware Error Device) share a + * single notifier callback, so they need to be linked and checked one + * by one. This holds true for NMI too. * * RCU is used for these lists, so ghes_list_mutex is only used for * list changing, not for traversing. 
*/ -static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); /* @@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data) return IRQ_HANDLED; } -static int ghes_notify_sci(struct notifier_block *this, - unsigned long event, void *data) +static int ghes_notify_hed(struct notifier_block *this, unsigned long event, + void *data) { struct ghes *ghes; int ret = NOTIFY_DONE; rcu_read_lock(); - list_for_each_entry_rcu(ghes, &ghes_sci, list) { + list_for_each_entry_rcu(ghes, &ghes_hed, list) { if (!ghes_proc(ghes)) ret = NOTIFY_OK; } @@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } -static struct notifier_block ghes_notifier_sci = { - .notifier_call = ghes_notify_sci, +static struct notifier_block ghes_notifier_hed = { + .notifier_call = ghes_notify_hed, }; #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_POLLED: case ACPI_HEST_NOTIFY_EXTERNAL: case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: break; + case ACPI_HEST_NOTIFY_NMI: if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) { pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n", @@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev) goto err_edac_unreg; } break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); - if (list_empty(&ghes_sci)) - register_acpi_hed_notifier(&ghes_notifier_sci); - list_add_rcu(&ghes->list, &ghes_sci); + if (list_empty(&ghes_hed)) + register_acpi_hed_notifier(&ghes_notifier_hed); + list_add_rcu(&ghes->list, &ghes_hed); mutex_unlock(&ghes_list_mutex); break; + case ACPI_HEST_NOTIFY_NMI: ghes_nmi_add(ghes); break; @@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_EXTERNAL: free_irq(ghes->irq, ghes); break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); - if (list_empty(&ghes_sci)) - unregister_acpi_hed_notifier(&ghes_notifier_sci); + if (list_empty(&ghes_hed)) + unregister_acpi_hed_notifier(&ghes_notifier_hed); mutex_unlock(&ghes_list_mutex); synchronize_rcu(); break; + case ACPI_HEST_NOTIFY_NMI: ghes_nmi_remove(ghes); break; diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 919be0aa2578..240544253ccd 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device, struct acpi_pci_root *root; acpi_handle handle = device->handle; int no_aspm = 0; - bool hotadd = system_state != SYSTEM_BOOTING; + bool hotadd = system_state == SYSTEM_RUNNING; root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL); if (!root) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 8697a82bd465..591d1dd3f04e 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -268,9 +268,9 @@ static int acpi_processor_start(struct device *dev) return -ENODEV; /* Protect against concurrent CPU hotplug operations */ - get_online_cpus(); + cpu_hotplug_disable(); ret = __acpi_processor_start(device); - put_online_cpus(); + cpu_hotplug_enable(); return ret; } diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 3de34633f7f9..7f9aff4b8d62 100644 --- 
a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -909,6 +909,13 @@ static long __acpi_processor_get_throttling(void *data) return pr->throttling.acpi_processor_get_throttling(pr); } +static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) +{ + if (direct || (is_percpu_thread() && cpu == smp_processor_id())) + return fn(arg); + return work_on_cpu(cpu, fn, arg); +} + static int acpi_processor_get_throttling(struct acpi_processor *pr) { if (!pr) @@ -926,7 +933,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) if (!cpu_online(pr->id)) return -ENODEV; - return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr); + return call_on_cpu(pr->id, __acpi_processor_get_throttling, pr, false); } static int acpi_processor_get_fadt_info(struct acpi_processor *pr) @@ -1076,13 +1083,6 @@ static long acpi_processor_throttling_fn(void *data) arg->target_state, arg->force); } -static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) -{ - if (direct) - return fn(arg); - return work_on_cpu(cpu, fn, arg); -} - static int __acpi_processor_set_throttling(struct acpi_processor *pr, int state, bool force, bool direct) { diff --git a/drivers/base/node.c b/drivers/base/node.c index 5548f9686016..0440d95c9b5b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn) if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - if (system_state == SYSTEM_BOOTING) + if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif page = pfn_to_page(pfn); diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c index 8d3d9175d891..b280d466f05b 100644 --- a/drivers/bluetooth/btmrvl_main.c +++ b/drivers/bluetooth/btmrvl_main.c @@ -602,7 +602,7 @@ static int btmrvl_service_main_thread(void *data) struct btmrvl_thread *thread = data; struct btmrvl_private *priv = thread->priv; struct btmrvl_adapter *adapter = priv->adapter; - wait_queue_t wait; + wait_queue_entry_t wait; struct sk_buff *skb; ulong flags; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 8102ee7b3247..ccd239ab879f 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -541,15 +541,6 @@ config HANGCHECK_TIMER out to lunch past a certain margin. It can reboot the system or merely print a warning. -config MMTIMER - tristate "MMTIMER Memory mapped RTC for SGI Altix" - depends on IA64_GENERIC || IA64_SGI_SN2 - depends on POSIX_TIMERS - default y - help - The mmtimer device allows direct userspace access to the - Altix system timer. 
- config UV_MMTIMER tristate "UV_MMTIMER Memory mapped RTC for SGI UV" depends on X86_UV diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 6e6c244a66a0..53e33720818c 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MSPEC) += mspec.o -obj-$(CONFIG_MMTIMER) += mmtimer.o obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o obj-$(CONFIG_IBM_BSR) += bsr.o obj-$(CONFIG_SGI_MBCS) += mbcs.o diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index d165af8abe36..a5c6cfe71a8e 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -821,7 +821,7 @@ static ssize_t ipmi_read(struct file *file, loff_t *ppos) { int rv = 0; - wait_queue_t wait; + wait_queue_entry_t wait; if (count <= 0) return 0; diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c deleted file mode 100644 index 0e7fcb04f01e..000000000000 --- a/drivers/char/mmtimer.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Timer device implementation for SGI SN platforms. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 2001-2006 Silicon Graphics, Inc. All rights reserved. - * - * This driver exports an API that should be supportable by any HPET or IA-PC - * multimedia timer. The code below is currently specific to the SGI Altix - * SHub RTC, however. - * - * 11/01/01 - jbarnes - initial revision - * 9/10/04 - Christoph Lameter - remove interrupt support for kernel inclusion - * 10/1/04 - Christoph Lameter - provide posix clock CLOCK_SGI_CYCLE - * 10/13/04 - Christoph Lameter, Dimitri Sivanich - provide timer interrupt - * support via the posix timer interface - */ - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/ioctl.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/mmtimer.h> -#include <linux/miscdevice.h> -#include <linux/posix-timers.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/math64.h> -#include <linux/mutex.h> -#include <linux/slab.h> - -#include <linux/uaccess.h> -#include <asm/sn/addrs.h> -#include <asm/sn/intr.h> -#include <asm/sn/shub_mmr.h> -#include <asm/sn/nodepda.h> -#include <asm/sn/shubio.h> - -MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>"); -MODULE_DESCRIPTION("SGI Altix RTC Timer"); -MODULE_LICENSE("GPL"); - -/* name of the device, usually in /dev */ -#define MMTIMER_NAME "mmtimer" -#define MMTIMER_DESC "SGI Altix RTC Timer" -#define MMTIMER_VERSION "2.1" - -#define RTC_BITS 55 /* 55 bits for this implementation */ - -static struct k_clock sgi_clock; - -extern unsigned long sn_rtc_cycles_per_second; - -#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) - -#define rtc_time() (*RTC_COUNTER_ADDR) - -static DEFINE_MUTEX(mmtimer_mutex); -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma); - -/* - * Period in femtoseconds (10^-15 s) - */ -static unsigned long mmtimer_femtoperiod = 0; - -static const struct file_operations mmtimer_fops = { - .owner = THIS_MODULE, - .mmap = mmtimer_mmap, - .unlocked_ioctl = mmtimer_ioctl, - .llseek = noop_llseek, -}; - -/* - * We only have comparison registers 
RTC1-4 currently available per - * node. RTC0 is used by SAL. - */ -/* Check for an RTC interrupt pending */ -static int mmtimer_int_pending(int comparator) -{ - if (HUB_L((unsigned long *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)) & - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator) - return 1; - else - return 0; -} - -/* Clear the RTC interrupt pending bit */ -static void mmtimer_clr_int_pending(int comparator) -{ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS), - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator); -} - -/* Setup timer on comparator RTC1 */ -static void mmtimer_setup_int_0(int cpu, u64 expires) -{ - u64 val; - - /* Disable interrupt */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 0UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), -1L); - - /* Clear pending bit */ - mmtimer_clr_int_pending(0); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC1_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC1_INT_CONFIG_PID_SHFT); - - /* Set configuration */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_CONFIG), val); - - /* Enable RTC interrupts */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 1UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), expires); - - -} - -/* Setup timer on comparator RTC2 */ -static void mmtimer_setup_int_1(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), -1L); - - mmtimer_clr_int_pending(1); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC2_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC2_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), expires); -} - -/* Setup timer on comparator RTC3 */ -static void mmtimer_setup_int_2(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), -1L); - - mmtimer_clr_int_pending(2); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC3_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC3_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), expires); -} - -/* - * This function must be called with interrupts disabled and preemption off - * in order to insure that the setup succeeds in a deterministic time frame. - * It will check if the interrupt setup succeeded. - */ -static int mmtimer_setup(int cpu, int comparator, unsigned long expires, - u64 *set_completion_time) -{ - switch (comparator) { - case 0: - mmtimer_setup_int_0(cpu, expires); - break; - case 1: - mmtimer_setup_int_1(cpu, expires); - break; - case 2: - mmtimer_setup_int_2(cpu, expires); - break; - } - /* We might've missed our expiration time */ - *set_completion_time = rtc_time(); - if (*set_completion_time <= expires) - return 1; - - /* - * If an interrupt is already pending then its okay - * if not then we failed - */ - return mmtimer_int_pending(comparator); -} - -static int mmtimer_disable_int(long nasid, int comparator) -{ - switch (comparator) { - case 0: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC1_INT_ENABLE, 0UL); - break; - case 1: - nasid == -1 ? 
HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC2_INT_ENABLE, 0UL); - break; - case 2: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC3_INT_ENABLE, 0UL); - break; - default: - return -EFAULT; - } - return 0; -} - -#define COMPARATOR 1 /* The comparator to use */ - -#define TIMER_OFF 0xbadcabLL /* Timer is not setup */ -#define TIMER_SET 0 /* Comparator is set for this timer */ - -#define MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT 40 - -/* There is one of these for each timer */ -struct mmtimer { - struct rb_node list; - struct k_itimer *timer; - int cpu; -}; - -struct mmtimer_node { - spinlock_t lock ____cacheline_aligned; - struct rb_root timer_head; - struct rb_node *next; - struct tasklet_struct tasklet; -}; -static struct mmtimer_node *timers; - -static unsigned mmtimer_interval_retry_increment = - MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT; -module_param(mmtimer_interval_retry_increment, uint, 0644); -MODULE_PARM_DESC(mmtimer_interval_retry_increment, - "RTC ticks to add to expiration on interval retry (default 40)"); - -/* - * Add a new mmtimer struct to the node's mmtimer list. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_add_list(struct mmtimer *n) -{ - int nodeid = n->timer->it.mmtimer.node; - unsigned long expires = n->timer->it.mmtimer.expires; - struct rb_node **link = &timers[nodeid].timer_head.rb_node; - struct rb_node *parent = NULL; - struct mmtimer *x; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - x = rb_entry(parent, struct mmtimer, list); - - if (expires < x->timer->it.mmtimer.expires) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &timers[nodeid].timer_head); - - if (!timers[nodeid].next || expires < rb_entry(timers[nodeid].next, - struct mmtimer, list)->timer->it.mmtimer.expires) - timers[nodeid].next = &n->list; -} - -/* - * Set the comparator for the next timer. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_set_next_timer(int nodeid) -{ - struct mmtimer_node *n = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - u64 expires, exp, set_completion_time; - int i; - -restart: - if (n->next == NULL) - return; - - x = rb_entry(n->next, struct mmtimer, list); - t = x->timer; - if (!t->it.mmtimer.incr) { - /* Not an interval timer */ - if (!mmtimer_setup(x->cpu, COMPARATOR, - t->it.mmtimer.expires, - &set_completion_time)) { - /* Late setup, fire now */ - tasklet_schedule(&n->tasklet); - } - return; - } - - /* Interval timer */ - i = 0; - expires = exp = t->it.mmtimer.expires; - while (!mmtimer_setup(x->cpu, COMPARATOR, expires, - &set_completion_time)) { - int to; - - i++; - expires = set_completion_time + - mmtimer_interval_retry_increment + (1 << i); - /* Calculate overruns as we go. 
*/ - to = ((u64)(expires - exp) / t->it.mmtimer.incr); - if (to) { - t->it_overrun += to; - t->it.mmtimer.expires += t->it.mmtimer.incr * to; - exp = t->it.mmtimer.expires; - } - if (i > 20) { - printk(KERN_ALERT "mmtimer: cannot reschedule timer\n"); - t->it.mmtimer.clock = TIMER_OFF; - n->next = rb_next(&x->list); - rb_erase(&x->list, &n->timer_head); - kfree(x); - goto restart; - } - } -} - -/** - * mmtimer_ioctl - ioctl interface for /dev/mmtimer - * @file: file structure for the device - * @cmd: command to execute - * @arg: optional argument to command - * - * Executes the command specified by @cmd. Returns 0 for success, < 0 for - * failure. - * - * Valid commands: - * - * %MMTIMER_GETOFFSET - Should return the offset (relative to the start - * of the page where the registers are mapped) for the counter in question. - * - * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15) - * seconds - * - * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address - * specified by @arg - * - * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter - * - * %MMTIMER_MMAPAVAIL - Returns 1 if the registers can be mmap'd into userspace - * - * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it - * in the address specified by @arg. - */ -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - mutex_lock(&mmtimer_mutex); - - switch (cmd) { - case MMTIMER_GETOFFSET: /* offset of the counter */ - /* - * SN RTC registers are on their own 64k page - */ - if(PAGE_SIZE <= (1 << 16)) - ret = (((long)RTC_COUNTER_ADDR) & (PAGE_SIZE-1)) / 8; - else - ret = -ENOSYS; - break; - - case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */ - if(copy_to_user((unsigned long __user *)arg, - &mmtimer_femtoperiod, sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETFREQ: /* frequency in Hz */ - if(copy_to_user((unsigned long __user *)arg, - &sn_rtc_cycles_per_second, - sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETBITS: /* number of bits in the clock */ - ret = RTC_BITS; - break; - - case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */ - ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0; - break; - - case MMTIMER_GETCOUNTER: - if(copy_to_user((unsigned long __user *)arg, - RTC_COUNTER_ADDR, sizeof(unsigned long))) - ret = -EFAULT; - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&mmtimer_mutex); - return ret; -} - -/** - * mmtimer_mmap - maps the clock's registers into userspace - * @file: file structure for the device - * @vma: VMA to map the registers into - * - * Calls remap_pfn_range() to map the clock's registers into - * the calling process' address space. 
- */ -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma) -{ - unsigned long mmtimer_addr; - - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - if (vma->vm_flags & VM_WRITE) - return -EPERM; - - if (PAGE_SIZE > (1 << 16)) - return -ENOSYS; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - mmtimer_addr = __pa(RTC_COUNTER_ADDR); - mmtimer_addr &= ~(PAGE_SIZE - 1); - mmtimer_addr &= 0xfffffffffffffffUL; - - if (remap_pfn_range(vma, vma->vm_start, mmtimer_addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in mmtimer.c\n"); - return -EAGAIN; - } - - return 0; -} - -static struct miscdevice mmtimer_miscdev = { - .minor = SGI_MMTIMER, - .name = MMTIMER_NAME, - .fops = &mmtimer_fops -}; - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -/* - * Posix Timer Interface - */ - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp) -{ - u64 nsec; - - nsec = rtc_time() * sgi_clock_period - + sgi_clock_offset.tv_nsec; - *tp = ns_to_timespec64(nsec); - tp->tv_sec += sgi_clock_offset.tv_sec; - return 0; -}; - -static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp) -{ - - u64 nsec; - u32 rem; - - nsec = rtc_time() * sgi_clock_period; - - sgi_clock_offset.tv_sec = tp->tv_sec - div_u64_rem(nsec, NSEC_PER_SEC, &rem); - - if (rem <= tp->tv_nsec) - sgi_clock_offset.tv_nsec = tp->tv_sec - rem; - else { - sgi_clock_offset.tv_nsec = tp->tv_sec + NSEC_PER_SEC - rem; - sgi_clock_offset.tv_sec--; - } - return 0; -} - -/** - * mmtimer_interrupt - timer interrupt handler - * @irq: irq received - * @dev_id: device the irq came from - * - * Called when one of the comarators matches the counter, This - * routine will send signals to processes that have requested - * them. - * - * This interrupt is run in an interrupt context - * by the SHUB. It is therefore safe to locally access SHub - * registers. 
- */ -static irqreturn_t -mmtimer_interrupt(int irq, void *dev_id) -{ - unsigned long expires = 0; - int result = IRQ_NONE; - unsigned indx = cpu_to_node(smp_processor_id()); - struct mmtimer *base; - - spin_lock(&timers[indx].lock); - base = rb_entry(timers[indx].next, struct mmtimer, list); - if (base == NULL) { - spin_unlock(&timers[indx].lock); - return result; - } - - if (base->cpu == smp_processor_id()) { - if (base->timer) - expires = base->timer->it.mmtimer.expires; - /* expires test won't work with shared irqs */ - if ((mmtimer_int_pending(COMPARATOR) > 0) || - (expires && (expires <= rtc_time()))) { - mmtimer_clr_int_pending(COMPARATOR); - tasklet_schedule(&timers[indx].tasklet); - result = IRQ_HANDLED; - } - } - spin_unlock(&timers[indx].lock); - return result; -} - -static void mmtimer_tasklet(unsigned long data) -{ - int nodeid = data; - struct mmtimer_node *mn = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - unsigned long flags; - - /* Send signal and deal with periodic signals */ - spin_lock_irqsave(&mn->lock, flags); - if (!mn->next) - goto out; - - x = rb_entry(mn->next, struct mmtimer, list); - t = x->timer; - - if (t->it.mmtimer.clock == TIMER_OFF) - goto out; - - t->it_overrun = 0; - - mn->next = rb_next(&x->list); - rb_erase(&x->list, &mn->timer_head); - - if (posix_timer_event(t, 0) != 0) - t->it_overrun++; - - if(t->it.mmtimer.incr) { - t->it.mmtimer.expires += t->it.mmtimer.incr; - mmtimer_add_list(x); - } else { - /* Ensure we don't false trigger in mmtimer_interrupt */ - t->it.mmtimer.clock = TIMER_OFF; - t->it.mmtimer.expires = 0; - kfree(x); - } - /* Set comparator for next timer, if there is one */ - mmtimer_set_next_timer(nodeid); - - t->it_overrun_last = t->it_overrun; -out: - spin_unlock_irqrestore(&mn->lock, flags); -} - -static int sgi_timer_create(struct k_itimer *timer) -{ - /* Insure that a newly created timer is off */ - timer->it.mmtimer.clock = TIMER_OFF; - return 0; -} - -/* This does not really delete a timer. 
It just insures - * that the timer is not active - * - * Assumption: it_lock is already held with irq's disabled - */ -static int sgi_timer_del(struct k_itimer *timr) -{ - cnodeid_t nodeid = timr->it.mmtimer.node; - unsigned long irqflags; - - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - if (timr->it.mmtimer.clock != TIMER_OFF) { - unsigned long expires = timr->it.mmtimer.expires; - struct rb_node *n = timers[nodeid].timer_head.rb_node; - struct mmtimer *uninitialized_var(t); - int r = 0; - - timr->it.mmtimer.clock = TIMER_OFF; - timr->it.mmtimer.expires = 0; - - while (n) { - t = rb_entry(n, struct mmtimer, list); - if (t->timer == timr) - break; - - if (expires < t->timer->it.mmtimer.expires) - n = n->rb_left; - else - n = n->rb_right; - } - - if (!n) { - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; - } - - if (timers[nodeid].next == n) { - timers[nodeid].next = rb_next(n); - r = 1; - } - - rb_erase(n, &timers[nodeid].timer_head); - kfree(t); - - if (r) { - mmtimer_disable_int(cnodeid_to_nasid(nodeid), - COMPARATOR); - mmtimer_set_next_timer(nodeid); - } - } - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; -} - -/* Assumption: it_lock is already held with irq's disabled */ -static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) -{ - - if (timr->it.mmtimer.clock == TIMER_OFF) { - cur_setting->it_interval.tv_nsec = 0; - cur_setting->it_interval.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - cur_setting->it_value.tv_sec =0; - return; - } - - cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period); - cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period); -} - - -static int sgi_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - unsigned long when, period, irqflags; - int err = 0; - cnodeid_t nodeid; - struct mmtimer *base; - struct rb_node *n; - - if (old_setting) - sgi_timer_get(timr, old_setting); - - sgi_timer_del(timr); - when = timespec64_to_ns(&new_setting->it_value); - period = timespec64_to_ns(&new_setting->it_interval); - - if (when == 0) - /* Clear timer */ - return 0; - - base = kmalloc(sizeof(struct mmtimer), GFP_KERNEL); - if (base == NULL) - return -ENOMEM; - - if (flags & TIMER_ABSTIME) { - struct timespec64 n; - unsigned long now; - - getnstimeofday64(&n); - now = timespec64_to_ns(&n); - if (when > now) - when -= now; - else - /* Fire the timer immediately */ - when = 0; - } - - /* - * Convert to sgi clock period. Need to keep rtc_time() as near as possible - * to getnstimeofday() in order to be as faithful as possible to the time - * specified. - */ - when = (when + sgi_clock_period - 1) / sgi_clock_period + rtc_time(); - period = (period + sgi_clock_period - 1) / sgi_clock_period; - - /* - * We are allocating a local SHub comparator. If we would be moved to another - * cpu then another SHub may be local to us. Prohibit that by switching off - * preemption. 
- */ - preempt_disable(); - - nodeid = cpu_to_node(smp_processor_id()); - - /* Lock the node timer structure */ - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - - base->timer = timr; - base->cpu = smp_processor_id(); - - timr->it.mmtimer.clock = TIMER_SET; - timr->it.mmtimer.node = nodeid; - timr->it.mmtimer.incr = period; - timr->it.mmtimer.expires = when; - - n = timers[nodeid].next; - - /* Add the new struct mmtimer to node's timer list */ - mmtimer_add_list(base); - - if (timers[nodeid].next == n) { - /* No need to reprogram comparator for now */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - preempt_enable(); - return err; - } - - /* We need to reprogram the comparator */ - if (n) - mmtimer_disable_int(cnodeid_to_nasid(nodeid), COMPARATOR); - - mmtimer_set_next_timer(nodeid); - - /* Unlock the node timer structure */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - - preempt_enable(); - - return err; -} - -static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = sgi_clock_period; - return 0; -} - -static struct k_clock sgi_clock = { - .clock_set = sgi_clock_set, - .clock_get = sgi_clock_get, - .clock_getres = sgi_clock_getres, - .timer_create = sgi_timer_create, - .timer_set = sgi_timer_set, - .timer_del = sgi_timer_del, - .timer_get = sgi_timer_get -}; - -/** - * mmtimer_init - device initialization routine - * - * Does initial setup for the mmtimer device. - */ -static int __init mmtimer_init(void) -{ - cnodeid_t node, maxn = -1; - - if (!ia64_platform_is("sn2")) - return 0; - - /* - * Sanity check the cycles/sec variable - */ - if (sn_rtc_cycles_per_second < 100000) { - printk(KERN_ERR "%s: unable to determine clock frequency\n", - MMTIMER_NAME); - goto out1; - } - - mmtimer_femtoperiod = ((unsigned long)1E15 + sn_rtc_cycles_per_second / - 2) / sn_rtc_cycles_per_second; - - if (request_irq(SGI_MMTIMER_VECTOR, mmtimer_interrupt, IRQF_PERCPU, MMTIMER_NAME, NULL)) { - printk(KERN_WARNING "%s: unable to allocate interrupt.", - MMTIMER_NAME); - goto out1; - } - - if (misc_register(&mmtimer_miscdev)) { - printk(KERN_ERR "%s: failed to register device\n", - MMTIMER_NAME); - goto out2; - } - - /* Get max numbered node, calculate slots needed */ - for_each_online_node(node) { - maxn = node; - } - maxn++; - - /* Allocate list of node ptrs to mmtimer_t's */ - timers = kzalloc(sizeof(struct mmtimer_node)*maxn, GFP_KERNEL); - if (!timers) { - printk(KERN_ERR "%s: failed to allocate memory for device\n", - MMTIMER_NAME); - goto out3; - } - - /* Initialize struct mmtimer's for each online node */ - for_each_online_node(node) { - spin_lock_init(&timers[node].lock); - tasklet_init(&timers[node].tasklet, mmtimer_tasklet, - (unsigned long) node); - } - - sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; - posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock); - - printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, - sn_rtc_cycles_per_second/(unsigned long)1E6); - - return 0; - -out3: - misc_deregister(&mmtimer_miscdev); -out2: - free_irq(SGI_MMTIMER_VECTOR, NULL); -out1: - return -1; -} - -module_init(mmtimer_init); diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index a1e4fc622569..abcb655f0209 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -1,22 +1,16 @@ menu "Clock Source drivers" depends on !ARCH_USES_GETTIMEOFFSET -config CLKSRC_OF +config TIMER_OF bool - select CLKSRC_PROBE - -config CLKEVT_OF - bool - select CLKEVT_PROBE 
- -config CLKSRC_ACPI - bool - select CLKSRC_PROBE + depends on GENERIC_CLOCKEVENTS + select TIMER_PROBE -config CLKSRC_PROBE +config TIMER_ACPI bool + select TIMER_PROBE -config CLKEVT_PROBE +config TIMER_PROBE bool config CLKSRC_I8253 @@ -65,14 +59,14 @@ config DW_APB_TIMER config DW_APB_TIMER_OF bool select DW_APB_TIMER - select CLKSRC_OF + select TIMER_OF config FTTMR010_TIMER bool "Faraday Technology timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS depends on HAS_IOMEM select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF select MFD_SYSCON help Enables support for the Faraday Technology timer block @@ -81,7 +75,7 @@ config FTTMR010_TIMER config ROCKCHIP_TIMER bool "Rockchip timer driver" if COMPILE_TEST depends on ARM || ARM64 - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help Enables the support for the rockchip timer driver. @@ -89,7 +83,7 @@ config ROCKCHIP_TIMER config ARMADA_370_XP_TIMER bool "Armada 370 and XP timer driver" if COMPILE_TEST depends on ARM - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help Enables the support for the Armada 370 and XP timer driver. @@ -104,7 +98,7 @@ config MESON6_TIMER config ORION_TIMER bool "Orion timer driver" if COMPILE_TEST depends on ARM - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help Enables the support for the Orion timer driver @@ -155,7 +149,7 @@ config ASM9260_TIMER bool "ASM9260 timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF help Enables support for the ASM9260 timer. @@ -195,13 +189,6 @@ config ATLAS7_TIMER help Enables support for the Atlas7 timer. -config MOXART_TIMER - bool "Moxart timer driver" if COMPILE_TEST - depends on GENERIC_CLOCKEVENTS - select CLKSRC_MMIO - help - Enables support for the Moxart timer. - config MXS_TIMER bool "Mxs timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS @@ -268,21 +255,21 @@ config CLKSRC_LPC32XX depends on GENERIC_CLOCKEVENTS && HAS_IOMEM depends on ARM select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF help Support for the LPC32XX clocksource. config CLKSRC_PISTACHIO bool "Clocksource for Pistachio SoC" if COMPILE_TEST depends on HAS_IOMEM - select CLKSRC_OF + select TIMER_OF help Enables the clocksource for the Pistachio SoC. config CLKSRC_TI_32K bool "Texas Instruments 32.768 Hz Clocksource" if COMPILE_TEST depends on GENERIC_SCHED_CLOCK - select CLKSRC_OF if OF + select TIMER_OF if OF help This option enables support for Texas Instruments 32.768 Hz clocksource available on many OMAP-like platforms. @@ -291,7 +278,7 @@ config CLKSRC_NPS bool "NPS400 clocksource driver" if COMPILE_TEST depends on !PHYS_ADDR_T_64BIT select CLKSRC_MMIO - select CLKSRC_OF if OF + select TIMER_OF if OF help NPS400 clocksource support. Got 64 bit counter with update rate up to 1000MHz. @@ -306,12 +293,12 @@ config CLKSRC_MPS2 bool "Clocksource for MPS2 SoCs" if COMPILE_TEST depends on GENERIC_SCHED_CLOCK select CLKSRC_MMIO - select CLKSRC_OF + select TIMER_OF config ARC_TIMERS bool "Support for 32-bit TIMERn counters in ARC Cores" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS - select CLKSRC_OF + select TIMER_OF help These are legacy 32-bit TIMER0 and TIMER1 counters found on all ARC cores (ARC700 as well as ARC HS38). 
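The CLKSRC_OF -> TIMER_OF Kconfig rename above pairs with the driver-side macro rename applied further down in this patch (CLOCKSOURCE_OF_DECLARE -> TIMER_OF_DECLARE in arc_timer.c, arm_arch_timer.c and friends). A minimal sketch of what a converted driver registration looks like under the new names; the foo identifiers and the "vendor,foo-timer" compatible string are placeholders, not taken from this patch:

	#include <linux/clocksource.h>	/* TIMER_OF_DECLARE() */
	#include <linux/init.h>
	#include <linux/of.h>

	static int __init foo_timer_init(struct device_node *np)
	{
		/*
		 * Map the timer registers, request the interrupt and
		 * register the clocksource/clockevent here. Return 0 on
		 * success, or a negative errno so timer_probe() can report
		 * the failure.
		 */
		return 0;
	}

	/* Formerly CLOCKSOURCE_OF_DECLARE(); selecting TIMER_OF builds the table this entry lands in. */
	TIMER_OF_DECLARE(foo_timer, "vendor,foo-timer", foo_timer_init);
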
@@ -321,7 +308,7 @@ config ARC_TIMERS_64BIT bool "Support for 64-bit counters in ARC HS38 cores" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS depends on ARC_TIMERS - select CLKSRC_OF + select TIMER_OF help This enables 2 different 64-bit timers: RTC (for UP) and GFRC (for SMP) RTC is implemented inside the core, while GFRC sits outside the core in @@ -330,8 +317,8 @@ config ARC_TIMERS_64BIT config ARM_ARCH_TIMER bool - select CLKSRC_OF if OF - select CLKSRC_ACPI if ACPI + select TIMER_OF if OF + select TIMER_ACPI if ACPI config ARM_ARCH_TIMER_EVTSTREAM bool "Enable ARM architected timer event stream generation by default" @@ -388,7 +375,7 @@ config ARM64_ERRATUM_858921 config ARM_GLOBAL_TIMER bool "Support for the ARM global timer" if COMPILE_TEST - select CLKSRC_OF if OF + select TIMER_OF if OF depends on ARM help This options enables support for the ARM global timer unit @@ -397,7 +384,7 @@ config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" depends on GENERIC_SCHED_CLOCK && CLKDEV_LOOKUP select CLKSRC_MMIO - select CLKSRC_OF if OF + select TIMER_OF if OF config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK bool @@ -408,19 +395,19 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK config ARMV7M_SYSTICK bool "Support for the ARMv7M system time" if COMPILE_TEST - select CLKSRC_OF if OF + select TIMER_OF if OF select CLKSRC_MMIO help This options enables support for the ARMv7M system timer unit config ATMEL_PIT - select CLKSRC_OF if OF + select TIMER_OF if OF def_bool SOC_AT91SAM9 || SOC_SAMA5 config ATMEL_ST bool "Atmel ST timer support" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS - select CLKSRC_OF + select TIMER_OF select MFD_SYSCON help Support for the Atmel ST timer. @@ -463,7 +450,7 @@ config VF_PIT_TIMER config OXNAS_RPS_TIMER bool "Oxford Semiconductor OXNAS RPS Timers driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help This enables support for the Oxford Semiconductor OXNAS RPS timers. @@ -474,7 +461,7 @@ config SYS_SUPPORTS_SH_CMT config MTK_TIMER bool "Mediatek timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS && HAS_IOMEM - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help Support for Mediatek timer driver. @@ -547,7 +534,7 @@ config EM_TIMER_STI config CLKSRC_QCOM bool "Qualcomm MSM timer" if COMPILE_TEST depends on ARM - select CLKSRC_OF + select TIMER_OF help This enables the clocksource and the per CPU clockevent driver for the Qualcomm SoCs. 
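TIMER_PROBE is the renamed CLKSRC_PROBE/CLKEVT_PROBE: it builds timer-probe.o (see the Makefile hunk below), whose timer_probe() walks the table populated by TIMER_OF_DECLARE()/TIMER_ACPI_DECLARE() entries. The xtensa time_init() hunk earlier in this patch shows the caller side of that rename; a hedged sketch of the same pattern, with bar_platform_time_init() as a made-up platform hook:

	#include <linux/clocksource.h>	/* timer_probe() */
	#include <linux/init.h>

	void __init bar_platform_time_init(void)
	{
		/*
		 * Formerly clocksource_probe(): match device-tree nodes (and,
		 * via TIMER_ACPI, ACPI tables such as the GTDT) against the
		 * declared timer init hooks and call each one that is found.
		 */
		timer_probe();
	}
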
@@ -555,7 +542,7 @@ config CLKSRC_QCOM config CLKSRC_VERSATILE bool "ARM Versatile (Express) reference platforms clock source" if COMPILE_TEST depends on GENERIC_SCHED_CLOCK && !ARCH_USES_GETTIMEOFFSET - select CLKSRC_OF + select TIMER_OF default y if MFD_VEXPRESS_SYSREG help This option enables clock source based on free running @@ -566,12 +553,12 @@ config CLKSRC_VERSATILE config CLKSRC_MIPS_GIC bool depends on MIPS_GIC - select CLKSRC_OF + select TIMER_OF config CLKSRC_TANGO_XTAL bool "Clocksource for Tango SoC" if COMPILE_TEST depends on ARM - select CLKSRC_OF + select TIMER_OF select CLKSRC_MMIO help This enables the clocksource for Tango SoC @@ -612,7 +599,7 @@ config CLKSRC_IMX_GPT config CLKSRC_ST_LPC bool "Low power clocksource found in the LPC" if COMPILE_TEST - select CLKSRC_OF if OF + select TIMER_OF if OF depends on HAS_IOMEM select CLKSRC_MMIO help diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 9cd12486483c..6df949402dfc 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -1,5 +1,5 @@ -obj-$(CONFIG_CLKSRC_PROBE) += clksrc-probe.o -obj-$(CONFIG_CLKEVT_PROBE) += clkevt-probe.o +obj-$(CONFIG_TIMER_OF) += timer-of.o +obj-$(CONFIG_TIMER_PROBE) += timer-probe.o obj-$(CONFIG_ATMEL_PIT) += timer-atmel-pit.o obj-$(CONFIG_ATMEL_ST) += timer-atmel-st.o obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o @@ -26,7 +26,6 @@ obj-$(CONFIG_ORION_TIMER) += time-orion.o obj-$(CONFIG_BCM2835_TIMER) += bcm2835_timer.o obj-$(CONFIG_CLPS711X_TIMER) += clps711x-timer.o obj-$(CONFIG_ATLAS7_TIMER) += timer-atlas7.o -obj-$(CONFIG_MOXART_TIMER) += moxart_timer.o obj-$(CONFIG_MXS_TIMER) += mxs_timer.o obj-$(CONFIG_CLKSRC_PXA) += pxa_timer.o obj-$(CONFIG_PRIMA2_TIMER) += timer-prima2.o diff --git a/drivers/clocksource/arc_timer.c b/drivers/clocksource/arc_timer.c index 21649733827d..4927355f9cbe 100644 --- a/drivers/clocksource/arc_timer.c +++ b/drivers/clocksource/arc_timer.c @@ -99,7 +99,7 @@ static int __init arc_cs_setup_gfrc(struct device_node *node) return clocksource_register_hz(&arc_counter_gfrc, arc_timer_freq); } -CLOCKSOURCE_OF_DECLARE(arc_gfrc, "snps,archs-timer-gfrc", arc_cs_setup_gfrc); +TIMER_OF_DECLARE(arc_gfrc, "snps,archs-timer-gfrc", arc_cs_setup_gfrc); #define AUX_RTC_CTRL 0x103 #define AUX_RTC_LOW 0x104 @@ -158,7 +158,7 @@ static int __init arc_cs_setup_rtc(struct device_node *node) return clocksource_register_hz(&arc_counter_rtc, arc_timer_freq); } -CLOCKSOURCE_OF_DECLARE(arc_rtc, "snps,archs-timer-rtc", arc_cs_setup_rtc); +TIMER_OF_DECLARE(arc_rtc, "snps,archs-timer-rtc", arc_cs_setup_rtc); #endif @@ -333,4 +333,4 @@ static int __init arc_of_timer_init(struct device_node *np) return ret; } -CLOCKSOURCE_OF_DECLARE(arc_clkevt, "snps,arc-timer", arc_of_timer_init); +TIMER_OF_DECLARE(arc_clkevt, "snps,arc-timer", arc_of_timer_init); diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 8b5c30062d99..aae87c4c546e 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1194,8 +1194,8 @@ static int __init arch_timer_of_init(struct device_node *np) return arch_timer_common_init(); } -CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); -CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); +TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); +TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); static u32 __init arch_timer_mem_frame_get_cntfrq(struct 
arch_timer_mem_frame *frame) @@ -1382,7 +1382,7 @@ out: kfree(timer_mem); return ret; } -CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", +TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_of_init); #ifdef CONFIG_ACPI_GTDT @@ -1516,5 +1516,5 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) return arch_timer_common_init(); } -CLOCKSOURCE_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); +TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); #endif diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 123ed20ac2ff..095bb965f621 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -339,5 +339,5 @@ out_unmap: } /* Only tested on r2p2 and r3p0 */ -CLOCKSOURCE_OF_DECLARE(arm_gt, "arm,cortex-a9-global-timer", +TIMER_OF_DECLARE(arm_gt, "arm,cortex-a9-global-timer", global_timer_of_register); diff --git a/drivers/clocksource/armv7m_systick.c b/drivers/clocksource/armv7m_systick.c index a315491b7047..ac046d6fb0bf 100644 --- a/drivers/clocksource/armv7m_systick.c +++ b/drivers/clocksource/armv7m_systick.c @@ -82,5 +82,5 @@ out_unmap: return ret; } -CLOCKSOURCE_OF_DECLARE(arm_systick, "arm,armv7m-systick", +TIMER_OF_DECLARE(arm_systick, "arm,armv7m-systick", system_timer_of_register); diff --git a/drivers/clocksource/asm9260_timer.c b/drivers/clocksource/asm9260_timer.c index c6780830b8ac..38cd2feb87c4 100644 --- a/drivers/clocksource/asm9260_timer.c +++ b/drivers/clocksource/asm9260_timer.c @@ -238,5 +238,5 @@ static int __init asm9260_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(asm9260_timer, "alphascale,asm9260-timer", +TIMER_OF_DECLARE(asm9260_timer, "alphascale,asm9260-timer", asm9260_timer_init); diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c index dce44307469e..82828d3a4739 100644 --- a/drivers/clocksource/bcm2835_timer.c +++ b/drivers/clocksource/bcm2835_timer.c @@ -148,5 +148,5 @@ err_iounmap: iounmap(base); return ret; } -CLOCKSOURCE_OF_DECLARE(bcm2835, "brcm,bcm2835-system-timer", +TIMER_OF_DECLARE(bcm2835, "brcm,bcm2835-system-timer", bcm2835_timer_init); diff --git a/drivers/clocksource/bcm_kona_timer.c b/drivers/clocksource/bcm_kona_timer.c index fda5e1476638..5c40be9880f5 100644 --- a/drivers/clocksource/bcm_kona_timer.c +++ b/drivers/clocksource/bcm_kona_timer.c @@ -198,9 +198,9 @@ static int __init kona_timer_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(brcm_kona, "brcm,kona-timer", kona_timer_init); +TIMER_OF_DECLARE(brcm_kona, "brcm,kona-timer", kona_timer_init); /* * bcm,kona-timer is deprecated by brcm,kona-timer * being kept here for driver compatibility */ -CLOCKSOURCE_OF_DECLARE(bcm_kona, "bcm,kona-timer", kona_timer_init); +TIMER_OF_DECLARE(bcm_kona, "bcm,kona-timer", kona_timer_init); diff --git a/drivers/clocksource/cadence_ttc_timer.c b/drivers/clocksource/cadence_ttc_timer.c index 8e64b8460f11..29d51755e18b 100644 --- a/drivers/clocksource/cadence_ttc_timer.c +++ b/drivers/clocksource/cadence_ttc_timer.c @@ -540,4 +540,4 @@ static int __init ttc_timer_init(struct device_node *timer) return 0; } -CLOCKSOURCE_OF_DECLARE(ttc, "cdns,ttc", ttc_timer_init); +TIMER_OF_DECLARE(ttc, "cdns,ttc", ttc_timer_init); diff --git a/drivers/clocksource/clkevt-probe.c b/drivers/clocksource/clkevt-probe.c deleted file mode 100644 index eb89b502acbd..000000000000 --- a/drivers/clocksource/clkevt-probe.c +++ /dev/null 
@@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, Linaro Ltd. All rights reserved. - * Daniel Lezcano <daniel.lezcano@linaro.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/init.h> -#include <linux/of.h> -#include <linux/clockchips.h> - -extern struct of_device_id __clkevt_of_table[]; - -static const struct of_device_id __clkevt_of_table_sentinel - __used __section(__clkevt_of_table_end); - -int __init clockevent_probe(void) -{ - struct device_node *np; - const struct of_device_id *match; - of_init_fn_1_ret init_func; - int ret, clockevents = 0; - - for_each_matching_node_and_match(np, __clkevt_of_table, &match) { - if (!of_device_is_available(np)) - continue; - - init_func = match->data; - - ret = init_func(np); - if (ret) { - pr_warn("Failed to initialize '%s' (%d)\n", - np->name, ret); - continue; - } - - clockevents++; - } - - if (!clockevents) { - pr_crit("%s: no matching clockevent found\n", __func__); - return -ENODEV; - } - - return 0; -} diff --git a/drivers/clocksource/clksrc-dbx500-prcmu.c b/drivers/clocksource/clksrc-dbx500-prcmu.c index c69e2772658d..c1b96dc5f444 100644 --- a/drivers/clocksource/clksrc-dbx500-prcmu.c +++ b/drivers/clocksource/clksrc-dbx500-prcmu.c @@ -86,5 +86,5 @@ static int __init clksrc_dbx500_prcmu_init(struct device_node *node) #endif return clocksource_register_hz(&clocksource_dbx500_prcmu, RATE_32K); } -CLOCKSOURCE_OF_DECLARE(dbx500_prcmu, "stericsson,db8500-prcmu-timer-4", +TIMER_OF_DECLARE(dbx500_prcmu, "stericsson,db8500-prcmu-timer-4", clksrc_dbx500_prcmu_init); diff --git a/drivers/clocksource/clksrc_st_lpc.c b/drivers/clocksource/clksrc_st_lpc.c index 03cc49217bb4..a1d01ebb81f5 100644 --- a/drivers/clocksource/clksrc_st_lpc.c +++ b/drivers/clocksource/clksrc_st_lpc.c @@ -132,4 +132,4 @@ static int __init st_clksrc_of_register(struct device_node *np) return ret; } -CLOCKSOURCE_OF_DECLARE(ddata, "st,stih407-lpc", st_clksrc_of_register); +TIMER_OF_DECLARE(ddata, "st,stih407-lpc", st_clksrc_of_register); diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c index 24db6d605549..a8dd80576c95 100644 --- a/drivers/clocksource/clps711x-timer.c +++ b/drivers/clocksource/clps711x-timer.c @@ -103,7 +103,7 @@ void __init clps711x_clksrc_init(void __iomem *tc1_base, void __iomem *tc2_base, BUG_ON(_clps711x_clkevt_init(tc2, tc2_base, irq)); } -#ifdef CONFIG_CLKSRC_OF +#ifdef CONFIG_TIMER_OF static int __init clps711x_timer_init(struct device_node *np) { unsigned int irq = irq_of_parse_and_map(np, 0); @@ -119,5 +119,5 @@ static int __init clps711x_timer_init(struct device_node *np) return -EINVAL; } } -CLOCKSOURCE_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init); +TIMER_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init); #endif diff --git a/drivers/clocksource/dw_apb_timer_of.c b/drivers/clocksource/dw_apb_timer_of.c index aee6c0d39a7c..69866cd8f4bb 100644 --- a/drivers/clocksource/dw_apb_timer_of.c +++ 
b/drivers/clocksource/dw_apb_timer_of.c @@ -167,7 +167,7 @@ static int __init dw_apb_timer_init(struct device_node *timer) return 0; } -CLOCKSOURCE_OF_DECLARE(pc3x2_timer, "picochip,pc3x2-timer", dw_apb_timer_init); -CLOCKSOURCE_OF_DECLARE(apb_timer_osc, "snps,dw-apb-timer-osc", dw_apb_timer_init); -CLOCKSOURCE_OF_DECLARE(apb_timer_sp, "snps,dw-apb-timer-sp", dw_apb_timer_init); -CLOCKSOURCE_OF_DECLARE(apb_timer, "snps,dw-apb-timer", dw_apb_timer_init); +TIMER_OF_DECLARE(pc3x2_timer, "picochip,pc3x2-timer", dw_apb_timer_init); +TIMER_OF_DECLARE(apb_timer_osc, "snps,dw-apb-timer-osc", dw_apb_timer_init); +TIMER_OF_DECLARE(apb_timer_sp, "snps,dw-apb-timer-sp", dw_apb_timer_init); +TIMER_OF_DECLARE(apb_timer, "snps,dw-apb-timer", dw_apb_timer_init); diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 670ff0f25b67..7a244b681876 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -610,5 +610,5 @@ static int __init mct_init_ppi(struct device_node *np) { return mct_init_dt(np, MCT_INT_PPI); } -CLOCKSOURCE_OF_DECLARE(exynos4210, "samsung,exynos4210-mct", mct_init_spi); -CLOCKSOURCE_OF_DECLARE(exynos4412, "samsung,exynos4412-mct", mct_init_ppi); +TIMER_OF_DECLARE(exynos4210, "samsung,exynos4210-mct", mct_init_spi); +TIMER_OF_DECLARE(exynos4412, "samsung,exynos4412-mct", mct_init_ppi); diff --git a/drivers/clocksource/fsl_ftm_timer.c b/drivers/clocksource/fsl_ftm_timer.c index 738515b89073..3121e2d96c91 100644 --- a/drivers/clocksource/fsl_ftm_timer.c +++ b/drivers/clocksource/fsl_ftm_timer.c @@ -369,4 +369,4 @@ err: kfree(priv); return ret; } -CLOCKSOURCE_OF_DECLARE(flextimer, "fsl,ftm-timer", ftm_timer_init); +TIMER_OF_DECLARE(flextimer, "fsl,ftm-timer", ftm_timer_init); diff --git a/drivers/clocksource/h8300_timer16.c b/drivers/clocksource/h8300_timer16.c index 5b27fb9997c2..dfbd4f8051cb 100644 --- a/drivers/clocksource/h8300_timer16.c +++ b/drivers/clocksource/h8300_timer16.c @@ -187,5 +187,5 @@ free_clk: return ret; } -CLOCKSOURCE_OF_DECLARE(h8300_16bit, "renesas,16bit-timer", +TIMER_OF_DECLARE(h8300_16bit, "renesas,16bit-timer", h8300_16timer_init); diff --git a/drivers/clocksource/h8300_timer8.c b/drivers/clocksource/h8300_timer8.c index 804c489531d6..f6ffb0cef091 100644 --- a/drivers/clocksource/h8300_timer8.c +++ b/drivers/clocksource/h8300_timer8.c @@ -207,4 +207,4 @@ free_clk: return ret; } -CLOCKSOURCE_OF_DECLARE(h8300_8bit, "renesas,8bit-timer", h8300_8timer_init); +TIMER_OF_DECLARE(h8300_8bit, "renesas,8bit-timer", h8300_8timer_init); diff --git a/drivers/clocksource/h8300_tpu.c b/drivers/clocksource/h8300_tpu.c index 72e1cf2b3096..45a8d17dac1e 100644 --- a/drivers/clocksource/h8300_tpu.c +++ b/drivers/clocksource/h8300_tpu.c @@ -154,4 +154,4 @@ free_clk: return ret; } -CLOCKSOURCE_OF_DECLARE(h8300_tpu, "renesas,tpu", h8300_tpu_init); +TIMER_OF_DECLARE(h8300_tpu, "renesas,tpu", h8300_tpu_init); diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c index 7c61226f4359..5d3d88e0fc8c 100644 --- a/drivers/clocksource/jcore-pit.c +++ b/drivers/clocksource/jcore-pit.c @@ -246,4 +246,4 @@ static int __init jcore_pit_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(jcore_pit, "jcore,pit", jcore_pit_init); +TIMER_OF_DECLARE(jcore_pit, "jcore,pit", jcore_pit_init); diff --git a/drivers/clocksource/meson6_timer.c b/drivers/clocksource/meson6_timer.c index 39d21f693a33..92f20991a937 100644 --- a/drivers/clocksource/meson6_timer.c +++ b/drivers/clocksource/meson6_timer.c @@ -174,5 +174,5 
@@ static int __init meson6_timer_init(struct device_node *node) 1, 0xfffe); return 0; } -CLOCKSOURCE_OF_DECLARE(meson6, "amlogic,meson6-timer", +TIMER_OF_DECLARE(meson6, "amlogic,meson6-timer", meson6_timer_init); diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index 3f52ee219923..e31e08326024 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -200,5 +200,5 @@ static int __init gic_clocksource_of_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(mips_gic_timer, "mti,gic-timer", +TIMER_OF_DECLARE(mips_gic_timer, "mti,gic-timer", gic_clocksource_of_init); diff --git a/drivers/clocksource/moxart_timer.c b/drivers/clocksource/moxart_timer.c deleted file mode 100644 index 7f3430654fbd..000000000000 --- a/drivers/clocksource/moxart_timer.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * MOXA ART SoCs timer handling. - * - * Copyright (C) 2013 Jonas Jensen - * - * Jonas Jensen <jonas.jensen@gmail.com> - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#include <linux/clk.h> -#include <linux/clockchips.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/irqreturn.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> -#include <linux/io.h> -#include <linux/clocksource.h> -#include <linux/bitops.h> -#include <linux/slab.h> - -#define TIMER1_BASE 0x00 -#define TIMER2_BASE 0x10 -#define TIMER3_BASE 0x20 - -#define REG_COUNT 0x0 /* writable */ -#define REG_LOAD 0x4 -#define REG_MATCH1 0x8 -#define REG_MATCH2 0xC - -#define TIMER_CR 0x30 -#define TIMER_INTR_STATE 0x34 -#define TIMER_INTR_MASK 0x38 - -/* - * Moxart TIMER_CR flags: - * - * MOXART_CR_*_CLOCK 0: PCLK, 1: EXT1CLK - * MOXART_CR_*_INT overflow interrupt enable bit - */ -#define MOXART_CR_1_ENABLE BIT(0) -#define MOXART_CR_1_CLOCK BIT(1) -#define MOXART_CR_1_INT BIT(2) -#define MOXART_CR_2_ENABLE BIT(3) -#define MOXART_CR_2_CLOCK BIT(4) -#define MOXART_CR_2_INT BIT(5) -#define MOXART_CR_3_ENABLE BIT(6) -#define MOXART_CR_3_CLOCK BIT(7) -#define MOXART_CR_3_INT BIT(8) -#define MOXART_CR_COUNT_UP BIT(9) - -#define MOXART_TIMER1_ENABLE (MOXART_CR_2_ENABLE | MOXART_CR_1_ENABLE) -#define MOXART_TIMER1_DISABLE (MOXART_CR_2_ENABLE) - -/* - * The ASpeed variant of the IP block has a different layout - * for the control register - */ -#define ASPEED_CR_1_ENABLE BIT(0) -#define ASPEED_CR_1_CLOCK BIT(1) -#define ASPEED_CR_1_INT BIT(2) -#define ASPEED_CR_2_ENABLE BIT(4) -#define ASPEED_CR_2_CLOCK BIT(5) -#define ASPEED_CR_2_INT BIT(6) -#define ASPEED_CR_3_ENABLE BIT(8) -#define ASPEED_CR_3_CLOCK BIT(9) -#define ASPEED_CR_3_INT BIT(10) - -#define ASPEED_TIMER1_ENABLE (ASPEED_CR_2_ENABLE | ASPEED_CR_1_ENABLE) -#define ASPEED_TIMER1_DISABLE (ASPEED_CR_2_ENABLE) - -struct moxart_timer { - void __iomem *base; - unsigned int t1_disable_val; - unsigned int t1_enable_val; - unsigned int count_per_tick; - struct clock_event_device clkevt; -}; - -static inline struct moxart_timer *to_moxart(struct clock_event_device *evt) -{ - return container_of(evt, struct moxart_timer, clkevt); -} - -static inline void moxart_disable(struct clock_event_device *evt) -{ - struct moxart_timer *timer = to_moxart(evt); - - writel(timer->t1_disable_val, timer->base + TIMER_CR); -} - -static inline void moxart_enable(struct clock_event_device *evt) -{ - struct moxart_timer *timer = 
to_moxart(evt); - - writel(timer->t1_enable_val, timer->base + TIMER_CR); -} - -static int moxart_shutdown(struct clock_event_device *evt) -{ - moxart_disable(evt); - return 0; -} - -static int moxart_set_oneshot(struct clock_event_device *evt) -{ - moxart_disable(evt); - writel(~0, to_moxart(evt)->base + TIMER1_BASE + REG_LOAD); - return 0; -} - -static int moxart_set_periodic(struct clock_event_device *evt) -{ - struct moxart_timer *timer = to_moxart(evt); - - moxart_disable(evt); - writel(timer->count_per_tick, timer->base + TIMER1_BASE + REG_LOAD); - writel(0, timer->base + TIMER1_BASE + REG_MATCH1); - moxart_enable(evt); - return 0; -} - -static int moxart_clkevt_next_event(unsigned long cycles, - struct clock_event_device *evt) -{ - struct moxart_timer *timer = to_moxart(evt); - u32 u; - - moxart_disable(evt); - - u = readl(timer->base + TIMER1_BASE + REG_COUNT) - cycles; - writel(u, timer->base + TIMER1_BASE + REG_MATCH1); - - moxart_enable(evt); - - return 0; -} - -static irqreturn_t moxart_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - evt->event_handler(evt); - return IRQ_HANDLED; -} - -static int __init moxart_timer_init(struct device_node *node) -{ - int ret, irq; - unsigned long pclk; - struct clk *clk; - struct moxart_timer *timer; - - timer = kzalloc(sizeof(*timer), GFP_KERNEL); - if (!timer) - return -ENOMEM; - - timer->base = of_iomap(node, 0); - if (!timer->base) { - pr_err("%s: of_iomap failed\n", node->full_name); - ret = -ENXIO; - goto out_free; - } - - irq = irq_of_parse_and_map(node, 0); - if (irq <= 0) { - pr_err("%s: irq_of_parse_and_map failed\n", node->full_name); - ret = -EINVAL; - goto out_unmap; - } - - clk = of_clk_get(node, 0); - if (IS_ERR(clk)) { - pr_err("%s: of_clk_get failed\n", node->full_name); - ret = PTR_ERR(clk); - goto out_unmap; - } - - pclk = clk_get_rate(clk); - - if (of_device_is_compatible(node, "moxa,moxart-timer")) { - timer->t1_enable_val = MOXART_TIMER1_ENABLE; - timer->t1_disable_val = MOXART_TIMER1_DISABLE; - } else if (of_device_is_compatible(node, "aspeed,ast2400-timer")) { - timer->t1_enable_val = ASPEED_TIMER1_ENABLE; - timer->t1_disable_val = ASPEED_TIMER1_DISABLE; - } else { - pr_err("%s: unknown platform\n", node->full_name); - ret = -EINVAL; - goto out_unmap; - } - - timer->count_per_tick = DIV_ROUND_CLOSEST(pclk, HZ); - - timer->clkevt.name = node->name; - timer->clkevt.rating = 200; - timer->clkevt.features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT; - timer->clkevt.set_state_shutdown = moxart_shutdown; - timer->clkevt.set_state_periodic = moxart_set_periodic; - timer->clkevt.set_state_oneshot = moxart_set_oneshot; - timer->clkevt.tick_resume = moxart_set_oneshot; - timer->clkevt.set_next_event = moxart_clkevt_next_event; - timer->clkevt.cpumask = cpumask_of(0); - timer->clkevt.irq = irq; - - ret = clocksource_mmio_init(timer->base + TIMER2_BASE + REG_COUNT, - "moxart_timer", pclk, 200, 32, - clocksource_mmio_readl_down); - if (ret) { - pr_err("%s: clocksource_mmio_init failed\n", node->full_name); - goto out_unmap; - } - - ret = request_irq(irq, moxart_timer_interrupt, IRQF_TIMER, - node->name, &timer->clkevt); - if (ret) { - pr_err("%s: setup_irq failed\n", node->full_name); - goto out_unmap; - } - - /* Clear match registers */ - writel(0, timer->base + TIMER1_BASE + REG_MATCH1); - writel(0, timer->base + TIMER1_BASE + REG_MATCH2); - writel(0, timer->base + TIMER2_BASE + REG_MATCH1); - writel(0, timer->base + TIMER2_BASE + REG_MATCH2); - - /* - * Start timer 2 rolling as our 
main wall clock source, keep timer 1 - * disabled - */ - writel(0, timer->base + TIMER_CR); - writel(~0, timer->base + TIMER2_BASE + REG_LOAD); - writel(timer->t1_disable_val, timer->base + TIMER_CR); - - /* - * documentation is not publicly available: - * min_delta / max_delta obtained by trial-and-error, - * max_delta 0xfffffffe should be ok because count - * register size is u32 - */ - clockevents_config_and_register(&timer->clkevt, pclk, 0x4, 0xfffffffe); - - return 0; - -out_unmap: - iounmap(timer->base); -out_free: - kfree(timer); - return ret; -} -CLOCKSOURCE_OF_DECLARE(moxart, "moxa,moxart-timer", moxart_timer_init); -CLOCKSOURCE_OF_DECLARE(aspeed, "aspeed,ast2400-timer", moxart_timer_init); diff --git a/drivers/clocksource/mps2-timer.c b/drivers/clocksource/mps2-timer.c index 3e4431ed9aa9..aa4d63af8706 100644 --- a/drivers/clocksource/mps2-timer.c +++ b/drivers/clocksource/mps2-timer.c @@ -274,4 +274,4 @@ static int __init mps2_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(mps2_timer, "arm,mps2-timer", mps2_timer_init); +TIMER_OF_DECLARE(mps2_timer, "arm,mps2-timer", mps2_timer_init); diff --git a/drivers/clocksource/mtk_timer.c b/drivers/clocksource/mtk_timer.c index 90659493c59c..f9b724fd9950 100644 --- a/drivers/clocksource/mtk_timer.c +++ b/drivers/clocksource/mtk_timer.c @@ -265,4 +265,4 @@ err_kzalloc: return -EINVAL; } -CLOCKSOURCE_OF_DECLARE(mtk_mt6577, "mediatek,mt6577-timer", mtk_timer_init); +TIMER_OF_DECLARE(mtk_mt6577, "mediatek,mt6577-timer", mtk_timer_init); diff --git a/drivers/clocksource/mxs_timer.c b/drivers/clocksource/mxs_timer.c index 99b77aff0839..a03434e9fe8f 100644 --- a/drivers/clocksource/mxs_timer.c +++ b/drivers/clocksource/mxs_timer.c @@ -293,4 +293,4 @@ static int __init mxs_timer_init(struct device_node *np) return setup_irq(irq, &mxs_timer_irq); } -CLOCKSOURCE_OF_DECLARE(mxs, "fsl,timrot", mxs_timer_init); +TIMER_OF_DECLARE(mxs, "fsl,timrot", mxs_timer_init); diff --git a/drivers/clocksource/nomadik-mtu.c b/drivers/clocksource/nomadik-mtu.c index 7d44de304f37..8e4ddb9420c6 100644 --- a/drivers/clocksource/nomadik-mtu.c +++ b/drivers/clocksource/nomadik-mtu.c @@ -284,5 +284,5 @@ static int __init nmdk_timer_of_init(struct device_node *node) return nmdk_timer_init(base, irq, pclk, clk); } -CLOCKSOURCE_OF_DECLARE(nomadik_mtu, "st,nomadik-mtu", +TIMER_OF_DECLARE(nomadik_mtu, "st,nomadik-mtu", nmdk_timer_of_init); diff --git a/drivers/clocksource/pxa_timer.c b/drivers/clocksource/pxa_timer.c index a10fa667325f..08cd6eaf3795 100644 --- a/drivers/clocksource/pxa_timer.c +++ b/drivers/clocksource/pxa_timer.c @@ -216,7 +216,7 @@ static int __init pxa_timer_dt_init(struct device_node *np) return pxa_timer_common_init(irq, clk_get_rate(clk)); } -CLOCKSOURCE_OF_DECLARE(pxa_timer, "marvell,pxa-timer", pxa_timer_dt_init); +TIMER_OF_DECLARE(pxa_timer, "marvell,pxa-timer", pxa_timer_dt_init); /* * Legacy timer init for non device-tree boards. 
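The MOXA ART timer driver removed above is not lost: the "moxa,moxart-timer" and "aspeed,ast2400-timer" compatibles are re-registered against the rewritten Faraday FTTMR010 driver in the timer-fttmr010.c hunks further down. Condensed from those hunks, the dispatch is roughly the following sketch: a common init routine takes a flag selecting the Aspeed register layout, and thin per-compatible wrappers are registered with TIMER_OF_DECLARE (the stub body here stands in for the full implementation shown below).

static int __init fttmr010_common_init(struct device_node *np, bool is_aspeed)
{
	/* full implementation is in the timer-fttmr010.c hunks below */
	return 0;
}

/* Aspeed variants use the shifted control-register layout and count down */
static __init int aspeed_timer_init(struct device_node *np)
{
	return fttmr010_common_init(np, true);
}

static __init int fttmr010_timer_init(struct device_node *np)
{
	return fttmr010_common_init(np, false);
}

TIMER_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_init);
TIMER_OF_DECLARE(moxart, "moxa,moxart-timer", fttmr010_timer_init);
TIMER_OF_DECLARE(ast2400, "aspeed,ast2400-timer", aspeed_timer_init);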
diff --git a/drivers/clocksource/qcom-timer.c b/drivers/clocksource/qcom-timer.c index ee358cdf4a07..89816f89ff3f 100644 --- a/drivers/clocksource/qcom-timer.c +++ b/drivers/clocksource/qcom-timer.c @@ -254,5 +254,5 @@ static int __init msm_dt_timer_init(struct device_node *np) return msm_timer_init(freq, 32, irq, !!percpu_offset); } -CLOCKSOURCE_OF_DECLARE(kpss_timer, "qcom,kpss-timer", msm_dt_timer_init); -CLOCKSOURCE_OF_DECLARE(scss_timer, "qcom,scss-timer", msm_dt_timer_init); +TIMER_OF_DECLARE(kpss_timer, "qcom,kpss-timer", msm_dt_timer_init); +TIMER_OF_DECLARE(scss_timer, "qcom,scss-timer", msm_dt_timer_init); diff --git a/drivers/clocksource/renesas-ostm.c b/drivers/clocksource/renesas-ostm.c index c76f57668fb2..6cffd7c6001a 100644 --- a/drivers/clocksource/renesas-ostm.c +++ b/drivers/clocksource/renesas-ostm.c @@ -262,4 +262,4 @@ err: return 0; } -CLOCKSOURCE_OF_DECLARE(ostm, "renesas,ostm", ostm_init); +TIMER_OF_DECLARE(ostm, "renesas,ostm", ostm_init); diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c index 49c02be50eca..c27f4c850d83 100644 --- a/drivers/clocksource/rockchip_timer.c +++ b/drivers/clocksource/rockchip_timer.c @@ -303,5 +303,5 @@ static int __init rk_timer_init(struct device_node *np) return -EINVAL; } -CLOCKSOURCE_OF_DECLARE(rk3288_timer, "rockchip,rk3288-timer", rk_timer_init); -CLOCKSOURCE_OF_DECLARE(rk3399_timer, "rockchip,rk3399-timer", rk_timer_init); +TIMER_OF_DECLARE(rk3288_timer, "rockchip,rk3288-timer", rk_timer_init); +TIMER_OF_DECLARE(rk3399_timer, "rockchip,rk3399-timer", rk_timer_init); diff --git a/drivers/clocksource/samsung_pwm_timer.c b/drivers/clocksource/samsung_pwm_timer.c index a68e6538c809..6d5d126357c2 100644 --- a/drivers/clocksource/samsung_pwm_timer.c +++ b/drivers/clocksource/samsung_pwm_timer.c @@ -418,7 +418,7 @@ void __init samsung_pwm_clocksource_init(void __iomem *base, _samsung_pwm_clocksource_init(); } -#ifdef CONFIG_CLKSRC_OF +#ifdef CONFIG_TIMER_OF static int __init samsung_pwm_alloc(struct device_node *np, const struct samsung_pwm_variant *variant) { @@ -466,7 +466,7 @@ static int __init s3c2410_pwm_clocksource_init(struct device_node *np) { return samsung_pwm_alloc(np, &s3c24xx_variant); } -CLOCKSOURCE_OF_DECLARE(s3c2410_pwm, "samsung,s3c2410-pwm", s3c2410_pwm_clocksource_init); +TIMER_OF_DECLARE(s3c2410_pwm, "samsung,s3c2410-pwm", s3c2410_pwm_clocksource_init); static const struct samsung_pwm_variant s3c64xx_variant = { .bits = 32, @@ -479,7 +479,7 @@ static int __init s3c64xx_pwm_clocksource_init(struct device_node *np) { return samsung_pwm_alloc(np, &s3c64xx_variant); } -CLOCKSOURCE_OF_DECLARE(s3c6400_pwm, "samsung,s3c6400-pwm", s3c64xx_pwm_clocksource_init); +TIMER_OF_DECLARE(s3c6400_pwm, "samsung,s3c6400-pwm", s3c64xx_pwm_clocksource_init); static const struct samsung_pwm_variant s5p64x0_variant = { .bits = 32, @@ -492,7 +492,7 @@ static int __init s5p64x0_pwm_clocksource_init(struct device_node *np) { return samsung_pwm_alloc(np, &s5p64x0_variant); } -CLOCKSOURCE_OF_DECLARE(s5p6440_pwm, "samsung,s5p6440-pwm", s5p64x0_pwm_clocksource_init); +TIMER_OF_DECLARE(s5p6440_pwm, "samsung,s5p6440-pwm", s5p64x0_pwm_clocksource_init); static const struct samsung_pwm_variant s5p_variant = { .bits = 32, @@ -505,5 +505,5 @@ static int __init s5p_pwm_clocksource_init(struct device_node *np) { return samsung_pwm_alloc(np, &s5p_variant); } -CLOCKSOURCE_OF_DECLARE(s5pc100_pwm, "samsung,s5pc100-pwm", s5p_pwm_clocksource_init); +TIMER_OF_DECLARE(s5pc100_pwm, "samsung,s5pc100-pwm", 
s5p_pwm_clocksource_init); #endif diff --git a/drivers/clocksource/sun4i_timer.c b/drivers/clocksource/sun4i_timer.c index 4452d5c8f304..3e4bc64ff176 100644 --- a/drivers/clocksource/sun4i_timer.c +++ b/drivers/clocksource/sun4i_timer.c @@ -233,5 +233,5 @@ static int __init sun4i_timer_init(struct device_node *node) return ret; } -CLOCKSOURCE_OF_DECLARE(sun4i, "allwinner,sun4i-a10-timer", +TIMER_OF_DECLARE(sun4i, "allwinner,sun4i-a10-timer", sun4i_timer_init); diff --git a/drivers/clocksource/tango_xtal.c b/drivers/clocksource/tango_xtal.c index 12fcef8cf2d3..c4e1c2e6046f 100644 --- a/drivers/clocksource/tango_xtal.c +++ b/drivers/clocksource/tango_xtal.c @@ -53,4 +53,4 @@ static int __init tango_clocksource_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(tango, "sigma,tick-counter", tango_clocksource_init); +TIMER_OF_DECLARE(tango, "sigma,tick-counter", tango_clocksource_init); diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index d4ca9962a759..828729c70a0c 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -9,6 +9,7 @@ #include <linux/ioport.h> #include <linux/io.h> #include <linux/platform_device.h> +#include <linux/syscore_ops.h> #include <linux/atmel_tc.h> @@ -40,6 +41,14 @@ */ static void __iomem *tcaddr; +static struct +{ + u32 cmr; + u32 imr; + u32 rc; + bool clken; +} tcb_cache[3]; +static u32 bmr_cache; static u64 tc_get_cycles(struct clocksource *cs) { @@ -61,12 +70,54 @@ static u64 tc_get_cycles32(struct clocksource *cs) return __raw_readl(tcaddr + ATMEL_TC_REG(0, CV)); } +void tc_clksrc_suspend(struct clocksource *cs) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tcb_cache); i++) { + tcb_cache[i].cmr = readl(tcaddr + ATMEL_TC_REG(i, CMR)); + tcb_cache[i].imr = readl(tcaddr + ATMEL_TC_REG(i, IMR)); + tcb_cache[i].rc = readl(tcaddr + ATMEL_TC_REG(i, RC)); + tcb_cache[i].clken = !!(readl(tcaddr + ATMEL_TC_REG(i, SR)) & + ATMEL_TC_CLKSTA); + } + + bmr_cache = readl(tcaddr + ATMEL_TC_BMR); +} + +void tc_clksrc_resume(struct clocksource *cs) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tcb_cache); i++) { + /* Restore registers for the channel, RA and RB are not used */ + writel(tcb_cache[i].cmr, tcaddr + ATMEL_TC_REG(i, CMR)); + writel(tcb_cache[i].rc, tcaddr + ATMEL_TC_REG(i, RC)); + writel(0, tcaddr + ATMEL_TC_REG(i, RA)); + writel(0, tcaddr + ATMEL_TC_REG(i, RB)); + /* Disable all the interrupts */ + writel(0xff, tcaddr + ATMEL_TC_REG(i, IDR)); + /* Reenable interrupts that were enabled before suspending */ + writel(tcb_cache[i].imr, tcaddr + ATMEL_TC_REG(i, IER)); + /* Start the clock if it was used */ + if (tcb_cache[i].clken) + writel(ATMEL_TC_CLKEN, tcaddr + ATMEL_TC_REG(i, CCR)); + } + + /* Dual channel, chain channels */ + writel(bmr_cache, tcaddr + ATMEL_TC_BMR); + /* Finally, trigger all the channels*/ + writel(ATMEL_TC_SYNC, tcaddr + ATMEL_TC_BCR); +} + static struct clocksource clksrc = { .name = "tcb_clksrc", .rating = 200, .read = tc_get_cycles, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .suspend = tc_clksrc_suspend, + .resume = tc_clksrc_resume, }; #ifdef CONFIG_GENERIC_CLOCKEVENTS diff --git a/drivers/clocksource/tegra20_timer.c b/drivers/clocksource/tegra20_timer.c index b9990b9c98c5..c337a8100a7b 100644 --- a/drivers/clocksource/tegra20_timer.c +++ b/drivers/clocksource/tegra20_timer.c @@ -237,7 +237,7 @@ static int __init tegra20_init_timer(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(tegra20_timer, "nvidia,tegra20-timer", 
tegra20_init_timer); +TIMER_OF_DECLARE(tegra20_timer, "nvidia,tegra20-timer", tegra20_init_timer); static int __init tegra20_init_rtc(struct device_node *np) { @@ -261,4 +261,4 @@ static int __init tegra20_init_rtc(struct device_node *np) return register_persistent_clock(NULL, tegra_read_persistent_clock64); } -CLOCKSOURCE_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc); +TIMER_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc); diff --git a/drivers/clocksource/time-armada-370-xp.c b/drivers/clocksource/time-armada-370-xp.c index aea4380129ea..edf1a46269f1 100644 --- a/drivers/clocksource/time-armada-370-xp.c +++ b/drivers/clocksource/time-armada-370-xp.c @@ -351,7 +351,7 @@ static int __init armada_xp_timer_init(struct device_node *np) return armada_370_xp_timer_common_init(np); } -CLOCKSOURCE_OF_DECLARE(armada_xp, "marvell,armada-xp-timer", +TIMER_OF_DECLARE(armada_xp, "marvell,armada-xp-timer", armada_xp_timer_init); static int __init armada_375_timer_init(struct device_node *np) @@ -389,7 +389,7 @@ static int __init armada_375_timer_init(struct device_node *np) return armada_370_xp_timer_common_init(np); } -CLOCKSOURCE_OF_DECLARE(armada_375, "marvell,armada-375-timer", +TIMER_OF_DECLARE(armada_375, "marvell,armada-375-timer", armada_375_timer_init); static int __init armada_370_timer_init(struct device_node *np) @@ -412,5 +412,5 @@ static int __init armada_370_timer_init(struct device_node *np) return armada_370_xp_timer_common_init(np); } -CLOCKSOURCE_OF_DECLARE(armada_370, "marvell,armada-370-timer", +TIMER_OF_DECLARE(armada_370, "marvell,armada-370-timer", armada_370_timer_init); diff --git a/drivers/clocksource/time-efm32.c b/drivers/clocksource/time-efm32.c index ce0f97b4e5db..257e810ec1ad 100644 --- a/drivers/clocksource/time-efm32.c +++ b/drivers/clocksource/time-efm32.c @@ -283,5 +283,5 @@ static int __init efm32_timer_init(struct device_node *np) return ret; } -CLOCKSOURCE_OF_DECLARE(efm32compat, "efm32,timer", efm32_timer_init); -CLOCKSOURCE_OF_DECLARE(efm32, "energymicro,efm32-timer", efm32_timer_init); +TIMER_OF_DECLARE(efm32compat, "efm32,timer", efm32_timer_init); +TIMER_OF_DECLARE(efm32, "energymicro,efm32-timer", efm32_timer_init); diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 9649cfdb9213..d51a62a79ef7 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -311,4 +311,4 @@ static int __init lpc32xx_timer_init(struct device_node *np) return ret; } -CLOCKSOURCE_OF_DECLARE(lpc32xx_timer, "nxp,lpc3220-timer", lpc32xx_timer_init); +TIMER_OF_DECLARE(lpc32xx_timer, "nxp,lpc3220-timer", lpc32xx_timer_init); diff --git a/drivers/clocksource/time-orion.c b/drivers/clocksource/time-orion.c index b9b97f630c4d..12202067fe4b 100644 --- a/drivers/clocksource/time-orion.c +++ b/drivers/clocksource/time-orion.c @@ -189,4 +189,4 @@ static int __init orion_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(orion_timer, "marvell,orion-timer", orion_timer_init); +TIMER_OF_DECLARE(orion_timer, "marvell,orion-timer", orion_timer_init); diff --git a/drivers/clocksource/time-pistachio.c b/drivers/clocksource/time-pistachio.c index 3710e4d9dcba..a2dd85d0c1d7 100644 --- a/drivers/clocksource/time-pistachio.c +++ b/drivers/clocksource/time-pistachio.c @@ -214,5 +214,5 @@ static int __init pistachio_clksrc_of_init(struct device_node *node) sched_clock_register(pistachio_read_sched_clock, 32, rate); return clocksource_register_hz(&pcs_gpt.cs, rate); } 
-CLOCKSOURCE_OF_DECLARE(pistachio_gptimer, "img,pistachio-gptimer", +TIMER_OF_DECLARE(pistachio_gptimer, "img,pistachio-gptimer", pistachio_clksrc_of_init); diff --git a/drivers/clocksource/timer-atlas7.c b/drivers/clocksource/timer-atlas7.c index 50300eec4a39..62c4bbc55a7e 100644 --- a/drivers/clocksource/timer-atlas7.c +++ b/drivers/clocksource/timer-atlas7.c @@ -283,4 +283,4 @@ static int __init sirfsoc_of_timer_init(struct device_node *np) return sirfsoc_atlas7_timer_init(np); } -CLOCKSOURCE_OF_DECLARE(sirfsoc_atlas7_timer, "sirf,atlas7-tick", sirfsoc_of_timer_init); +TIMER_OF_DECLARE(sirfsoc_atlas7_timer, "sirf,atlas7-tick", sirfsoc_of_timer_init); diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index cc112351dc70..ec8a4376f74f 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -255,5 +255,5 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(at91sam926x_pit, "atmel,at91sam9260-pit", +TIMER_OF_DECLARE(at91sam926x_pit, "atmel,at91sam9260-pit", at91sam926x_pit_dt_init); diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c index be4ac7604136..d2e660f475af 100644 --- a/drivers/clocksource/timer-atmel-st.c +++ b/drivers/clocksource/timer-atmel-st.c @@ -260,5 +260,5 @@ static int __init atmel_st_timer_init(struct device_node *node) /* register clocksource */ return clocksource_register_hz(&clk32k, sclk_rate); } -CLOCKSOURCE_OF_DECLARE(atmel_st_timer, "atmel,at91rm9200-st", +TIMER_OF_DECLARE(atmel_st_timer, "atmel,at91rm9200-st", atmel_st_timer_init); diff --git a/drivers/clocksource/timer-digicolor.c b/drivers/clocksource/timer-digicolor.c index 94a161eb9cce..1e984a4d8ad0 100644 --- a/drivers/clocksource/timer-digicolor.c +++ b/drivers/clocksource/timer-digicolor.c @@ -203,5 +203,5 @@ static int __init digicolor_timer_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(conexant_digicolor, "cnxt,cx92755-timer", +TIMER_OF_DECLARE(conexant_digicolor, "cnxt,cx92755-timer", digicolor_timer_init); diff --git a/drivers/clocksource/timer-fttmr010.c b/drivers/clocksource/timer-fttmr010.c index b4a6f1e4bc54..66dd909960c6 100644 --- a/drivers/clocksource/timer-fttmr010.c +++ b/drivers/clocksource/timer-fttmr010.c @@ -11,12 +11,13 @@ #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> -#include <linux/mfd/syscon.h> -#include <linux/regmap.h> #include <linux/clockchips.h> #include <linux/clocksource.h> #include <linux/sched_clock.h> #include <linux/clk.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/delay.h> /* * Register definitions for the timers @@ -37,267 +38,368 @@ #define TIMER_INTR_STATE (0x34) #define TIMER_INTR_MASK (0x38) -#define TIMER_1_CR_ENABLE (1 << 0) -#define TIMER_1_CR_CLOCK (1 << 1) -#define TIMER_1_CR_INT (1 << 2) -#define TIMER_2_CR_ENABLE (1 << 3) -#define TIMER_2_CR_CLOCK (1 << 4) -#define TIMER_2_CR_INT (1 << 5) -#define TIMER_3_CR_ENABLE (1 << 6) -#define TIMER_3_CR_CLOCK (1 << 7) -#define TIMER_3_CR_INT (1 << 8) -#define TIMER_1_CR_UPDOWN (1 << 9) -#define TIMER_2_CR_UPDOWN (1 << 10) -#define TIMER_3_CR_UPDOWN (1 << 11) -#define TIMER_DEFAULT_FLAGS (TIMER_1_CR_UPDOWN | \ - TIMER_3_CR_ENABLE | \ - TIMER_3_CR_UPDOWN) - -#define TIMER_1_INT_MATCH1 (1 << 0) -#define TIMER_1_INT_MATCH2 (1 << 1) -#define TIMER_1_INT_OVERFLOW (1 << 2) -#define TIMER_2_INT_MATCH1 (1 << 3) -#define TIMER_2_INT_MATCH2 (1 << 4) -#define TIMER_2_INT_OVERFLOW (1 << 
5) -#define TIMER_3_INT_MATCH1 (1 << 6) -#define TIMER_3_INT_MATCH2 (1 << 7) -#define TIMER_3_INT_OVERFLOW (1 << 8) +#define TIMER_1_CR_ENABLE BIT(0) +#define TIMER_1_CR_CLOCK BIT(1) +#define TIMER_1_CR_INT BIT(2) +#define TIMER_2_CR_ENABLE BIT(3) +#define TIMER_2_CR_CLOCK BIT(4) +#define TIMER_2_CR_INT BIT(5) +#define TIMER_3_CR_ENABLE BIT(6) +#define TIMER_3_CR_CLOCK BIT(7) +#define TIMER_3_CR_INT BIT(8) +#define TIMER_1_CR_UPDOWN BIT(9) +#define TIMER_2_CR_UPDOWN BIT(10) +#define TIMER_3_CR_UPDOWN BIT(11) + +/* + * The Aspeed AST2400 moves bits around in the control register + * and lacks bits for setting the timer to count upwards. + */ +#define TIMER_1_CR_ASPEED_ENABLE BIT(0) +#define TIMER_1_CR_ASPEED_CLOCK BIT(1) +#define TIMER_1_CR_ASPEED_INT BIT(2) +#define TIMER_2_CR_ASPEED_ENABLE BIT(4) +#define TIMER_2_CR_ASPEED_CLOCK BIT(5) +#define TIMER_2_CR_ASPEED_INT BIT(6) +#define TIMER_3_CR_ASPEED_ENABLE BIT(8) +#define TIMER_3_CR_ASPEED_CLOCK BIT(9) +#define TIMER_3_CR_ASPEED_INT BIT(10) + +#define TIMER_1_INT_MATCH1 BIT(0) +#define TIMER_1_INT_MATCH2 BIT(1) +#define TIMER_1_INT_OVERFLOW BIT(2) +#define TIMER_2_INT_MATCH1 BIT(3) +#define TIMER_2_INT_MATCH2 BIT(4) +#define TIMER_2_INT_OVERFLOW BIT(5) +#define TIMER_3_INT_MATCH1 BIT(6) +#define TIMER_3_INT_MATCH2 BIT(7) +#define TIMER_3_INT_OVERFLOW BIT(8) #define TIMER_INT_ALL_MASK 0x1ff -static unsigned int tick_rate; -static void __iomem *base; +struct fttmr010 { + void __iomem *base; + unsigned int tick_rate; + bool count_down; + u32 t1_enable_val; + struct clock_event_device clkevt; +#ifdef CONFIG_ARM + struct delay_timer delay_timer; +#endif +}; + +/* + * A local singleton used by sched_clock and delay timer reads, which are + * fast and stateless + */ +static struct fttmr010 *local_fttmr; + +static inline struct fttmr010 *to_fttmr010(struct clock_event_device *evt) +{ + return container_of(evt, struct fttmr010, clkevt); +} + +static unsigned long fttmr010_read_current_timer_up(void) +{ + return readl(local_fttmr->base + TIMER2_COUNT); +} + +static unsigned long fttmr010_read_current_timer_down(void) +{ + return ~readl(local_fttmr->base + TIMER2_COUNT); +} + +static u64 notrace fttmr010_read_sched_clock_up(void) +{ + return fttmr010_read_current_timer_up(); +} -static u64 notrace fttmr010_read_sched_clock(void) +static u64 notrace fttmr010_read_sched_clock_down(void) { - return readl(base + TIMER3_COUNT); + return fttmr010_read_current_timer_down(); } static int fttmr010_timer_set_next_event(unsigned long cycles, struct clock_event_device *evt) { + struct fttmr010 *fttmr010 = to_fttmr010(evt); u32 cr; - /* Setup the match register */ - cr = readl(base + TIMER1_COUNT); - writel(cr + cycles, base + TIMER1_MATCH1); - if (readl(base + TIMER1_COUNT) - cr > cycles) - return -ETIME; + /* Stop */ + cr = readl(fttmr010->base + TIMER_CR); + cr &= ~fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); + + /* Setup the match register forward/backward in time */ + cr = readl(fttmr010->base + TIMER1_COUNT); + if (fttmr010->count_down) + cr -= cycles; + else + cr += cycles; + writel(cr, fttmr010->base + TIMER1_MATCH1); + + /* Start */ + cr = readl(fttmr010->base + TIMER_CR); + cr |= fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); return 0; } static int fttmr010_timer_shutdown(struct clock_event_device *evt) { + struct fttmr010 *fttmr010 = to_fttmr010(evt); u32 cr; - /* - * Disable also for oneshot: the set_next() call will arm the timer - * instead. - */ - /* Stop timer and interrupt. 
*/ - cr = readl(base + TIMER_CR); - cr &= ~(TIMER_1_CR_ENABLE | TIMER_1_CR_INT); - writel(cr, base + TIMER_CR); + /* Stop */ + cr = readl(fttmr010->base + TIMER_CR); + cr &= ~fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); + + return 0; +} + +static int fttmr010_timer_set_oneshot(struct clock_event_device *evt) +{ + struct fttmr010 *fttmr010 = to_fttmr010(evt); + u32 cr; + + /* Stop */ + cr = readl(fttmr010->base + TIMER_CR); + cr &= ~fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); - /* Setup counter start from 0 */ - writel(0, base + TIMER1_COUNT); - writel(0, base + TIMER1_LOAD); + /* Setup counter start from 0 or ~0 */ + writel(0, fttmr010->base + TIMER1_COUNT); + if (fttmr010->count_down) + writel(~0, fttmr010->base + TIMER1_LOAD); + else + writel(0, fttmr010->base + TIMER1_LOAD); - /* enable interrupt */ - cr = readl(base + TIMER_INTR_MASK); + /* Enable interrupt */ + cr = readl(fttmr010->base + TIMER_INTR_MASK); cr &= ~(TIMER_1_INT_OVERFLOW | TIMER_1_INT_MATCH2); cr |= TIMER_1_INT_MATCH1; - writel(cr, base + TIMER_INTR_MASK); - - /* start the timer */ - cr = readl(base + TIMER_CR); - cr |= TIMER_1_CR_ENABLE; - writel(cr, base + TIMER_CR); + writel(cr, fttmr010->base + TIMER_INTR_MASK); return 0; } static int fttmr010_timer_set_periodic(struct clock_event_device *evt) { - u32 period = DIV_ROUND_CLOSEST(tick_rate, HZ); + struct fttmr010 *fttmr010 = to_fttmr010(evt); + u32 period = DIV_ROUND_CLOSEST(fttmr010->tick_rate, HZ); u32 cr; - /* Stop timer and interrupt */ - cr = readl(base + TIMER_CR); - cr &= ~(TIMER_1_CR_ENABLE | TIMER_1_CR_INT); - writel(cr, base + TIMER_CR); - - /* Setup timer to fire at 1/HT intervals. */ - cr = 0xffffffff - (period - 1); - writel(cr, base + TIMER1_COUNT); - writel(cr, base + TIMER1_LOAD); - - /* enable interrupt on overflow */ - cr = readl(base + TIMER_INTR_MASK); - cr &= ~(TIMER_1_INT_MATCH1 | TIMER_1_INT_MATCH2); - cr |= TIMER_1_INT_OVERFLOW; - writel(cr, base + TIMER_INTR_MASK); + /* Stop */ + cr = readl(fttmr010->base + TIMER_CR); + cr &= ~fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); + + /* Setup timer to fire at 1/HZ intervals. 
*/ + if (fttmr010->count_down) { + writel(period, fttmr010->base + TIMER1_LOAD); + writel(0, fttmr010->base + TIMER1_MATCH1); + } else { + cr = 0xffffffff - (period - 1); + writel(cr, fttmr010->base + TIMER1_COUNT); + writel(cr, fttmr010->base + TIMER1_LOAD); + + /* Enable interrupt on overflow */ + cr = readl(fttmr010->base + TIMER_INTR_MASK); + cr &= ~(TIMER_1_INT_MATCH1 | TIMER_1_INT_MATCH2); + cr |= TIMER_1_INT_OVERFLOW; + writel(cr, fttmr010->base + TIMER_INTR_MASK); + } /* Start the timer */ - cr = readl(base + TIMER_CR); - cr |= TIMER_1_CR_ENABLE; - cr |= TIMER_1_CR_INT; - writel(cr, base + TIMER_CR); + cr = readl(fttmr010->base + TIMER_CR); + cr |= fttmr010->t1_enable_val; + writel(cr, fttmr010->base + TIMER_CR); return 0; } -/* Use TIMER1 as clock event */ -static struct clock_event_device fttmr010_clockevent = { - .name = "TIMER1", - /* Reasonably fast and accurate clock event */ - .rating = 300, - .shift = 32, - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = fttmr010_timer_set_next_event, - .set_state_shutdown = fttmr010_timer_shutdown, - .set_state_periodic = fttmr010_timer_set_periodic, - .set_state_oneshot = fttmr010_timer_shutdown, - .tick_resume = fttmr010_timer_shutdown, -}; - /* * IRQ handler for the timer */ static irqreturn_t fttmr010_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = &fttmr010_clockevent; + struct clock_event_device *evt = dev_id; evt->event_handler(evt); return IRQ_HANDLED; } -static struct irqaction fttmr010_timer_irq = { - .name = "Faraday FTTMR010 Timer Tick", - .flags = IRQF_TIMER, - .handler = fttmr010_timer_interrupt, -}; - -static int __init fttmr010_timer_common_init(struct device_node *np) +static int __init fttmr010_common_init(struct device_node *np, bool is_aspeed) { + struct fttmr010 *fttmr010; int irq; + struct clk *clk; + int ret; + u32 val; + + /* + * These implementations require a clock reference. + * FIXME: we currently only support clocking using PCLK + * and using EXTCLK is not supported in the driver. + */ + clk = of_clk_get_by_name(np, "PCLK"); + if (IS_ERR(clk)) { + pr_err("could not get PCLK\n"); + return PTR_ERR(clk); + } + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("failed to enable PCLK\n"); + return ret; + } - base = of_iomap(np, 0); - if (!base) { + fttmr010 = kzalloc(sizeof(*fttmr010), GFP_KERNEL); + if (!fttmr010) { + ret = -ENOMEM; + goto out_disable_clock; + } + fttmr010->tick_rate = clk_get_rate(clk); + + fttmr010->base = of_iomap(np, 0); + if (!fttmr010->base) { pr_err("Can't remap registers"); - return -ENXIO; + ret = -ENXIO; + goto out_free; } /* IRQ for timer 1 */ irq = irq_of_parse_and_map(np, 0); if (irq <= 0) { pr_err("Can't parse IRQ"); - return -EINVAL; + ret = -EINVAL; + goto out_unmap; + } + + /* + * The Aspeed AST2400 moves bits around in the control register, + * otherwise it works the same. + */ + if (is_aspeed) { + fttmr010->t1_enable_val = TIMER_1_CR_ASPEED_ENABLE | + TIMER_1_CR_ASPEED_INT; + /* Downward not available */ + fttmr010->count_down = true; + } else { + fttmr010->t1_enable_val = TIMER_1_CR_ENABLE | TIMER_1_CR_INT; } /* * Reset the interrupt mask and status */ - writel(TIMER_INT_ALL_MASK, base + TIMER_INTR_MASK); - writel(0, base + TIMER_INTR_STATE); - writel(TIMER_DEFAULT_FLAGS, base + TIMER_CR); + writel(TIMER_INT_ALL_MASK, fttmr010->base + TIMER_INTR_MASK); + writel(0, fttmr010->base + TIMER_INTR_STATE); + + /* + * Enable timer 1 count up, timer 2 count up, except on Aspeed, + * where everything just counts down. 
+ */ + if (is_aspeed) + val = TIMER_2_CR_ASPEED_ENABLE; + else { + val = TIMER_2_CR_ENABLE; + if (!fttmr010->count_down) + val |= TIMER_1_CR_UPDOWN | TIMER_2_CR_UPDOWN; + } + writel(val, fttmr010->base + TIMER_CR); /* * Setup free-running clocksource timer (interrupts * disabled.) */ - writel(0, base + TIMER3_COUNT); - writel(0, base + TIMER3_LOAD); - writel(0, base + TIMER3_MATCH1); - writel(0, base + TIMER3_MATCH2); - clocksource_mmio_init(base + TIMER3_COUNT, - "fttmr010_clocksource", tick_rate, - 300, 32, clocksource_mmio_readl_up); - sched_clock_register(fttmr010_read_sched_clock, 32, tick_rate); + local_fttmr = fttmr010; + writel(0, fttmr010->base + TIMER2_COUNT); + writel(0, fttmr010->base + TIMER2_MATCH1); + writel(0, fttmr010->base + TIMER2_MATCH2); + + if (fttmr010->count_down) { + writel(~0, fttmr010->base + TIMER2_LOAD); + clocksource_mmio_init(fttmr010->base + TIMER2_COUNT, + "FTTMR010-TIMER2", + fttmr010->tick_rate, + 300, 32, clocksource_mmio_readl_down); + sched_clock_register(fttmr010_read_sched_clock_down, 32, + fttmr010->tick_rate); + } else { + writel(0, fttmr010->base + TIMER2_LOAD); + clocksource_mmio_init(fttmr010->base + TIMER2_COUNT, + "FTTMR010-TIMER2", + fttmr010->tick_rate, + 300, 32, clocksource_mmio_readl_up); + sched_clock_register(fttmr010_read_sched_clock_up, 32, + fttmr010->tick_rate); + } /* - * Setup clockevent timer (interrupt-driven.) + * Setup clockevent timer (interrupt-driven) on timer 1. */ - writel(0, base + TIMER1_COUNT); - writel(0, base + TIMER1_LOAD); - writel(0, base + TIMER1_MATCH1); - writel(0, base + TIMER1_MATCH2); - setup_irq(irq, &fttmr010_timer_irq); - fttmr010_clockevent.cpumask = cpumask_of(0); - clockevents_config_and_register(&fttmr010_clockevent, tick_rate, + writel(0, fttmr010->base + TIMER1_COUNT); + writel(0, fttmr010->base + TIMER1_LOAD); + writel(0, fttmr010->base + TIMER1_MATCH1); + writel(0, fttmr010->base + TIMER1_MATCH2); + ret = request_irq(irq, fttmr010_timer_interrupt, IRQF_TIMER, + "FTTMR010-TIMER1", &fttmr010->clkevt); + if (ret) { + pr_err("FTTMR010-TIMER1 no IRQ\n"); + goto out_unmap; + } + + fttmr010->clkevt.name = "FTTMR010-TIMER1"; + /* Reasonably fast and accurate clock event */ + fttmr010->clkevt.rating = 300; + fttmr010->clkevt.features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT; + fttmr010->clkevt.set_next_event = fttmr010_timer_set_next_event; + fttmr010->clkevt.set_state_shutdown = fttmr010_timer_shutdown; + fttmr010->clkevt.set_state_periodic = fttmr010_timer_set_periodic; + fttmr010->clkevt.set_state_oneshot = fttmr010_timer_set_oneshot; + fttmr010->clkevt.tick_resume = fttmr010_timer_shutdown; + fttmr010->clkevt.cpumask = cpumask_of(0); + fttmr010->clkevt.irq = irq; + clockevents_config_and_register(&fttmr010->clkevt, + fttmr010->tick_rate, 1, 0xffffffff); - return 0; -} +#ifdef CONFIG_ARM + /* Also use this timer for delays */ + if (fttmr010->count_down) + fttmr010->delay_timer.read_current_timer = + fttmr010_read_current_timer_down; + else + fttmr010->delay_timer.read_current_timer = + fttmr010_read_current_timer_up; + fttmr010->delay_timer.freq = fttmr010->tick_rate; + register_current_timer_delay(&fttmr010->delay_timer); +#endif -static int __init fttmr010_timer_of_init(struct device_node *np) -{ - /* - * These implementations require a clock reference. - * FIXME: we currently only support clocking using PCLK - * and using EXTCLK is not supported in the driver. 
- */ - struct clk *clk; + return 0; - clk = of_clk_get_by_name(np, "PCLK"); - if (IS_ERR(clk)) { - pr_err("could not get PCLK"); - return PTR_ERR(clk); - } - tick_rate = clk_get_rate(clk); +out_unmap: + iounmap(fttmr010->base); +out_free: + kfree(fttmr010); +out_disable_clock: + clk_disable_unprepare(clk); - return fttmr010_timer_common_init(np); + return ret; } -CLOCKSOURCE_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_of_init); -/* - * Gemini-specific: relevant registers in the global syscon - */ -#define GLOBAL_STATUS 0x04 -#define CPU_AHB_RATIO_MASK (0x3 << 18) -#define CPU_AHB_1_1 (0x0 << 18) -#define CPU_AHB_3_2 (0x1 << 18) -#define CPU_AHB_24_13 (0x2 << 18) -#define CPU_AHB_2_1 (0x3 << 18) -#define REG_TO_AHB_SPEED(reg) ((((reg) >> 15) & 0x7) * 10 + 130) - -static int __init gemini_timer_of_init(struct device_node *np) +static __init int aspeed_timer_init(struct device_node *np) { - static struct regmap *map; - int ret; - u32 val; - - map = syscon_regmap_lookup_by_phandle(np, "syscon"); - if (IS_ERR(map)) { - pr_err("Can't get regmap for syscon handle\n"); - return -ENODEV; - } - ret = regmap_read(map, GLOBAL_STATUS, &val); - if (ret) { - pr_err("Can't read syscon status register\n"); - return -ENXIO; - } - - tick_rate = REG_TO_AHB_SPEED(val) * 1000000; - pr_info("Bus: %dMHz ", tick_rate / 1000000); - - tick_rate /= 6; /* APB bus run AHB*(1/6) */ - - switch (val & CPU_AHB_RATIO_MASK) { - case CPU_AHB_1_1: - pr_cont("(1/1)\n"); - break; - case CPU_AHB_3_2: - pr_cont("(3/2)\n"); - break; - case CPU_AHB_24_13: - pr_cont("(24/13)\n"); - break; - case CPU_AHB_2_1: - pr_cont("(2/1)\n"); - break; - } + return fttmr010_common_init(np, true); +} - return fttmr010_timer_common_init(np); +static __init int fttmr010_timer_init(struct device_node *np) +{ + return fttmr010_common_init(np, false); } -CLOCKSOURCE_OF_DECLARE(gemini, "cortina,gemini-timer", gemini_timer_of_init); + +TIMER_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_init); +TIMER_OF_DECLARE(gemini, "cortina,gemini-timer", fttmr010_timer_init); +TIMER_OF_DECLARE(moxart, "moxa,moxart-timer", fttmr010_timer_init); +TIMER_OF_DECLARE(ast2400, "aspeed,ast2400-timer", aspeed_timer_init); +TIMER_OF_DECLARE(ast2500, "aspeed,ast2500-timer", aspeed_timer_init); diff --git a/drivers/clocksource/timer-imx-gpt.c b/drivers/clocksource/timer-imx-gpt.c index f595460bfc58..6ec6d79b237c 100644 --- a/drivers/clocksource/timer-imx-gpt.c +++ b/drivers/clocksource/timer-imx-gpt.c @@ -545,15 +545,15 @@ static int __init imx6dl_timer_init_dt(struct device_node *np) return mxc_timer_init_dt(np, GPT_TYPE_IMX6DL); } -CLOCKSOURCE_OF_DECLARE(imx1_timer, "fsl,imx1-gpt", imx1_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx21_timer, "fsl,imx21-gpt", imx21_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx27_timer, "fsl,imx27-gpt", imx21_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx31_timer, "fsl,imx31-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx25_timer, "fsl,imx25-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx50_timer, "fsl,imx50-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx51_timer, "fsl,imx51-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx53_timer, "fsl,imx53-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx6q_timer, "fsl,imx6q-gpt", imx31_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx6dl_timer, "fsl,imx6dl-gpt", imx6dl_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx6sl_timer, "fsl,imx6sl-gpt", imx6dl_timer_init_dt); -CLOCKSOURCE_OF_DECLARE(imx6sx_timer, "fsl,imx6sx-gpt", imx6dl_timer_init_dt); 
+TIMER_OF_DECLARE(imx1_timer, "fsl,imx1-gpt", imx1_timer_init_dt); +TIMER_OF_DECLARE(imx21_timer, "fsl,imx21-gpt", imx21_timer_init_dt); +TIMER_OF_DECLARE(imx27_timer, "fsl,imx27-gpt", imx21_timer_init_dt); +TIMER_OF_DECLARE(imx31_timer, "fsl,imx31-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx25_timer, "fsl,imx25-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx50_timer, "fsl,imx50-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx51_timer, "fsl,imx51-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx53_timer, "fsl,imx53-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx6q_timer, "fsl,imx6q-gpt", imx31_timer_init_dt); +TIMER_OF_DECLARE(imx6dl_timer, "fsl,imx6dl-gpt", imx6dl_timer_init_dt); +TIMER_OF_DECLARE(imx6sl_timer, "fsl,imx6sl-gpt", imx6dl_timer_init_dt); +TIMER_OF_DECLARE(imx6sx_timer, "fsl,imx6sx-gpt", imx6dl_timer_init_dt); diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index 04ad3066e190..2ff64d9d4fb3 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -232,5 +232,5 @@ static int __init integrator_ap_timer_init_of(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(integrator_ap_timer, "arm,integrator-timer", +TIMER_OF_DECLARE(integrator_ap_timer, "arm,integrator-timer", integrator_ap_timer_init_of); diff --git a/drivers/clocksource/timer-keystone.c b/drivers/clocksource/timer-keystone.c index ab68a47ab3b4..0eee03250cfc 100644 --- a/drivers/clocksource/timer-keystone.c +++ b/drivers/clocksource/timer-keystone.c @@ -226,5 +226,5 @@ err: return error; } -CLOCKSOURCE_OF_DECLARE(keystone_timer, "ti,keystone-timer", +TIMER_OF_DECLARE(keystone_timer, "ti,keystone-timer", keystone_timer_init); diff --git a/drivers/clocksource/timer-nps.c b/drivers/clocksource/timer-nps.c index e74ea1722ad3..7b6bb0df96ae 100644 --- a/drivers/clocksource/timer-nps.c +++ b/drivers/clocksource/timer-nps.c @@ -110,9 +110,9 @@ static int __init nps_setup_clocksource(struct device_node *node) return ret; } -CLOCKSOURCE_OF_DECLARE(ezchip_nps400_clksrc, "ezchip,nps400-timer", +TIMER_OF_DECLARE(ezchip_nps400_clksrc, "ezchip,nps400-timer", nps_setup_clocksource); -CLOCKSOURCE_OF_DECLARE(ezchip_nps400_clk_src, "ezchip,nps400-timer1", +TIMER_OF_DECLARE(ezchip_nps400_clk_src, "ezchip,nps400-timer1", nps_setup_clocksource); #ifdef CONFIG_EZNPS_MTM_EXT @@ -279,6 +279,6 @@ static int __init nps_setup_clockevent(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(ezchip_nps400_clk_evt, "ezchip,nps400-timer0", +TIMER_OF_DECLARE(ezchip_nps400_clk_evt, "ezchip,nps400-timer0", nps_setup_clockevent); #endif /* CONFIG_EZNPS_MTM_EXT */ diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c new file mode 100644 index 000000000000..64b1c2081a67 --- /dev/null +++ b/drivers/clocksource/timer-of.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2017, Linaro Ltd. All rights reserved. + * + * Author: Daniel Lezcano <daniel.lezcano@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/clk.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/slab.h> + +#include "timer-of.h" + +static __init void timer_irq_exit(struct of_timer_irq *of_irq) +{ + struct timer_of *to = container_of(of_irq, struct timer_of, of_irq); + + struct clock_event_device *clkevt = &to->clkevt; + + of_irq->percpu ? free_percpu_irq(of_irq->irq, clkevt) : + free_irq(of_irq->irq, clkevt); +} + +static __init int timer_irq_init(struct device_node *np, + struct of_timer_irq *of_irq) +{ + int ret; + struct timer_of *to = container_of(of_irq, struct timer_of, of_irq); + struct clock_event_device *clkevt = &to->clkevt; + + of_irq->irq = of_irq->name ? of_irq_get_byname(np, of_irq->name): + irq_of_parse_and_map(np, of_irq->index); + if (!of_irq->irq) { + pr_err("Failed to map interrupt for %s\n", np->full_name); + return -EINVAL; + } + + ret = of_irq->percpu ? + request_percpu_irq(of_irq->irq, of_irq->handler, + np->full_name, clkevt) : + request_irq(of_irq->irq, of_irq->handler, + of_irq->flags ? of_irq->flags : IRQF_TIMER, + np->full_name, clkevt); + if (ret) { + pr_err("Failed to request irq %d for %s\n", of_irq->irq, + np->full_name); + return ret; + } + + clkevt->irq = of_irq->irq; + + return 0; +} + +static __init void timer_clk_exit(struct of_timer_clk *of_clk) +{ + of_clk->rate = 0; + clk_disable_unprepare(of_clk->clk); + clk_put(of_clk->clk); +} + +static __init int timer_clk_init(struct device_node *np, + struct of_timer_clk *of_clk) +{ + int ret; + + of_clk->clk = of_clk->name ? of_clk_get_by_name(np, of_clk->name) : + of_clk_get(np, of_clk->index); + if (IS_ERR(of_clk->clk)) { + pr_err("Failed to get clock for %s\n", np->full_name); + return PTR_ERR(of_clk->clk); + } + + ret = clk_prepare_enable(of_clk->clk); + if (ret) { + pr_err("Failed for enable clock for %s\n", np->full_name); + goto out_clk_put; + } + + of_clk->rate = clk_get_rate(of_clk->clk); + if (!of_clk->rate) { + ret = -EINVAL; + pr_err("Failed to get clock rate for %s\n", np->full_name); + goto out_clk_disable; + } + + of_clk->period = DIV_ROUND_UP(of_clk->rate, HZ); +out: + return ret; + +out_clk_disable: + clk_disable_unprepare(of_clk->clk); +out_clk_put: + clk_put(of_clk->clk); + + goto out; +} + +static __init void timer_base_exit(struct of_timer_base *of_base) +{ + iounmap(of_base->base); +} + +static __init int timer_base_init(struct device_node *np, + struct of_timer_base *of_base) +{ + const char *name = of_base->name ? 
of_base->name : np->full_name; + + of_base->base = of_io_request_and_map(np, of_base->index, name); + if (of_base->base) { + pr_err("Failed to iomap (%s)\n", name); + return -ENXIO; + } + + return 0; +} + +int __init timer_of_init(struct device_node *np, struct timer_of *to) +{ + int ret = -EINVAL; + int flags = 0; + + if (to->flags & TIMER_OF_BASE) { + ret = timer_base_init(np, &to->of_base); + if (ret) + goto out_fail; + flags |= TIMER_OF_BASE; + } + + if (to->flags & TIMER_OF_CLOCK) { + ret = timer_clk_init(np, &to->of_clk); + if (ret) + goto out_fail; + flags |= TIMER_OF_CLOCK; + } + + if (to->flags & TIMER_OF_IRQ) { + ret = timer_irq_init(np, &to->of_irq); + if (ret) + goto out_fail; + flags |= TIMER_OF_IRQ; + } + + if (!to->clkevt.name) + to->clkevt.name = np->name; + return ret; + +out_fail: + if (flags & TIMER_OF_IRQ) + timer_irq_exit(&to->of_irq); + + if (flags & TIMER_OF_CLOCK) + timer_clk_exit(&to->of_clk); + + if (flags & TIMER_OF_BASE) + timer_base_exit(&to->of_base); + return ret; +} diff --git a/drivers/clocksource/timer-of.h b/drivers/clocksource/timer-of.h new file mode 100644 index 000000000000..e0d727255f72 --- /dev/null +++ b/drivers/clocksource/timer-of.h @@ -0,0 +1,69 @@ +#ifndef __TIMER_OF_H__ +#define __TIMER_OF_H__ + +#include <linux/clockchips.h> + +#define TIMER_OF_BASE 0x1 +#define TIMER_OF_CLOCK 0x2 +#define TIMER_OF_IRQ 0x4 + +struct of_timer_irq { + int irq; + int index; + int percpu; + const char *name; + unsigned long flags; + irq_handler_t handler; +}; + +struct of_timer_base { + void __iomem *base; + const char *name; + int index; +}; + +struct of_timer_clk { + struct clk *clk; + const char *name; + int index; + unsigned long rate; + unsigned long period; +}; + +struct timer_of { + unsigned int flags; + struct clock_event_device clkevt; + struct of_timer_base of_base; + struct of_timer_irq of_irq; + struct of_timer_clk of_clk; + void *private_data; +}; + +static inline struct timer_of *to_timer_of(struct clock_event_device *clkevt) +{ + return container_of(clkevt, struct timer_of, clkevt); +} + +static inline void __iomem *timer_of_base(struct timer_of *to) +{ + return to->of_base.base; +} + +static inline int timer_of_irq(struct timer_of *to) +{ + return to->of_irq.irq; +} + +static inline unsigned long timer_of_rate(struct timer_of *to) +{ + return to->of_clk.rate; +} + +static inline unsigned long timer_of_period(struct timer_of *to) +{ + return to->of_clk.period; +} + +extern int __init timer_of_init(struct device_node *np, + struct timer_of *to); +#endif diff --git a/drivers/clocksource/timer-oxnas-rps.c b/drivers/clocksource/timer-oxnas-rps.c index d630bf417773..eed6feff8b5f 100644 --- a/drivers/clocksource/timer-oxnas-rps.c +++ b/drivers/clocksource/timer-oxnas-rps.c @@ -293,7 +293,7 @@ err_alloc: return ret; } -CLOCKSOURCE_OF_DECLARE(ox810se_rps, +TIMER_OF_DECLARE(ox810se_rps, "oxsemi,ox810se-rps-timer", oxnas_rps_timer_init); -CLOCKSOURCE_OF_DECLARE(ox820_rps, +TIMER_OF_DECLARE(ox820_rps, "oxsemi,ox820se-rps-timer", oxnas_rps_timer_init); diff --git a/drivers/clocksource/timer-prima2.c b/drivers/clocksource/timer-prima2.c index b4122ed1accb..20ff33b698df 100644 --- a/drivers/clocksource/timer-prima2.c +++ b/drivers/clocksource/timer-prima2.c @@ -245,5 +245,5 @@ static int __init sirfsoc_prima2_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(sirfsoc_prima2_timer, +TIMER_OF_DECLARE(sirfsoc_prima2_timer, "sirf,prima2-tick", sirfsoc_prima2_timer_init); diff --git a/drivers/clocksource/clksrc-probe.c 
b/drivers/clocksource/timer-probe.c index ac701ffb8d59..da81e5de74fe 100644 --- a/drivers/clocksource/clksrc-probe.c +++ b/drivers/clocksource/timer-probe.c @@ -19,20 +19,20 @@ #include <linux/of.h> #include <linux/clocksource.h> -extern struct of_device_id __clksrc_of_table[]; +extern struct of_device_id __timer_of_table[]; -static const struct of_device_id __clksrc_of_table_sentinel - __used __section(__clksrc_of_table_end); +static const struct of_device_id __timer_of_table_sentinel + __used __section(__timer_of_table_end); -void __init clocksource_probe(void) +void __init timer_probe(void) { struct device_node *np; const struct of_device_id *match; of_init_fn_1_ret init_func_ret; - unsigned clocksources = 0; + unsigned timers = 0; int ret; - for_each_matching_node_and_match(np, __clksrc_of_table, &match) { + for_each_matching_node_and_match(np, __timer_of_table, &match) { if (!of_device_is_available(np)) continue; @@ -45,11 +45,11 @@ void __init clocksource_probe(void) continue; } - clocksources++; + timers++; } - clocksources += acpi_probe_device_table(clksrc); + timers += acpi_probe_device_table(timer); - if (!clocksources) - pr_crit("%s: no matching clocksources found\n", __func__); + if (!timers) + pr_crit("%s: no matching timers found\n", __func__); } diff --git a/drivers/clocksource/timer-sp804.c b/drivers/clocksource/timer-sp804.c index 2d575a8c0939..3ac9dec9a038 100644 --- a/drivers/clocksource/timer-sp804.c +++ b/drivers/clocksource/timer-sp804.c @@ -287,7 +287,7 @@ err: iounmap(base); return ret; } -CLOCKSOURCE_OF_DECLARE(sp804, "arm,sp804", sp804_of_init); +TIMER_OF_DECLARE(sp804, "arm,sp804", sp804_of_init); static int __init integrator_cp_of_init(struct device_node *np) { @@ -335,4 +335,4 @@ err: iounmap(base); return ret; } -CLOCKSOURCE_OF_DECLARE(intcp, "arm,integrator-cp-timer", integrator_cp_of_init); +TIMER_OF_DECLARE(intcp, "arm,integrator-cp-timer", integrator_cp_of_init); diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c index 1b2574c4fb97..174d1243ea93 100644 --- a/drivers/clocksource/timer-stm32.c +++ b/drivers/clocksource/timer-stm32.c @@ -187,4 +187,4 @@ err_clk_get: return ret; } -CLOCKSOURCE_OF_DECLARE(stm32, "st,stm32-timer", stm32_clockevent_init); +TIMER_OF_DECLARE(stm32, "st,stm32-timer", stm32_clockevent_init); diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index c4656c4d44a6..2a3fe83ec337 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -359,7 +359,7 @@ static int __init sun5i_timer_init(struct device_node *node) return sun5i_setup_clockevent(node, timer_base, clk, irq); } -CLOCKSOURCE_OF_DECLARE(sun5i_a13, "allwinner,sun5i-a13-hstimer", +TIMER_OF_DECLARE(sun5i_a13, "allwinner,sun5i-a13-hstimer", sun5i_timer_init); -CLOCKSOURCE_OF_DECLARE(sun7i_a20, "allwinner,sun7i-a20-hstimer", +TIMER_OF_DECLARE(sun7i_a20, "allwinner,sun7i-a20-hstimer", sun5i_timer_init); diff --git a/drivers/clocksource/timer-ti-32k.c b/drivers/clocksource/timer-ti-32k.c index 624067712ef0..880a861ab3c8 100644 --- a/drivers/clocksource/timer-ti-32k.c +++ b/drivers/clocksource/timer-ti-32k.c @@ -124,5 +124,5 @@ static int __init ti_32k_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(ti_32k_timer, "ti,omap-counter32k", +TIMER_OF_DECLARE(ti_32k_timer, "ti,omap-counter32k", ti_32k_timer_init); diff --git a/drivers/clocksource/timer-u300.c b/drivers/clocksource/timer-u300.c index 704e40c6f151..be34b116d4d2 100644 --- a/drivers/clocksource/timer-u300.c +++ 
b/drivers/clocksource/timer-u300.c @@ -458,5 +458,5 @@ static int __init u300_timer_init_of(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(u300_timer, "stericsson,u300-apptimer", +TIMER_OF_DECLARE(u300_timer, "stericsson,u300-apptimer", u300_timer_init_of); diff --git a/drivers/clocksource/versatile.c b/drivers/clocksource/versatile.c index 220b490a8142..39725d38aede 100644 --- a/drivers/clocksource/versatile.c +++ b/drivers/clocksource/versatile.c @@ -38,7 +38,7 @@ static int __init versatile_sched_clock_init(struct device_node *node) return 0; } -CLOCKSOURCE_OF_DECLARE(vexpress, "arm,vexpress-sysreg", +TIMER_OF_DECLARE(vexpress, "arm,vexpress-sysreg", versatile_sched_clock_init); -CLOCKSOURCE_OF_DECLARE(versatile, "arm,versatile-sysreg", +TIMER_OF_DECLARE(versatile, "arm,versatile-sysreg", versatile_sched_clock_init); diff --git a/drivers/clocksource/vf_pit_timer.c b/drivers/clocksource/vf_pit_timer.c index e0849e20a307..0f92089ec08c 100644 --- a/drivers/clocksource/vf_pit_timer.c +++ b/drivers/clocksource/vf_pit_timer.c @@ -201,4 +201,4 @@ static int __init pit_timer_init(struct device_node *np) return pit_clockevent_init(clk_rate, irq); } -CLOCKSOURCE_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); +TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/clocksource/vt8500_timer.c b/drivers/clocksource/vt8500_timer.c index d02b51075ad1..e0f7489cfc8e 100644 --- a/drivers/clocksource/vt8500_timer.c +++ b/drivers/clocksource/vt8500_timer.c @@ -165,4 +165,4 @@ static int __init vt8500_timer_init(struct device_node *np) return 0; } -CLOCKSOURCE_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init); +TIMER_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init); diff --git a/drivers/clocksource/zevio-timer.c b/drivers/clocksource/zevio-timer.c index 9a53f5ef6157..a6a0338eea77 100644 --- a/drivers/clocksource/zevio-timer.c +++ b/drivers/clocksource/zevio-timer.c @@ -215,4 +215,4 @@ static int __init zevio_timer_init(struct device_node *node) return zevio_timer_add(node); } -CLOCKSOURCE_OF_DECLARE(zevio_timer, "lsi,zevio-timer", zevio_timer_init); +TIMER_OF_DECLARE(zevio_timer, "lsi,zevio-timer", zevio_timer_init); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 26b643d57847..29c5b0cbad96 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -887,7 +887,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct freq_attr *fattr = to_attr(attr); ssize_t ret = -EINVAL; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(policy->cpu)) { down_write(&policy->rwsem); @@ -895,7 +895,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, up_write(&policy->rwsem); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -2441,7 +2441,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) pr_debug("trying to register driver %s\n", driver_data->name); /* Protect against concurrent CPU online/offline. 
*/ - get_online_cpus(); + cpus_read_lock(); write_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver) { @@ -2474,9 +2474,10 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_if_unreg; } - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online", - cpuhp_cpufreq_online, - cpuhp_cpufreq_offline); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "cpufreq:online", + cpuhp_cpufreq_online, + cpuhp_cpufreq_offline); if (ret < 0) goto err_if_unreg; hp_online = ret; @@ -2494,7 +2495,7 @@ err_null_driver: cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); out: - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(cpufreq_register_driver); @@ -2517,17 +2518,17 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) pr_debug("unregistering driver %s\n", driver->name); /* Protect against concurrent cpu hotplug */ - get_online_cpus(); + cpus_read_lock(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); - cpuhp_remove_state_nocalls(hp_online); + cpuhp_remove_state_nocalls_cpuslocked(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - put_online_cpus(); + cpus_read_unlock(); return 0; } diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c index 35dd4d7ffee0..b257fc7d5204 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy) * We don't support CPU hotplug. Don't unmap after the system * has already made it to a running state. */ - if (system_state != SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return 0; if (sdcasr_mapbase) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2706be7ed334..60bb64f4329d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, entered_state = target_state->enter(dev, drv, index); start_critical_timings(); + sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 2e78b0b96d74..394db40ed374 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -112,6 +112,15 @@ config EFI_CAPSULE_LOADER Most users should say N. +config EFI_CAPSULE_QUIRK_QUARK_CSH + boolean "Add support for Quark capsules with non-standard headers" + depends on X86 && !64BIT + select EFI_CAPSULE_LOADER + default y + help + Add support for processing Quark X1000 EFI capsules, whose header + layout deviates from the layout mandated by the UEFI specification. 
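
The capsule-loader hunks further down in this series turn efi_capsule_setup_info() into a __weak hook precisely so that a platform selected by an option like the Quark one above can parse its vendor header before handing the capsule to the core loader. What follows is only an illustrative sketch of such an override, assuming the capsule_info / __efi_capsule_setup_info() interface shown in this series; it is not the in-tree Quark code, and struct vendor_csh with its csh_size field is invented for the example.

/*
 * Illustrative sketch only, not the actual Quark quirk: a platform
 * override of the __weak efi_capsule_setup_info() hook added below.
 * "struct vendor_csh" and its csh_size field are hypothetical.
 */
#include <linux/efi.h>
#include <linux/string.h>
#include <linux/types.h>

struct vendor_csh {
	u32 csh_size;	/* hypothetical: bytes occupied by the vendor header */
	/* ... other vendor-specific fields ... */
};

int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
			   size_t hdr_bytes)
{
	struct vendor_csh *csh = kbuff;

	/* Not enough data yet to even read the vendor header. */
	if (hdr_bytes < sizeof(*csh))
		return 0;

	/* Wait until the EFI header behind the vendor header has arrived. */
	if (hdr_bytes < csh->csh_size + sizeof(efi_capsule_header_t))
		return 0;

	/* A real quirk would sanity-check the vendor header here. */

	/* Copy the standard EFI capsule header from behind the vendor one. */
	memcpy(&cap_info->header, kbuff + csh->csh_size,
	       sizeof(cap_info->header));

	/* The uploaded image includes the vendor header as well. */
	cap_info->total_size = csh->csh_size + cap_info->header.imagesize;

	return __efi_capsule_setup_info(cap_info);
}

Keeping the header munging in a per-platform override like this leaves the generic loader free of vendor-specific layout knowledge, which is the point of the __weak split introduced by the hunks below.
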
+ config EFI_TEST tristate "EFI Runtime Service Tests Support" depends on EFI diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 974c5a31a005..1cc41c3d6315 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -11,6 +11,7 @@ * */ +#include <linux/dmi.h> #include <linux/efi.h> #include <linux/io.h> #include <linux/memblock.h> @@ -166,3 +167,18 @@ void efi_virtmap_unload(void) efi_set_pgd(current->active_mm); preempt_enable(); } + + +static int __init arm_dmi_init(void) +{ + /* + * On arm64/ARM, DMI depends on UEFI, and dmi_scan_machine() needs to + * be called early because dmi_id_init(), which is an arch_initcall + * itself, depends on dmi_scan_machine() having been called already. + */ + dmi_scan_machine(); + if (dmi_available) + dmi_set_dump_stack_arch_desc(); + return 0; +} +core_initcall(arm_dmi_init); diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index 9ae6c116c474..ec8ac5c4dd84 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -20,15 +20,9 @@ #define NO_FURTHER_WRITE_ACTION -1 -struct capsule_info { - bool header_obtained; - int reset_type; - long index; - size_t count; - size_t total_size; - struct page **pages; - size_t page_bytes_remain; -}; +#ifndef phys_to_page +#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT) +#endif /** * efi_free_all_buff_pages - free all previous allocated buffer pages @@ -41,65 +35,70 @@ struct capsule_info { static void efi_free_all_buff_pages(struct capsule_info *cap_info) { while (cap_info->index > 0) - __free_page(cap_info->pages[--cap_info->index]); + __free_page(phys_to_page(cap_info->pages[--cap_info->index])); cap_info->index = NO_FURTHER_WRITE_ACTION; } -/** - * efi_capsule_setup_info - obtain the efi capsule header in the binary and - * setup capsule_info structure - * @cap_info: pointer to current instance of capsule_info structure - * @kbuff: a mapped first page buffer pointer - * @hdr_bytes: the total received number of bytes for efi header - **/ -static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, - void *kbuff, size_t hdr_bytes) +int __efi_capsule_setup_info(struct capsule_info *cap_info) { - efi_capsule_header_t *cap_hdr; size_t pages_needed; int ret; void *temp_page; - /* Only process data block that is larger than efi header size */ - if (hdr_bytes < sizeof(efi_capsule_header_t)) - return 0; - - /* Reset back to the correct offset of header */ - cap_hdr = kbuff - cap_info->count; - pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT; + pages_needed = ALIGN(cap_info->total_size, PAGE_SIZE) / PAGE_SIZE; if (pages_needed == 0) { - pr_err("%s: pages count invalid\n", __func__); + pr_err("invalid capsule size"); return -EINVAL; } /* Check if the capsule binary supported */ - ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags, - cap_hdr->imagesize, + ret = efi_capsule_supported(cap_info->header.guid, + cap_info->header.flags, + cap_info->header.imagesize, &cap_info->reset_type); if (ret) { - pr_err("%s: efi_capsule_supported() failed\n", - __func__); + pr_err("capsule not supported\n"); return ret; } - cap_info->total_size = cap_hdr->imagesize; temp_page = krealloc(cap_info->pages, pages_needed * sizeof(void *), GFP_KERNEL | __GFP_ZERO); - if (!temp_page) { - pr_debug("%s: krealloc() failed\n", __func__); + if (!temp_page) return -ENOMEM; - } cap_info->pages = temp_page; - cap_info->header_obtained = true; return 0; } /** + * 
efi_capsule_setup_info - obtain the efi capsule header in the binary and + * setup capsule_info structure + * @cap_info: pointer to current instance of capsule_info structure + * @kbuff: a mapped first page buffer pointer + * @hdr_bytes: the total received number of bytes for efi header + * + * Platforms with non-standard capsule update mechanisms can override + * this __weak function so they can perform any required capsule + * image munging. See quark_quirk_function() for an example. + **/ +int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes) +{ + /* Only process data block that is larger than efi header size */ + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + memcpy(&cap_info->header, kbuff, sizeof(cap_info->header)); + cap_info->total_size = cap_info->header.imagesize; + + return __efi_capsule_setup_info(cap_info); +} + +/** * efi_capsule_submit_update - invoke the efi_capsule_update API once binary * upload done * @cap_info: pointer to current instance of capsule_info structure @@ -107,26 +106,17 @@ static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) { int ret; - void *cap_hdr_temp; - - cap_hdr_temp = vmap(cap_info->pages, cap_info->index, - VM_MAP, PAGE_KERNEL); - if (!cap_hdr_temp) { - pr_debug("%s: vmap() failed\n", __func__); - return -EFAULT; - } - ret = efi_capsule_update(cap_hdr_temp, cap_info->pages); - vunmap(cap_hdr_temp); + ret = efi_capsule_update(&cap_info->header, cap_info->pages); if (ret) { - pr_err("%s: efi_capsule_update() failed\n", __func__); + pr_err("capsule update failed\n"); return ret; } /* Indicate capsule binary uploading is done */ cap_info->index = NO_FURTHER_WRITE_ACTION; - pr_info("%s: Successfully upload capsule file with reboot type '%s'\n", - __func__, !cap_info->reset_type ? "RESET_COLD" : + pr_info("Successfully upload capsule file with reboot type '%s'\n", + !cap_info->reset_type ? "RESET_COLD" : cap_info->reset_type == 1 ? 
"RESET_WARM" : "RESET_SHUTDOWN"); return 0; @@ -171,37 +161,30 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, if (!cap_info->page_bytes_remain) { page = alloc_page(GFP_KERNEL); if (!page) { - pr_debug("%s: alloc_page() failed\n", __func__); ret = -ENOMEM; goto failed; } - cap_info->pages[cap_info->index++] = page; + cap_info->pages[cap_info->index++] = page_to_phys(page); cap_info->page_bytes_remain = PAGE_SIZE; + } else { + page = phys_to_page(cap_info->pages[cap_info->index - 1]); } - page = cap_info->pages[cap_info->index - 1]; - kbuff = kmap(page); - if (!kbuff) { - pr_debug("%s: kmap() failed\n", __func__); - ret = -EFAULT; - goto failed; - } kbuff += PAGE_SIZE - cap_info->page_bytes_remain; /* Copy capsule binary data from user space to kernel space buffer */ write_byte = min_t(size_t, count, cap_info->page_bytes_remain); if (copy_from_user(kbuff, buff, write_byte)) { - pr_debug("%s: copy_from_user() failed\n", __func__); ret = -EFAULT; goto fail_unmap; } cap_info->page_bytes_remain -= write_byte; /* Setup capsule binary info structure */ - if (!cap_info->header_obtained) { - ret = efi_capsule_setup_info(cap_info, kbuff, + if (cap_info->header.headersize == 0) { + ret = efi_capsule_setup_info(cap_info, kbuff - cap_info->count, cap_info->count + write_byte); if (ret) goto fail_unmap; @@ -211,11 +194,10 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, kunmap(page); /* Submit the full binary to efi_capsule_update() API */ - if (cap_info->header_obtained && + if (cap_info->header.headersize > 0 && cap_info->count >= cap_info->total_size) { if (cap_info->count > cap_info->total_size) { - pr_err("%s: upload size exceeded header defined size\n", - __func__); + pr_err("capsule upload size exceeded header defined size\n"); ret = -EINVAL; goto failed; } @@ -249,7 +231,7 @@ static int efi_capsule_flush(struct file *file, fl_owner_t id) struct capsule_info *cap_info = file->private_data; if (cap_info->index > 0) { - pr_err("%s: capsule upload not complete\n", __func__); + pr_err("capsule upload not complete\n"); efi_free_all_buff_pages(cap_info); ret = -ECANCELED; } @@ -328,8 +310,7 @@ static int __init efi_capsule_loader_init(void) ret = misc_register(&efi_capsule_misc); if (ret) - pr_err("%s: Failed to register misc char file note\n", - __func__); + pr_err("Unable to register capsule loader device\n"); return ret; } diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c index 6eedff45e6d7..901b9306bf94 100644 --- a/drivers/firmware/efi/capsule.c +++ b/drivers/firmware/efi/capsule.c @@ -214,7 +214,7 @@ efi_capsule_update_locked(efi_capsule_header_t *capsule, * * Return 0 on success, a converted EFI status code on failure. 
*/ -int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages) +int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages) { u32 imagesize = capsule->imagesize; efi_guid_t guid = capsule->guid; @@ -247,16 +247,13 @@ int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages) efi_capsule_block_desc_t *sglist; sglist = kmap(sg_pages[i]); - if (!sglist) { - rv = -ENOMEM; - goto out; - } for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) { - u64 sz = min_t(u64, imagesize, PAGE_SIZE); + u64 sz = min_t(u64, imagesize, + PAGE_SIZE - (u64)*pages % PAGE_SIZE); sglist[j].length = sz; - sglist[j].data = page_to_phys(*pages++); + sglist[j].data = *pages++; imagesize -= sz; count--; diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c index 8cd578f62059..08129b7b80ab 100644 --- a/drivers/firmware/efi/test/efi_test.c +++ b/drivers/firmware/efi/test/efi_test.c @@ -71,18 +71,13 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, if (!access_ok(VERIFY_READ, src, 1)) return -EFAULT; - buf = kmalloc(len, GFP_KERNEL); - if (!buf) { + buf = memdup_user(src, len); + if (IS_ERR(buf)) { *dst = NULL; - return -ENOMEM; + return PTR_ERR(buf); } *dst = buf; - if (copy_from_user(*dst, src, len)) { - kfree(buf); - return -EFAULT; - } - return 0; } diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 7b7c84369d78..7579b9702c22 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -129,7 +129,7 @@ struct drm_i915_gem_request { * It is used by the driver to then queue the request for execution. */ struct i915_sw_fence submit; - wait_queue_t submitq; + wait_queue_entry_t submitq; wait_queue_head_t execute; /* A list of everyone we wait upon, and everyone who waits upon us. diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index 474d23c0c0ce..f29540f922af 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -125,7 +125,7 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, struct list_head *continuation) { wait_queue_head_t *x = &fence->wait; - wait_queue_t *pos, *next; + wait_queue_entry_t *pos, *next; unsigned long flags; debug_fence_deactivate(fence); @@ -133,31 +133,30 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, /* * To prevent unbounded recursion as we traverse the graph of - * i915_sw_fences, we move the task_list from this, the next ready - * fence, to the tail of the original fence's task_list + * i915_sw_fences, we move the entry list from this, the next ready + * fence, to the tail of the original fence's entry list * (and so added to the list to be woken). 
*/ spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation); if (continuation) { - list_for_each_entry_safe(pos, next, &x->task_list, task_list) { + list_for_each_entry_safe(pos, next, &x->head, entry) { if (pos->func == autoremove_wake_function) pos->func(pos, TASK_NORMAL, 0, continuation); else - list_move_tail(&pos->task_list, continuation); + list_move_tail(&pos->entry, continuation); } } else { LIST_HEAD(extra); do { - list_for_each_entry_safe(pos, next, - &x->task_list, task_list) + list_for_each_entry_safe(pos, next, &x->head, entry) pos->func(pos, TASK_NORMAL, 0, &extra); if (list_empty(&extra)) break; - list_splice_tail_init(&extra, &x->task_list); + list_splice_tail_init(&extra, &x->head); } while (1); } spin_unlock_irqrestore(&x->lock, flags); @@ -222,9 +221,9 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence) i915_sw_fence_complete(fence); } -static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key) +static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key) { - list_del(&wq->task_list); + list_del(&wq->entry); __i915_sw_fence_complete(wq->private, key); if (wq->flags & I915_SW_FENCE_FLAG_ALLOC) @@ -235,7 +234,7 @@ static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void * static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, const struct i915_sw_fence * const signaler) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return false; @@ -243,7 +242,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, if (fence == signaler) return true; - list_for_each_entry(wq, &fence->wait.task_list, task_list) { + list_for_each_entry(wq, &fence->wait.head, entry) { if (wq->func != i915_sw_fence_wake) continue; @@ -256,12 +255,12 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return; - list_for_each_entry(wq, &fence->wait.task_list, task_list) { + list_for_each_entry(wq, &fence->wait.head, entry) { if (wq->func != i915_sw_fence_wake) continue; @@ -288,7 +287,7 @@ static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq, gfp_t gfp) + wait_queue_entry_t *wq, gfp_t gfp) { unsigned long flags; int pending; @@ -318,7 +317,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, pending |= I915_SW_FENCE_FLAG_ALLOC; } - INIT_LIST_HEAD(&wq->task_list); + INIT_LIST_HEAD(&wq->entry); wq->flags = pending; wq->func = i915_sw_fence_wake; wq->private = fence; @@ -327,7 +326,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, spin_lock_irqsave(&signaler->wait.lock, flags); if (likely(!i915_sw_fence_done(signaler))) { - __add_wait_queue_tail(&signaler->wait, wq); + __add_wait_queue_entry_tail(&signaler->wait, wq); pending = 1; } else { i915_sw_fence_wake(wq, 0, 0, NULL); @@ -340,7 +339,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq) + wait_queue_entry_t *wq) { return __i915_sw_fence_await_sw_fence(fence, signaler, wq, 0); } diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h 
b/drivers/gpu/drm/i915/i915_sw_fence.h index 1d3b6051daaf..fe2ef4dadfc6 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.h +++ b/drivers/gpu/drm/i915/i915_sw_fence.h @@ -65,7 +65,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence); int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *after, - wait_queue_t *wq); + wait_queue_entry_t *wq); int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence, struct i915_sw_fence *after, gfp_t gfp); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 68be1bfa22b9..5008f3d4cccc 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -377,7 +377,7 @@ struct radeon_fence { unsigned ring; bool is_vm_update; - wait_queue_t fence_wake; + wait_queue_entry_t fence_wake; }; int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index ef09f0a63754..e86f2bd38410 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -158,7 +158,7 @@ int radeon_fence_emit(struct radeon_device *rdev, * for the fence locking itself, so unlocked variants are used for * fence_signal, and remove_wait_queue. */ -static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key) +static int radeon_fence_check_signaled(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct radeon_fence *fence; u64 seq; diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 92f1452dad57..76875f6299b8 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -417,7 +417,7 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) { struct vga_device *vgadev, *conflict; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; int rc = 0; vga_check_first_use(); diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c index a51b6b64ecdf..93ee8fc539be 100644 --- a/drivers/hwtracing/coresight/coresight-etm3x.c +++ b/drivers/hwtracing/coresight/coresight-etm3x.c @@ -587,7 +587,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -597,7 +597,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -795,7 +795,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? 
pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -803,17 +803,17 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight:starting", - etm_starting_cpu, etm_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight:online", - etm_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight:starting", + etm_starting_cpu, etm_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight:online", + etm_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm_arch_supported(drvdata->arch) == false) { ret = -EINVAL; diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c index d1340fb4e457..532adc9dd32a 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.c +++ b/drivers/hwtracing/coresight/coresight-etm4x.c @@ -371,7 +371,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -381,7 +381,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -982,7 +982,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? 
pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -990,18 +990,18 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm4_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight4:starting", - etm4_starting_cpu, etm4_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight4:online", - etm4_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight4:starting", + etm4_starting_cpu, etm4_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight4:online", + etm4_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm4_arch_supported(drvdata->arch) == false) { ret = -EINVAL; diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index a3f18a22f5ed..e0f47cc2effc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1939,7 +1939,7 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) { struct i40iw_device *iwdev; - wait_queue_t wait; + wait_queue_entry_t wait; iwdev = dev->back_dev; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 06bb504e1b1e..43f56b6dd061 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4306,7 +4306,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; - if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled) + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; atsr = container_of(hdr, struct acpi_dmar_atsr, header); @@ -4556,7 +4556,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; - if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING) + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; list_for_each_entry(rmrru, &dmar_rmrr_units, list) { diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 19779b88a479..8cb60829a7a1 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np) * it never will be. We don't want to defer indefinitely, nor attempt * to dereference __iommu_of_table after it's been freed. 
*/ - if (system_state > SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return false; return of_match_node(&__iommu_of_table, np); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 9b80417cd547..73da1f5626cb 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k); struct btree_op { /* for waiting on btree reserve in btree_split() */ - wait_queue_t wait; + wait_queue_entry_t wait; /* Btree level at which we start taking write locks */ short lock; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index bed9ef17bc26..7ccffbb0019e 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -144,7 +144,7 @@ static inline int sleep_cond(wait_queue_head_t *wait_queue, int *condition) { int errno = 0; - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); @@ -171,7 +171,7 @@ sleep_timeout_cond(wait_queue_head_t *wait_queue, int *condition, int timeout) { - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 4623155ec36e..84143a02adce 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -3066,7 +3066,7 @@ static int airo_thread(void *data) { if (ai->jobs) { locked = down_interruptible(&ai->sem); } else { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); add_wait_queue(&ai->thr_wait, &wait); diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c index b2c6b065b542..ff153ce29539 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c @@ -2544,7 +2544,7 @@ static int prism2_ioctl_priv_prism2_param(struct net_device *dev, ret = -EINVAL; } if (local->iw_mode == IW_MODE_MASTER) { - wait_queue_t __wait; + wait_queue_entry_t __wait; init_waitqueue_entry(&__wait, current); add_wait_queue(&local->hostscan_wq, &__wait); set_current_state(TASK_INTERRUPTIBLE); diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c index 55f8c997e5cb..aefa88f4f29c 100644 --- a/drivers/net/wireless/marvell/libertas/main.c +++ b/drivers/net/wireless/marvell/libertas/main.c @@ -435,7 +435,7 @@ static int lbs_thread(void *data) { struct net_device *dev = data; struct lbs_private *priv = dev->ml_priv; - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 50f93a3bc4c0..1968e38ac160 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -320,10 +320,19 @@ static long local_pci_probe(void *_ddi) return 0; } +static bool pci_physfn_is_probed(struct pci_dev *dev) +{ +#ifdef CONFIG_PCI_IOV + return dev->is_virtfn && dev->physfn->is_probed; +#else + return false; +#endif +} + static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { - int error, node; + int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* @@ -332,33 +341,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, * on the right node. 
*/ node = dev_to_node(&dev->dev); + dev->is_probed = 1; + + cpu_hotplug_disable(); /* - * On NUMA systems, we are likely to call a PF probe function using - * work_on_cpu(). If that probe calls pci_enable_sriov() (which - * adds the VF devices via pci_bus_add_device()), we may re-enter - * this function to call the VF probe function. Calling - * work_on_cpu() again will cause a lockdep warning. Since VFs are - * always on the same node as the PF, we can work around this by - * avoiding work_on_cpu() when we're already on the correct node. - * - * Preemption is enabled, so it's theoretically unsafe to use - * numa_node_id(), but even if we run the probe function on the - * wrong node, it should be functionally correct. + * Prevent nesting work_on_cpu() for the case where a Virtual Function + * device is probed from work_on_cpu() of the Physical device. */ - if (node >= 0 && node != numa_node_id()) { - int cpu; - - get_online_cpus(); + if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || + pci_physfn_is_probed(dev)) + cpu = nr_cpu_ids; + else cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); - if (cpu < nr_cpu_ids) - error = work_on_cpu(cpu, local_pci_probe, &ddi); - else - error = local_pci_probe(&ddi); - put_online_cpus(); - } else + + if (cpu < nr_cpu_ids) + error = work_on_cpu(cpu, local_pci_probe, &ddi); + else error = local_pci_probe(&ddi); + dev->is_probed = 0; + cpu_hotplug_enable(); return error; } diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 94f8038864b4..ed4c343d08c4 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); -int __init parse_ras_param(char *str) +static int __init parse_ras_param(char *str) { #ifdef CONFIG_RAS_CEC parse_cec_param(str); diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 6b54f6c24c5f..80931114c899 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -709,7 +709,7 @@ static irqreturn_t dryice_irq(int irq, void *dev_id) /*If the write wait queue is empty then there is no pending operations. It means the interrupt is for DryIce -Security. 
IRQ must be returned as none.*/ - if (list_empty_careful(&imxdi->write_wait.task_list)) + if (list_empty_careful(&imxdi->write_wait.head)) return rc; /* DSR_WCF clears itself on DSR read */ diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index bd9e31e16249..16fc380b5512 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -48,7 +48,7 @@ #include <linux/wait.h> typedef wait_queue_head_t adpt_wait_queue_head_t; #define ADPT_DECLARE_WAIT_QUEUE_HEAD(wait) DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait) -typedef wait_queue_t adpt_wait_queue_t; +typedef wait_queue_entry_t adpt_wait_queue_entry_t; /* * message structures diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 3419e1bcdff6..67621308eb9c 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -301,13 +301,13 @@ static uint32_t ips_statupd_copperhead_memio(ips_ha_t *); static uint32_t ips_statupd_morpheus(ips_ha_t *); static ips_scb_t *ips_getscb(ips_ha_t *); static void ips_putq_scb_head(ips_scb_queue_t *, ips_scb_t *); -static void ips_putq_wait_tail(ips_wait_queue_t *, struct scsi_cmnd *); +static void ips_putq_wait_tail(ips_wait_queue_entry_t *, struct scsi_cmnd *); static void ips_putq_copp_tail(ips_copp_queue_t *, ips_copp_wait_item_t *); static ips_scb_t *ips_removeq_scb_head(ips_scb_queue_t *); static ips_scb_t *ips_removeq_scb(ips_scb_queue_t *, ips_scb_t *); -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *); -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *, +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *); +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *, struct scsi_cmnd *); static ips_copp_wait_item_t *ips_removeq_copp(ips_copp_queue_t *, ips_copp_wait_item_t *); @@ -2871,7 +2871,7 @@ ips_removeq_scb(ips_scb_queue_t * queue, ips_scb_t * item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) +static void ips_putq_wait_tail(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { METHOD_TRACE("ips_putq_wait_tail", 1); @@ -2902,7 +2902,7 @@ static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *queue) { struct scsi_cmnd *item; @@ -2936,7 +2936,7 @@ static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *queue, +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { struct scsi_cmnd *p; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index b782bb60baf0..366be3b2f9b4 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -989,7 +989,7 @@ typedef struct ips_wait_queue { struct scsi_cmnd *head; struct scsi_cmnd *tail; int count; -} ips_wait_queue_t; +} ips_wait_queue_entry_t; typedef struct ips_copp_wait_item { struct scsi_cmnd *scsi_cmd; @@ -1035,7 +1035,7 @@ typedef struct ips_ha { ips_stat_t sp; /* Status packer pointer */ struct ips_scb *scbs; /* Array of all CCBS */ struct 
ips_scb *scb_freelist; /* SCB free list */ - ips_wait_queue_t scb_waitlist; /* Pending SCB list */ + ips_wait_queue_entry_t scb_waitlist; /* Pending SCB list */ ips_copp_queue_t copp_waitlist; /* Pending PT list */ ips_scb_queue_t scb_activelist; /* Active SCB list */ IPS_IO_CMD *dummy; /* dummy command */ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 0db662d6abdd..85b242ec5f9b 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3267,7 +3267,7 @@ int kiblnd_connd(void *arg) { spinlock_t *lock = &kiblnd_data.kib_connd_lock; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct kib_conn *conn; int timeout; @@ -3521,7 +3521,7 @@ kiblnd_scheduler(void *arg) long id = (long)arg; struct kib_sched_info *sched; struct kib_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; int did_something; @@ -3656,7 +3656,7 @@ kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; struct kib_dev *dev; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; int rc; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 3ed3b08c122c..6b38d5a8fe92 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -2166,7 +2166,7 @@ ksocknal_connd(void *arg) { spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; struct ksock_connreq *cr; - wait_queue_t wait; + wait_queue_entry_t wait; int nloops = 0; int cons_retry = 0; @@ -2554,7 +2554,7 @@ ksocknal_check_peer_timeouts(int idx) int ksocknal_reaper(void *arg) { - wait_queue_t wait; + wait_queue_entry_t wait; struct ksock_conn *conn; struct ksock_sched *sched; struct list_head enomem_conns; diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c index c56e9922cd5b..49deb448b044 100644 --- a/drivers/staging/lustre/lnet/libcfs/debug.c +++ b/drivers/staging/lustre/lnet/libcfs/debug.c @@ -361,7 +361,7 @@ static int libcfs_debug_dumplog_thread(void *arg) void libcfs_debug_dumplog(void) { - wait_queue_t wait; + wait_queue_entry_t wait; struct task_struct *dumper; /* we're being careful to ensure that the kernel thread is diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c index 9599b7441feb..27082d2f7938 100644 --- a/drivers/staging/lustre/lnet/libcfs/tracefile.c +++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c @@ -990,7 +990,7 @@ static int tracefiled(void *arg) complete(&tctl->tctl_start); while (1) { - wait_queue_t __wait; + wait_queue_entry_t __wait; pc.pc_want_daemon_pages = 0; collect_pages(&pc); diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c index ce4b83584e17..9ebba4ef5f90 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-eq.c +++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -312,7 +312,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock) { int tms = *timeout_ms; int wait; - wait_queue_t wl; + wait_queue_entry_t wl; unsigned long now; if (!tms) diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index 9fca8d225ee0..f075706bba6d 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -516,7 
+516,7 @@ lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port, int lnet_sock_accept(struct socket **newsockp, struct socket *sock) { - wait_queue_t wait; + wait_queue_entry_t wait; struct socket *newsock; int rc; diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c index 999f250ceed0..bf31bc200d27 100644 --- a/drivers/staging/lustre/lustre/fid/fid_request.c +++ b/drivers/staging/lustre/lustre/fid/fid_request.c @@ -192,7 +192,7 @@ static int seq_client_alloc_seq(const struct lu_env *env, } static int seq_fid_alloc_prep(struct lu_client_seq *seq, - wait_queue_t *link) + wait_queue_entry_t *link) { if (seq->lcs_update) { add_wait_queue(&seq->lcs_waitq, link); @@ -223,7 +223,7 @@ static void seq_fid_alloc_fini(struct lu_client_seq *seq) int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, struct lu_fid *fid) { - wait_queue_t link; + wait_queue_entry_t link; int rc; LASSERT(seq); @@ -290,7 +290,7 @@ EXPORT_SYMBOL(seq_client_alloc_fid); */ void seq_client_flush(struct lu_client_seq *seq) { - wait_queue_t link; + wait_queue_entry_t link; LASSERT(seq); init_waitqueue_entry(&link, current); diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h index b04d613846ee..f24970da8323 100644 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -201,7 +201,7 @@ struct l_wait_info { sigmask(SIGALRM)) /** - * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * wait_queue_entry_t of Linux (version < 2.6.34) is a FIFO list for exclusively * waiting threads, which is not always desirable because all threads will * be waken up again and again, even user only needs a few of them to be * active most time. 
This is not good for performance because cache can @@ -228,7 +228,7 @@ struct l_wait_info { */ #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ - wait_queue_t __wait; \ + wait_queue_entry_t __wait; \ long __timeout = info->lwi_timeout; \ sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c index 8af611033e12..96515b839436 100644 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c @@ -207,7 +207,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) { struct lu_object_header *header = obj->co_lu.lo_header; - wait_queue_t waiter; + wait_queue_entry_t waiter; if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 391c632365ae..e889d3a7de9c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -370,7 +370,7 @@ struct lov_thread_info { struct ost_lvb lti_lvb; struct cl_2queue lti_cl2q; struct cl_page_list lti_plist; - wait_queue_t lti_waiter; + wait_queue_entry_t lti_waiter; struct cl_attr lti_attr; }; diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c index ab3ecfeeadc8..eddabbe31e5c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_object.c +++ b/drivers/staging/lustre/lustre/lov/lov_object.c @@ -371,7 +371,7 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, struct lov_layout_raid0 *r0; struct lu_site *site; struct lu_site_bkt_data *bkt; - wait_queue_t *waiter; + wait_queue_entry_t *waiter; r0 = &lov->u.raid0; LASSERT(r0->lo_sub[idx] == los); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index abcf951208d2..76ae600ae2c8 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -556,7 +556,7 @@ EXPORT_SYMBOL(lu_object_print); static struct lu_object *htable_lookup(struct lu_site *s, struct cfs_hash_bd *bd, const struct lu_fid *f, - wait_queue_t *waiter, + wait_queue_entry_t *waiter, __u64 *version) { struct lu_site_bkt_data *bkt; @@ -670,7 +670,7 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, const struct lu_object_conf *conf, - wait_queue_t *waiter) + wait_queue_entry_t *waiter) { struct lu_object *o; struct lu_object *shadow; @@ -750,7 +750,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, { struct lu_site_bkt_data *bkt; struct lu_object *obj; - wait_queue_t wait; + wait_queue_entry_t wait; while (1) { obj = lu_object_find_try(env, dev, f, conf, &wait); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 7e947ecf15f1..529c6e3cd537 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -184,7 +184,7 @@ static void hdlcdev_exit(struct slgt_info *info); struct cond_wait { struct cond_wait *next; wait_queue_head_t q; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int data; }; static void init_cond_wait(struct cond_wait *w, unsigned int data); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c 
index 27c89cd5d70b..4797217e5e72 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -43,7 +43,7 @@ static void virqfd_deactivate(struct virqfd *virqfd) queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); } -static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct virqfd *virqfd = container_of(wait, struct virqfd, wait); unsigned long flags = (unsigned long)key; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 042030e5a035..e4613a3c362d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -165,7 +165,7 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &poll->wait); } -static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, +static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f55671d53f28..f72095868b93 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -31,7 +31,7 @@ struct vhost_work { struct vhost_poll { poll_table table; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct vhost_work work; unsigned long mask; struct vhost_dev *dev; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c1ec8ee80924..9e35032351a0 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -190,6 +190,7 @@ static void do_poweroff(void) { switch (system_state) { case SYSTEM_BOOTING: + case SYSTEM_SCHEDULING: orderly_poweroff(true); break; case SYSTEM_RUNNING: diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index a493c7315e94..6cc1c15bcd84 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -408,6 +408,8 @@ static int __init xen_late_init_mcelog(void) if (ret) goto deregister; + pr_info("/dev/mcelog registered by Xen\n"); + return 0; deregister: diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..974f5346458a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -83,7 +83,7 @@ struct autofs_info { struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; /* We use the following to see what we are waiting for */ struct qstr name; u32 dev; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf9ca72..7071895b0678 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ @@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); - mp->wait_queue_token = wq->wait_queue_token; + mp->wait_queue_entry_token = wq->wait_queue_entry_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; @@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); - ep->wait_queue_token = wq->wait_queue_token; + ep->wait_queue_entry_token = wq->wait_queue_entry_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = 
'\0'; @@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); - packet->wait_queue_token = wq->wait_queue_token; + packet->wait_queue_entry_token = wq->wait_queue_entry_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; @@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; + wq->wait_queue_entry_token = autofs4_next_wait_queue; if (++autofs4_next_wait_queue == 0) autofs4_next_wait_queue = 1; wq->next = sbi->queues; @@ -461,7 +461,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); /* @@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); @@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if (wq->wait_queue_token == wait_queue_token) + if (wq->wait_queue_entry_token == wait_queue_entry_token) break; } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..bb3a02ca9da4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,7 +18,7 @@ #include <linux/fscache-cache.h> #include <linux/timer.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/workqueue.h> #include <linux/security.h> @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..18d7aa61ef0f 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = @@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, } /* remove from the waitqueue */ - list_del(&wait->task_list); + 
list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ ASSERT(monitor->op); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4d1fcd76d022..a8693632235f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/freezer.h> #include <linux/sched/signal.h> +#include <linux/wait_bit.h> #include <asm/div64.h> #include "cifsfs.h" @@ -84,7 +84,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; diff --git a/fs/eventfd.c b/fs/eventfd.c index beac8175de19..2fb4eadaa118 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..b1c8e23ddf65 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. */ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * can't use __remove_wait_queue(). whead->lock is held by * the caller. 
*/ - list_del_init(&wait->task_list); + list_del_init(&wait->entry); } spin_lock_irqsave(&ep->lock, flags); @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..e747b3d720ee 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); @@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p) rcu_read_unlock(); schedule(); rcu_read_lock(); - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) break; /* OK, we know p couldn't have been freed yet */ spin_lock_irq(&p->wait.lock); diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..70761d6cafcd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1891,11 +1891,11 @@ static void __wait_on_freeing_inode(struct inode *inode) wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); spin_lock(&inode_hash_lock); } @@ -2038,11 +2038,11 @@ static void __inode_dio_wait(struct inode *inode) DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); do { - prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); if (atomic_read(&inode->i_dio_count)) schedule(); } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wait); + finish_wait(wq, &q.wq_entry); } /** diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ebad34266bcf..7d5ef3bf3f3e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2579,10 +2579,10 @@ restart: wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); goto restart; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1c0ce9c15e94..c7ebba9eb611 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -7,6 +7,7 @@ #include <linux/security.h> #include <linux/crc32.h> #include <linux/nfs_page.h> +#include <linux/wait_bit.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e43fd1270c4..9b2c38732a84 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6373,7 +6373,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6416,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if 
(!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..70ded52dc1dd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; @@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); - list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, - wq.task_list) { + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { wrq->err = err; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..038d67545d9f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -46,8 +46,8 @@ static void run_down(struct slot_map *m) spin_lock(&m->q.lock); if (m->c != -1) { for (;;) { - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) @@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m) left = -EINTR; } while (left > 0); - if (!list_empty(&wait.task_list)) - list_del(&wait.task_list); + if (!list_empty(&wait.entry)) + list_del(&wait.entry); else if (left <= 0 && waitqueue_active(&m->q)) __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); __set_current_state(TASK_RUNNING); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 39bb1e838d8d..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index d6c652a31e99..5b524a977d91 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e3d71109f51..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) if (likely(!waitqueue_active(wqh))) return; - /* 
wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d622f276e3a..6148ccd6cccf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, * wouldn't be enough, the smp_mb__before_spinlock is * enough to avoid an explicit smp_mb() here. */ - list_del_init(&wq->task_list); + list_del_init(&wq->entry); out: return ret; } @@ -522,13 +522,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ - if (!list_empty_careful(&uwq.wq.task_list)) { + if (!list_empty_careful(&uwq.wq.entry)) { spin_lock(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ - list_del(&uwq.wq.task_list); + list_del(&uwq.wq.entry); spin_unlock(&ctx->fault_pending_wqh.lock); } @@ -860,7 +860,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -869,7 +869,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); + wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; @@ -1003,14 +1003,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.task_list + * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. 
*/ - list_del(&uwq->wq.task_list); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1032,7 +1032,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.task_list, &fork_event); + list_move(&uwq->wq.entry, &fork_event); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1069,8 +1069,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!list_empty(&fork_event)) { uwq = list_first_entry(&fork_event, typeof(*uwq), - wq.task_list); - list_del(&uwq->wq.task_list); + wq.entry); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); userfaultfd_event_complete(ctx, uwq); } @@ -1747,17 +1747,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d78c853265d4..0a9e6985a0d0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -269,12 +269,12 @@ xfs_inew_wait( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (!xfs_iflags_test(ip, XFS_INEW)) break; schedule(); } while (true); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ffbfe7d8bc6d..691a342989f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -622,12 +622,12 @@ __xfs_iflock( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_isiflocked(ip)) io_schedule(); } while (!xfs_iflock_nowait(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } STATIC uint @@ -2486,11 +2486,11 @@ __xfs_iunpin_wait( xfs_iunpin(ip); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_ipincount(ip)) io_schedule(); } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } void diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h new file mode 100644 index 000000000000..50401d925290 --- /dev/null +++ b/include/asm-generic/atomic-instrumented.h @@ -0,0 +1,316 @@ +#ifndef _LINUX_ATOMIC_INSTRUMENTED_H +#define _LINUX_ATOMIC_INSTRUMENTED_H + +static __always_inline int atomic_read(const atomic_t *v) +{ + return arch_atomic_read(v); +} + +static __always_inline s64 atomic64_read(const atomic64_t *v) +{ + return arch_atomic64_read(v); +} + +static __always_inline void atomic_set(atomic_t *v, int i) +{ + 
arch_atomic_set(v, i); +} + +static __always_inline void atomic64_set(atomic64_t *v, s64 i) +{ + arch_atomic64_set(v, i); +} + +static __always_inline int atomic_xchg(atomic_t *v, int i) +{ + return arch_atomic_xchg(v, i); +} + +static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) +{ + return arch_atomic64_xchg(v, i); +} + +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return arch_atomic_cmpxchg(v, old, new); +} + +static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return arch_atomic64_cmpxchg(v, old, new); +} + +#ifdef arch_atomic_try_cmpxchg +#define atomic_try_cmpxchg atomic_try_cmpxchg +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return arch_atomic_try_cmpxchg(v, old, new); +} +#endif + +#ifdef arch_atomic64_try_cmpxchg +#define atomic64_try_cmpxchg atomic64_try_cmpxchg +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_atomic64_try_cmpxchg(v, old, new); +} +#endif + +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +{ + return __arch_atomic_add_unless(v, a, u); +} + + +static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return arch_atomic64_add_unless(v, a, u); +} + +static __always_inline void atomic_inc(atomic_t *v) +{ + arch_atomic_inc(v); +} + +static __always_inline void atomic64_inc(atomic64_t *v) +{ + arch_atomic64_inc(v); +} + +static __always_inline void atomic_dec(atomic_t *v) +{ + arch_atomic_dec(v); +} + +static __always_inline void atomic64_dec(atomic64_t *v) +{ + arch_atomic64_dec(v); +} + +static __always_inline void atomic_add(int i, atomic_t *v) +{ + arch_atomic_add(i, v); +} + +static __always_inline void atomic64_add(s64 i, atomic64_t *v) +{ + arch_atomic64_add(i, v); +} + +static __always_inline void atomic_sub(int i, atomic_t *v) +{ + arch_atomic_sub(i, v); +} + +static __always_inline void atomic64_sub(s64 i, atomic64_t *v) +{ + arch_atomic64_sub(i, v); +} + +static __always_inline void atomic_and(int i, atomic_t *v) +{ + arch_atomic_and(i, v); +} + +static __always_inline void atomic64_and(s64 i, atomic64_t *v) +{ + arch_atomic64_and(i, v); +} + +static __always_inline void atomic_or(int i, atomic_t *v) +{ + arch_atomic_or(i, v); +} + +static __always_inline void atomic64_or(s64 i, atomic64_t *v) +{ + arch_atomic64_or(i, v); +} + +static __always_inline void atomic_xor(int i, atomic_t *v) +{ + arch_atomic_xor(i, v); +} + +static __always_inline void atomic64_xor(s64 i, atomic64_t *v) +{ + arch_atomic64_xor(i, v); +} + +static __always_inline int atomic_inc_return(atomic_t *v) +{ + return arch_atomic_inc_return(v); +} + +static __always_inline s64 atomic64_inc_return(atomic64_t *v) +{ + return arch_atomic64_inc_return(v); +} + +static __always_inline int atomic_dec_return(atomic_t *v) +{ + return arch_atomic_dec_return(v); +} + +static __always_inline s64 atomic64_dec_return(atomic64_t *v) +{ + return arch_atomic64_dec_return(v); +} + +static __always_inline s64 atomic64_inc_not_zero(atomic64_t *v) +{ + return arch_atomic64_inc_not_zero(v); +} + +static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) +{ + return arch_atomic64_dec_if_positive(v); +} + +static __always_inline bool atomic_dec_and_test(atomic_t *v) +{ + return arch_atomic_dec_and_test(v); +} + +static __always_inline bool atomic64_dec_and_test(atomic64_t *v) +{ + return arch_atomic64_dec_and_test(v); +} + +static __always_inline bool atomic_inc_and_test(atomic_t *v) +{ + 
return arch_atomic_inc_and_test(v); +} + +static __always_inline bool atomic64_inc_and_test(atomic64_t *v) +{ + return arch_atomic64_inc_and_test(v); +} + +static __always_inline int atomic_add_return(int i, atomic_t *v) +{ + return arch_atomic_add_return(i, v); +} + +static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return(i, v); +} + +static __always_inline int atomic_sub_return(int i, atomic_t *v) +{ + return arch_atomic_sub_return(i, v); +} + +static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) +{ + return arch_atomic64_sub_return(i, v); +} + +static __always_inline int atomic_fetch_add(int i, atomic_t *v) +{ + return arch_atomic_fetch_add(i, v); +} + +static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) +{ + return arch_atomic64_fetch_add(i, v); +} + +static __always_inline int atomic_fetch_sub(int i, atomic_t *v) +{ + return arch_atomic_fetch_sub(i, v); +} + +static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) +{ + return arch_atomic64_fetch_sub(i, v); +} + +static __always_inline int atomic_fetch_and(int i, atomic_t *v) +{ + return arch_atomic_fetch_and(i, v); +} + +static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) +{ + return arch_atomic64_fetch_and(i, v); +} + +static __always_inline int atomic_fetch_or(int i, atomic_t *v) +{ + return arch_atomic_fetch_or(i, v); +} + +static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) +{ + return arch_atomic64_fetch_or(i, v); +} + +static __always_inline int atomic_fetch_xor(int i, atomic_t *v) +{ + return arch_atomic_fetch_xor(i, v); +} + +static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) +{ + return arch_atomic64_fetch_xor(i, v); +} + +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) +{ + return arch_atomic_sub_and_test(i, v); +} + +static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) +{ + return arch_atomic64_sub_and_test(i, v); +} + +static __always_inline bool atomic_add_negative(int i, atomic_t *v) +{ + return arch_atomic_add_negative(i, v); +} + +static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_negative(i, v); +} + +#define cmpxchg(ptr, old, new) \ +({ \ + arch_cmpxchg((ptr), (old), (new)); \ +}) + +#define sync_cmpxchg(ptr, old, new) \ +({ \ + arch_sync_cmpxchg((ptr), (old), (new)); \ +}) + +#define cmpxchg_local(ptr, old, new) \ +({ \ + arch_cmpxchg_local((ptr), (old), (new)); \ +}) + +#define cmpxchg64(ptr, old, new) \ +({ \ + arch_cmpxchg64((ptr), (old), (new)); \ +}) + +#define cmpxchg64_local(ptr, old, new) \ +({ \ + arch_cmpxchg64_local((ptr), (old), (new)); \ +}) + +#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +({ \ + arch_cmpxchg_double((p1), (p2), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ +({ \ + arch_cmpxchg_double_local((p1), (p2), (o1), (o2), (n1), (n2)); \ +}) + +#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h deleted file mode 100644 index a2508a8f9a9c..000000000000 --- a/include/asm-generic/siginfo.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_GENERIC_SIGINFO_H -#define _ASM_GENERIC_SIGINFO_H - -#include <uapi/asm-generic/siginfo.h> - -#define __SI_MASK 0xffff0000u -#define __SI_KILL (0 << 16) -#define __SI_TIMER (1 << 16) -#define __SI_POLL (2 << 16) -#define __SI_FAULT (3 << 16) -#define __SI_CHLD (4 << 16) -#define __SI_RT (5 << 16) -#define __SI_MESGQ (6 << 16) 
-#define __SI_SYS (7 << 16) -#define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) - -struct siginfo; -void do_schedule_next_timer(struct siginfo *info); - -extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); - -#endif diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 9862afb3ae05..72e14a0a4700 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -172,8 +172,7 @@ KEEP(*(__##name##_of_table)) \ KEEP(*(__##name##_of_table_end)) -#define CLKSRC_OF_TABLES() OF_TABLE(CONFIG_CLKSRC_OF, clksrc) -#define CLKEVT_OF_TABLES() OF_TABLE(CONFIG_CLKEVT_OF, clkevt) +#define TIMER_OF_TABLES() OF_TABLE(CONFIG_TIMER_OF, timer) #define IRQCHIP_OF_MATCH_TABLE() OF_TABLE(CONFIG_IRQCHIP, irqchip) #define CLK_OF_TABLES() OF_TABLE(CONFIG_COMMON_CLK, clk) #define IOMMU_OF_TABLES() OF_TABLE(CONFIG_OF_IOMMU, iommu) @@ -557,15 +556,15 @@ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ - CLKSRC_OF_TABLES() \ - CLKEVT_OF_TABLES() \ + TIMER_OF_TABLES() \ IOMMU_OF_TABLES() \ CPU_METHOD_OF_TABLES() \ CPUIDLE_METHOD_OF_TABLES() \ KERNEL_DTB() \ IRQCHIP_OF_MATCH_TABLE() \ ACPI_PROBE_TABLE(irqchip) \ - ACPI_PROBE_TABLE(clksrc) \ + ACPI_PROBE_TABLE(timer) \ + ACPI_PROBE_TABLE(iort) \ EARLYCON_TABLE() #define INIT_TEXT \ diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h index 2793652fbf66..a414a2b53e41 100644 --- a/include/linux/bcm47xx_nvram.h +++ b/include/linux/bcm47xx_nvram.h @@ -8,6 +8,7 @@ #ifndef __BCM47XX_NVRAM_H #define __BCM47XX_NVRAM_H +#include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/vmalloc.h> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 23d32ff0b462..14542308d25b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -33,7 +33,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index acc9ce05e5f0..a116926598fd 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -223,13 +223,4 @@ static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ -#define CLOCKEVENT_OF_DECLARE(name, compat, fn) \ - OF_DECLARE_1_RET(clkevt, name, compat, fn) - -#ifdef CONFIG_CLKEVT_PROBE -extern int clockevent_probe(void); -#else -static inline int clockevent_probe(void) { return 0; } -#endif - #endif /* _LINUX_CLOCKCHIPS_H */ diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f2b10d9ebd04..a78cb1848e65 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -96,6 +96,7 @@ struct clocksource { void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); void (*mark_unstable)(struct clocksource *cs); + void (*tick_stable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -249,16 +250,19 @@ extern int clocksource_mmio_init(void __iomem *, const char *, extern int clocksource_i8253_init(void); +#define TIMER_OF_DECLARE(name, compat, fn) \ + OF_DECLARE_1_RET(timer, name, compat, fn) + #define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \ - OF_DECLARE_1_RET(clksrc, name, compat, fn) + TIMER_OF_DECLARE(name, compat, fn) -#ifdef CONFIG_CLKSRC_PROBE -extern void clocksource_probe(void); +#ifdef CONFIG_TIMER_PROBE +extern void timer_probe(void); #else -static inline 
void clocksource_probe(void) {} +static inline void timer_probe(void) {} #endif -#define CLOCKSOURCE_ACPI_DECLARE(name, table_id, fn) \ - ACPI_DECLARE_PROBE_ENTRY(clksrc, name, table_id, 0, NULL, 0, fn) +#define TIMER_ACPI_DECLARE(name, table_id, fn) \ + ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn) #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c5f3152cbb5..425563c7647b 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -94,6 +94,10 @@ struct compat_itimerval { struct compat_timeval it_value; }; +struct itimerval; +int get_compat_itimerval(struct itimerval *, const struct compat_itimerval __user *); +int put_compat_itimerval(struct compat_itimerval __user *, const struct itimerval *); + struct compat_tms { compat_clock_t tms_utime; compat_clock_t tms_stime; @@ -128,6 +132,10 @@ struct compat_timex { compat_int_t:32; compat_int_t:32; compat_int_t:32; }; +struct timex; +int compat_get_timex(struct timex *, const struct compat_timex __user *); +int compat_put_timex(struct compat_timex __user *, const struct timex *); + #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { diff --git a/include/linux/compiler.h b/include/linux/compiler.h index f8110051188f..707242fdbb89 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -17,11 +17,7 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) -#ifdef CONFIG_SPARSE_RCU_POINTER # define __rcu __attribute__((noderef, address_space(4))) -#else /* CONFIG_SPARSE_RCU_POINTER */ -# define __rcu -#endif /* CONFIG_SPARSE_RCU_POINTER */ # define __private __attribute__((noderef)) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f92081234afd..ca73bc1563f4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -99,26 +99,32 @@ static inline void cpu_maps_update_done(void) extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU -/* Stop CPUs going up and down. 
*/ - -extern void cpu_hotplug_begin(void); -extern void cpu_hotplug_done(void); -extern void get_online_cpus(void); -extern void put_online_cpus(void); +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); +extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -#else /* CONFIG_HOTPLUG_CPU */ - -static inline void cpu_hotplug_begin(void) {} -static inline void cpu_hotplug_done(void) {} -#define get_online_cpus() do { } while (0) -#define put_online_cpus() do { } while (0) -#define cpu_hotplug_disable() do { } while (0) -#define cpu_hotplug_enable() do { } while (0) -#endif /* CONFIG_HOTPLUG_CPU */ +#else /* CONFIG_HOTPLUG_CPU */ + +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline void lockdep_assert_cpus_held(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* Wrappers which go away once all code is converted */ +static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } +static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } +static inline void get_online_cpus(void) { cpus_read_lock(); } +static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f2a80377520..df3d2719a796 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, + bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed @@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, return __cpuhp_setup_state(state, name, true, startup, teardown, false); } +static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, true, startup, + teardown, false); +} + /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks @@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, false); } +static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, false, startup, + teardown, false); +} + /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed @@ -217,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); +int 
__cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup @@ -249,7 +274,15 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, return __cpuhp_state_add_instance(state, node, false); } +static inline int +cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance_cpuslocked(state, node, false); +} + void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown @@ -273,6 +306,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) +{ + __cpuhp_remove_state_cpuslocked(state, false); +} + /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2404ad238c0b..4bf4479a3a80 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node); (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) +extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); + +/** + * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask poiter + * @start: the start location + * + * The implementation does not assume any bit in @mask is set (including @start). + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ + (cpu) < nr_cpumask_bits; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) + /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator @@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + + /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) @@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/include/linux/efi.h b/include/linux/efi.h index ec36f42a2add..8269bcb8ccf7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -137,6 +137,18 @@ struct efi_boot_memmap { #define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000 #define EFI_CAPSULE_INITIATE_RESET 0x00040000 +struct capsule_info { + efi_capsule_header_t header; + int reset_type; + long index; + size_t count; + size_t total_size; + phys_addr_t *pages; + size_t page_bytes_remain; +}; + +int __efi_capsule_setup_info(struct capsule_info *cap_info); + /* * Allocation types for calls to boottime->allocate_pages. 
*/ @@ -1403,7 +1415,7 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset); extern int efi_capsule_update(efi_capsule_header_t *capsule, - struct page **pages); + phys_addr_t *pages); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index ff0b981f078e..9e4befd95bc7 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -37,7 +37,7 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); #else /* CONFIG_EVENTFD */ @@ -73,7 +73,7 @@ static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, } static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, - wait_queue_t *wait, __u64 *cnt) + wait_queue_entry_t *wait, __u64 *cnt) { return -ENOSYS; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 8d886e74d9e2..39e4603cd17a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,7 +2,7 @@ #define _LINUX_FS_H #include <linux/linkage.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8c5b10eb7265..255edd5e7a74 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -452,11 +452,11 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, } /* Precise sleep: */ -extern long hrtimer_nanosleep(struct timespec64 *rqtp, - struct timespec __user *rmtp, + +extern int nanosleep_copyout(struct restart_block *, struct timespec *); +extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); -extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); diff --git a/include/linux/irq.h b/include/linux/irq.h index f887351aa80e..d996314b6522 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -22,6 +22,7 @@ #include <linux/topology.h> #include <linux/wait.h> #include <linux/io.h> +#include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> @@ -216,6 +217,7 @@ enum { IRQD_WAKEUP_ARMED = (1 << 19), IRQD_FORWARDED_TO_VCPU = (1 << 20), IRQD_AFFINITY_MANAGED = (1 << 21), + IRQD_IRQ_STARTED = (1 << 22), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -329,6 +331,11 @@ static inline void irqd_clr_activated(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_ACTIVATED; } +static inline bool irqd_is_started(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_IRQ_STARTED; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) @@ -951,6 +958,14 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); +struct irq_chip_generic * +devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, + unsigned int irq_base, void __iomem *reg_base, + irq_flow_handler_t handler); +int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic 
*gc, + u32 msk, enum irq_gc_flags flags, + unsigned int clr, unsigned int set); + struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, @@ -967,6 +982,19 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, handler, clr, set, flags); \ }) +static inline void irq_free_generic_chip(struct irq_chip_generic *gc) +{ + kfree(gc); +} + +static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, + u32 msk, unsigned int clr, + unsigned int set) +{ + irq_remove_generic_chip(gc, msk, clr, set); + irq_free_generic_chip(gc); +} + static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 13bc08aba704..1c91f26e2996 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,9 +490,13 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* Values used for system_state */ +/* + * Values used for system_state. Ordering of the states must not be changed + * as code checks for <, <=, >, >= STATE. + */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_SCHEDULING, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 0c8bd45c8206..04817b1ca019 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -108,11 +108,7 @@ static inline ktime_t timeval_to_ktime(struct timeval tv) */ static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) { - if (cmp1 < cmp2) - return -1; - if (cmp1 > cmp2) - return 1; - return 0; + return ktime_sub(cmp1, cmp2); } /** diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 0c1de05098c8..76c2fbc59f35 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -46,7 +46,7 @@ struct kvm_kernel_irqfd_resampler { struct kvm_kernel_irqfd { /* Used for MSI fast-path */ struct kvm *kvm; - wait_queue_t wait; + wait_queue_entry_t wait; /* Update side is protected by irqfds.lock */ struct kvm_kernel_irq_routing_entry irq_entry; seqcount_t irq_entry_sc; diff --git a/include/linux/llist.h b/include/linux/llist.h index 171baa90f6f6..d11738110a7a 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list) for ((pos) = (node); pos; (pos) = (pos)->next) /** + * llist_for_each_safe - iterate over some deleted entries of a lock-less list + * safe against removal of list entry + * @pos: the &struct llist_node to use as a loop cursor + * @n: another &struct llist_node to use as temporary storage + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + +/** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. 
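
As an aside, the llist_for_each_safe() iterator added in the llist.h hunk above exists precisely so that a consumer can free entries while walking a detached batch. A minimal sketch of that pattern, assuming a hypothetical producer/consumer around a made-up "struct foo" (foo, foo_produce() and foo_consume_all() are illustrative names only, not part of this patch):

#include <linux/llist.h>
#include <linux/slab.h>

struct foo {
	int payload;
	struct llist_node llnode;
};

static LLIST_HEAD(foo_list);

/* Lock-less push; safe from any context. */
static void foo_produce(int payload)
{
	struct foo *f = kmalloc(sizeof(*f), GFP_ATOMIC);

	if (!f)
		return;
	f->payload = payload;
	llist_add(&f->llnode, &foo_list);
}

/* Detach the whole list atomically and free every node while traversing. */
static void foo_consume_all(void)
{
	struct llist_node *pos, *n;
	struct llist_node *batch = llist_del_all(&foo_list);

	/* @n carries the next pointer across kfree(), which is what makes this safe. */
	llist_for_each_safe(pos, n, batch) {
		struct foo *f = llist_entry(pos, struct foo, llnode);

		kfree(f);
	}
}

Traversal order is newest-first, exactly as the new kernel-doc comment notes for the non-safe variant.
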
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 136dfdf63ba1..fc412fbd80bd 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -14,6 +14,10 @@ #include <asm/page.h> +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +#include <asm/tlbbatch.h> +#endif + #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) @@ -67,12 +71,15 @@ struct page_frag { struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* - * Each bit set is a CPU that potentially has a TLB entry for one of - * the PFNs being flushed. See set_tlb_ubc_flush_pending(). + * The arch code makes the following promise: generic code can modify a + * PTE, then call arch_tlbbatch_add_mm() (which internally provides all + * needed barriers), then call arch_tlbbatch_flush(), and the entries + * will be flushed on all CPUs by the time that arch_tlbbatch_flush() + * returns. */ - struct cpumask cpumask; + struct arch_tlbflush_unmap_batch arch; - /* True if any bit in cpumask is set */ + /* True if a flush is needed. */ bool flush_required; /* diff --git a/include/linux/padata.h b/include/linux/padata.h index 0f9e567d5e15..2f9c1f93b1ce 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -166,9 +166,6 @@ struct padata_instance { extern struct padata_instance *padata_alloc_possible( struct workqueue_struct *wq); -extern struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 28acc94e0f81..baa9344dcd10 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -543,7 +543,7 @@ void page_endio(struct page *page, bool is_write, int err); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); +extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. 
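
Most hunks in this section are instances of one mechanical conversion: wait_queue_t becomes wait_queue_entry_t, the embedded list field task_list becomes entry, and helpers such as __add_wait_queue_tail() become __add_wait_queue_entry_tail(), while wait_queue_head_t keeps its name. A minimal sketch of what a converted waiter looks like after this series; struct foo_request, foo_wake_fn() and foo_submit() are hypothetical names used only to show the renamed types, fields and helpers:

#include <linux/wait.h>
#include <linux/kernel.h>

struct foo_request {
	wait_queue_entry_t wait;	/* was: wait_queue_t */
	bool done;
};

/* Custom wake callback; invoked with the waitqueue lock held by the waker. */
static int foo_wake_fn(wait_queue_entry_t *wait, unsigned int mode,
		       int sync, void *key)
{
	struct foo_request *req = container_of(wait, struct foo_request, wait);

	req->done = true;
	/* Unlink via ->entry, the field formerly named ->task_list. */
	list_del_init(&wait->entry);
	return 1;
}

/* Queue the request on a wait queue head owned by some other subsystem. */
static void foo_submit(struct foo_request *req, wait_queue_head_t *wq)
{
	req->done = false;
	init_waitqueue_func_entry(&req->wait, foo_wake_fn);

	spin_lock_irq(&wq->lock);
	/* Renamed from __add_wait_queue_tail(). */
	__add_wait_queue_entry_tail(wq, &req->wait);
	spin_unlock_irq(&wq->lock);
}

The matching head-side rename, wait_queue_head_t's task_list list becoming head, is what drives the userfaultfd and nilfs2 hunks above that now iterate with list_for_each_entry(wq, &wqh->head, entry).
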
diff --git a/include/linux/pci.h b/include/linux/pci.h index f6302ea2dee1..fa48e20273eb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -378,6 +378,7 @@ struct pci_dev { unsigned int irq_managed:1; unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* broken BARs; ignore them */ + unsigned int is_probed:1; /* device probing in progress */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8fc5f0fada5e..a3b873fc59e4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,8 @@ struct perf_cpu_context { struct list_head sched_cb_entry; int sched_cb_usage; + + int online; }; struct perf_output_handle { diff --git a/include/linux/poll.h b/include/linux/poll.h index 75ffc5729e4c..2889f09a1c60 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -75,7 +75,7 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) struct poll_table_entry { struct file *filp; unsigned long key; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index 83b22ae9ae12..38d8225510f1 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -42,12 +42,6 @@ struct posix_clock; * @clock_gettime: Read the current time * @clock_getres: Get the clock resolution * @clock_settime: Set the current time value - * @timer_create: Create a new timer - * @timer_delete: Remove a previously created timer - * @timer_gettime: Get remaining time and interval of a timer - * @timer_settime: Set a timer's initial expiration and interval - * @fasync: Optional character device fasync method - * @mmap: Optional character device mmap method * @open: Optional character device open method * @release: Optional character device release method * @ioctl: Optional character device ioctl method @@ -66,28 +60,12 @@ struct posix_clock_operations { int (*clock_settime)(struct posix_clock *pc, const struct timespec64 *ts); - int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); - - int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); - - void (*timer_gettime)(struct posix_clock *pc, - struct k_itimer *kit, struct itimerspec64 *tsp); - - int (*timer_settime)(struct posix_clock *pc, - struct k_itimer *kit, int flags, - struct itimerspec64 *tsp, struct itimerspec64 *old); /* * Optional character device methods: */ - int (*fasync) (struct posix_clock *pc, - int fd, struct file *file, int on); - long (*ioctl) (struct posix_clock *pc, unsigned int cmd, unsigned long arg); - int (*mmap) (struct posix_clock *pc, - struct vm_area_struct *vma); - int (*open) (struct posix_clock *pc, fmode_t f_mode); uint (*poll) (struct posix_clock *pc, diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8c1e43ab14a9..29f1b7f09ced 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -7,6 +7,7 @@ #include <linux/timex.h> #include <linux/alarmtimer.h> +struct siginfo; struct cpu_timer_list { struct list_head entry; @@ -48,81 +49,69 @@ struct cpu_timer_list { #define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD) #define CLOCKID_TO_FD(clk) ((unsigned int) ~((clk) >> 3)) -/* POSIX.1b interval timer structure. 
*/ -struct k_itimer { - struct list_head list; /* free/ allocate list */ - struct hlist_node t_hash; - spinlock_t it_lock; - clockid_t it_clock; /* which timer type */ - timer_t it_id; /* timer id */ - int it_overrun; /* overrun on pending signal */ - int it_overrun_last; /* overrun on last delivered signal */ - int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 - int it_sigev_notify; /* notify word of sigevent struct */ - struct signal_struct *it_signal; + +/** + * struct k_itimer - POSIX.1b interval timer structure. + * @list: List head for binding the timer to signals->posix_timers + * @t_hash: Entry in the posix timer hash table + * @it_lock: Lock protecting the timer + * @kclock: Pointer to the k_clock struct handling this timer + * @it_clock: The posix timer clock id + * @it_id: The posix timer id for identifying the timer + * @it_active: Marker that timer is active + * @it_overrun: The overrun counter for pending signals + * @it_overrun_last: The overrun at the time of the last delivered signal + * @it_requeue_pending: Indicator that timer waits for being requeued on + * signal delivery + * @it_sigev_notify: The notify word of sigevent struct for signal delivery + * @it_interval: The interval for periodic timers + * @it_signal: Pointer to the creators signal struct + * @it_pid: The pid of the process/task targeted by the signal + * @it_process: The task to wakeup on clock_nanosleep (CPU timers) + * @sigq: Pointer to preallocated sigqueue + * @it: Union representing the various posix timer type + * internals. Also used for rcu freeing the timer. + */ +struct k_itimer { + struct list_head list; + struct hlist_node t_hash; + spinlock_t it_lock; + const struct k_clock *kclock; + clockid_t it_clock; + timer_t it_id; + int it_active; + int it_overrun; + int it_overrun_last; + int it_requeue_pending; + int it_sigev_notify; + ktime_t it_interval; + struct signal_struct *it_signal; union { - struct pid *it_pid; /* pid of process to send signal to */ - struct task_struct *it_process; /* for clock_nanosleep */ + struct pid *it_pid; + struct task_struct *it_process; }; - struct sigqueue *sigq; /* signal queue entry. 
*/ + struct sigqueue *sigq; union { struct { - struct hrtimer timer; - ktime_t interval; + struct hrtimer timer; } real; - struct cpu_timer_list cpu; + struct cpu_timer_list cpu; struct { - unsigned int clock; - unsigned int node; - unsigned long incr; - unsigned long expires; - } mmtimer; - struct { - struct alarm alarmtimer; - ktime_t interval; + struct alarm alarmtimer; } alarm; - struct rcu_head rcu; + struct rcu_head rcu; } it; }; -struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_set) (const clockid_t which_clock, - const struct timespec64 *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj) (const clockid_t which_clock, struct timex *tx); - int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); - long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting); - int (*timer_del) (struct k_itimer *timr); -#define TIMER_RETRY 1 - void (*timer_get) (struct k_itimer *timr, - struct itimerspec64 *cur_setting); -}; - -extern struct k_clock clock_posix_cpu; -extern struct k_clock clock_posix_dynamic; - -void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); - -/* function to call to trigger timer event */ -int posix_timer_event(struct k_itimer *timr, int si_private); - -void posix_cpu_timer_schedule(struct k_itimer *timer); - void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); -long clock_nanosleep_restart(struct restart_block *restart_block); - void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); +void posixtimer_rearm(struct siginfo *info); + #endif diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index 4b766b61e1a0..426cee67f0e2 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -7,6 +7,10 @@ * unlimited scalability while maintaining a constant level of contention * on the root node. * + * This seemingly RCU-private file must be available to SRCU users + * because the size of the TREE SRCU srcu_struct structure depends + * on these definitions. + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ba4d2621d9ca..c3ad00e63556 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -1,6 +1,10 @@ /* * RCU segmented callback lists * + * This seemingly RCU-private file must be available to SRCU users + * because the size of the TREE SRCU srcu_struct structure depends + * on these definitions. 
+ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e1e5d002fdb9..f816fc72b51e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -34,104 +34,15 @@ #define __LINUX_RCUPDATE_H #include <linux/types.h> -#include <linux/cache.h> -#include <linux/spinlock.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/seqlock.h> -#include <linux/lockdep.h> -#include <linux/debugobjects.h> -#include <linux/bug.h> #include <linux/compiler.h> -#include <linux/ktime.h> +#include <linux/atomic.h> #include <linux/irqflags.h> +#include <linux/preempt.h> +#include <linux/bottom_half.h> +#include <linux/lockdep.h> +#include <asm/processor.h> +#include <linux/cpumask.h> -#include <asm/barrier.h> - -#ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* for sysctl */ -extern int rcu_normal; /* also for sysctl */ -#endif /* #ifndef CONFIG_TINY_RCU */ - -#ifdef CONFIG_TINY_RCU -/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} -#else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_gp_is_normal(void); /* Internal RCU use. */ -bool rcu_gp_is_expedited(void); /* Internal RCU use. */ -void rcu_expedite_gp(void); -void rcu_unexpedite_gp(void); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - -enum rcutorture_type { - RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, - RCU_TASKS_FLAVOR, - SRCU_FLAVOR, - INVALID_RCU_FLAVOR -}; - -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); -void rcutorture_record_test_transition(void); -void rcutorture_record_progress(unsigned long vernum); -void do_trace_rcu_torture_read(const char *rcutorturename, - struct rcu_head *rhp, - unsigned long secs, - unsigned long c_old, - unsigned long c); -bool rcu_irq_enter_disabled(void); -#else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - *flags = 0; - *gpnum = 0; - *completed = 0; -} -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} -static inline bool rcu_irq_enter_disabled(void) -{ - return false; -} -#ifdef CONFIG_RCU_TRACE -void do_trace_rcu_torture_read(const char *rcutorturename, - struct rcu_head *rhp, - unsigned long secs, - unsigned long c_old, - unsigned long c); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ - do { } while (0) -#endif -#endif - -#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) -#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) @@ -139,115 +50,14 @@ void do_trace_rcu_torture_read(const char *rcutorturename, /* Exported common interfaces */ #ifdef CONFIG_PREEMPT_RCU - -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. 
- * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - */ -void call_rcu(struct rcu_head *head, - rcu_callback_t func); - +void call_rcu(struct rcu_head *head, rcu_callback_t func); #else /* #ifdef CONFIG_PREEMPT_RCU */ - -/* In classic RCU, call_rcu() is just call_rcu_sched(). */ #define call_rcu call_rcu_sched - #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -/** - * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. - * OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_bh(struct rcu_head *head, - rcu_callback_t func); - -/** - * call_rcu_sched() - Queue an RCU for invocation after sched grace period. - * @head: structure to be used for queueing the RCU updates. 
- * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_sched() assumes - * that the read-side critical sections end on enabling of preemption - * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), - * OR - * anything that disables preemption. - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_sched(struct rcu_head *head, - rcu_callback_t func); - +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); void synchronize_sched(void); - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); @@ -301,22 +111,12 @@ void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); -#ifndef CONFIG_TINY_RCU -void rcu_end_inkernel_boot(void); -#else /* #ifndef CONFIG_TINY_RCU */ -static inline void rcu_end_inkernel_boot(void) { } -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ -static inline void rcu_sysrq_start(void) -{ -} -static inline void rcu_sysrq_end(void) -{ -} +static inline void rcu_sysrq_start(void) { } +static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_NO_HZ_FULL @@ -330,9 +130,7 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_init_nohz(void) -{ -} +static inline void rcu_init_nohz(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** @@ -397,10 +195,6 @@ do { \ rcu_note_voluntary_context_switch(current); \ } while (0) -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) -bool __rcu_is_watching(void); -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ - /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. 
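The call_rcu() family declared above keeps the semantics spelled out in the kernel-doc being trimmed here: the callback runs only after all pre-existing RCU read-side critical sections have completed. A minimal, hypothetical usage sketch (struct foo, foo_lock and the helper functions are illustrative names, not part of this patch):

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct foo {
            int value;
            struct rcu_head rcu;            /* storage used by call_rcu() */
    };

    static struct foo __rcu *global_foo;
    static DEFINE_SPINLOCK(foo_lock);       /* serializes updaters */

    static void foo_reclaim(struct rcu_head *head)
    {
            /* Runs after a grace period; no reader can still see this foo. */
            kfree(container_of(head, struct foo, rcu));
    }

    static void foo_replace(struct foo *newp)
    {
            struct foo *old;

            spin_lock(&foo_lock);
            old = rcu_dereference_protected(global_foo,
                                            lockdep_is_held(&foo_lock));
            rcu_assign_pointer(global_foo, newp);
            spin_unlock(&foo_lock);
            if (old)
                    call_rcu(&old->rcu, foo_reclaim);   /* deferred free */
    }

For the common case where the callback would only kfree() the object, the kfree_rcu(old, rcu) shorthand kept later in this header avoids writing the callback at all.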
@@ -414,10 +208,6 @@ bool __rcu_is_watching(void); #error "Unknown RCU implementation specified to kernel configuration" #endif -#define RCU_SCHEDULER_INACTIVE 0 -#define RCU_SCHEDULER_INIT 1 -#define RCU_SCHEDULER_RUNNING 2 - /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures @@ -430,30 +220,16 @@ void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -static inline void init_rcu_head(struct rcu_head *head) -{ -} - -static inline void destroy_rcu_head(struct rcu_head *head) -{ -} - -static inline void init_rcu_head_on_stack(struct rcu_head *head) -{ -} - -static inline void destroy_rcu_head_on_stack(struct rcu_head *head) -{ -} +static inline void init_rcu_head(struct rcu_head *head) { } +static inline void destroy_rcu_head(struct rcu_head *head) { } +static inline void init_rcu_head_on_stack(struct rcu_head *head) { } +static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ -static inline bool rcu_lockdep_current_cpu_online(void) -{ - return true; -} +static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -473,18 +249,8 @@ extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); - int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); - -/** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? - * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an - * RCU-sched read-side critical section. In absence of - * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side - * critical section unless it can prove otherwise. - */ int rcu_read_lock_sched_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -531,9 +297,7 @@ static inline void rcu_preempt_sleep_check(void) "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ -static inline void rcu_preempt_sleep_check(void) -{ -} +static inline void rcu_preempt_sleep_check(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ #define rcu_sleep_check() \ @@ -1084,52 +848,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -#ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return 0; -} -#endif /* #ifdef CONFIG_TINY_RCU */ - -#if defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline bool rcu_is_nocb_cpu(int cpu) { return true; } -#elif defined(CONFIG_RCU_NOCB_CPU) -bool rcu_is_nocb_cpu(int cpu); -#else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } -#endif - - -/* Only for use by adaptive-ticks code. 
*/ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE -bool rcu_sys_is_idle(void); -void rcu_sysidle_force_exit(void); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static inline bool rcu_sys_is_idle(void) -{ - return false; -} - -static inline void rcu_sysidle_force_exit(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - - -/* - * Dump the ftrace buffer, but only one time per callsite per boot. - */ -#define rcu_ftrace_dump(oops_dump_mode) \ -do { \ - static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ - \ - if (!atomic_read(&___rfd_beenhere) && \ - !atomic_xchg(&___rfd_beenhere, 1)) \ - ftrace_dump(oops_dump_mode); \ -} while (0) /* * Place this after a lock-acquisition primitive to guarantee that diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 74d9c3a1feee..5becbbccb998 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -25,7 +25,7 @@ #ifndef __LINUX_TINY_H #define __LINUX_TINY_H -#include <linux/cache.h> +#include <linux/ktime.h> struct rcu_dynticks; static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) @@ -33,10 +33,8 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } -static inline bool rcu_eqs_special_set(int cpu) -{ - return false; /* Never flag non-existent other CPUs! */ -} +/* Never flag non-existent other CPUs! */ +static inline bool rcu_eqs_special_set(int cpu) { return false; } static inline unsigned long get_state_synchronize_rcu(void) { @@ -98,159 +96,38 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_note_voluntary_context_switch_lite(current); \ } while (0) -/* - * Take advantage of the fact that there is only one CPU, which - * allows us to ignore virtualization-based context switches. - */ -static inline void rcu_virt_note_context_switch(int cpu) -{ -} - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) { + *nextevt = KTIME_MAX; return 0; } /* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. + * Take advantage of the fact that there is only one CPU, which + * allows us to ignore virtualization-based context switches. 
*/ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - -static inline void rcu_cpu_stall_reset(void) -{ -} - -static inline void rcu_idle_enter(void) -{ -} - -static inline void rcu_idle_exit(void) -{ -} - -static inline void rcu_irq_enter(void) -{ -} - -static inline void rcu_irq_exit_irqson(void) -{ -} - -static inline void rcu_irq_enter_irqson(void) -{ -} - -static inline void rcu_irq_exit(void) -{ -} - -static inline void exit_rcu(void) -{ -} +static inline void rcu_virt_note_context_switch(int cpu) { } +static inline void rcu_cpu_stall_reset(void) { } +static inline void rcu_idle_enter(void) { } +static inline void rcu_idle_exit(void) { } +static inline void rcu_irq_enter(void) { } +static inline bool rcu_irq_enter_disabled(void) { return false; } +static inline void rcu_irq_exit_irqson(void) { } +static inline void rcu_irq_enter_irqson(void) { } +static inline void rcu_irq_exit(void) { } +static inline void exit_rcu(void) { } #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); #else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ -static inline void rcu_scheduler_starting(void) -{ -} +static inline void rcu_scheduler_starting(void) { } #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ +static inline void rcu_end_inkernel_boot(void) { } +static inline bool rcu_is_watching(void) { return true; } -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -static inline bool rcu_is_watching(void) -{ - return __rcu_is_watching(); -} - -#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - -static inline bool rcu_is_watching(void) -{ - return true; -} - -#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - -static inline void rcu_request_urgent_qs_task(struct task_struct *t) -{ -} - -static inline void rcu_all_qs(void) -{ - barrier(); /* Avoid RCU read-side critical sections leaking across. */ -} +/* Avoid RCU read-side critical sections leaking across. 
*/ +static inline void rcu_all_qs(void) { barrier(); } /* RCUtree hotplug events */ #define rcutree_prepare_cpu NULL diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 0bacb6b2af69..37d6fd3b7ff8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -79,37 +79,20 @@ void cond_synchronize_rcu(unsigned long oldstate); unsigned long get_state_synchronize_sched(void); void cond_synchronize_sched(unsigned long oldstate); -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); -unsigned long rcu_exp_batches_completed(void); -unsigned long rcu_exp_batches_completed_sched(void); -void show_rcu_gp_kthreads(void); - -void rcu_force_quiescent_state(void); -void rcu_bh_force_quiescent_state(void); -void rcu_sched_force_quiescent_state(void); - void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); +bool rcu_irq_enter_disabled(void); void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; - +void rcu_end_inkernel_boot(void); bool rcu_is_watching(void); -void rcu_request_urgent_qs_task(struct task_struct *t); - void rcu_all_qs(void); /* RCUtree hotplug events */ diff --git a/include/linux/refcount.h b/include/linux/refcount.h index b34aa649d204..bb71f2871dac 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -41,6 +41,7 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } +#ifdef CONFIG_REFCOUNT_FULL extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r); extern void refcount_add(unsigned int i, refcount_t *r); @@ -52,6 +53,47 @@ extern void refcount_sub(unsigned int i, refcount_t *r); extern __must_check bool refcount_dec_and_test(refcount_t *r); extern void refcount_dec(refcount_t *r); +#else +static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + return atomic_add_unless(&r->refs, i, 0); +} + +static inline void refcount_add(unsigned int i, refcount_t *r) +{ + atomic_add(i, &r->refs); +} + +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return atomic_add_unless(&r->refs, 1, 0); +} + +static inline void refcount_inc(refcount_t *r) +{ + atomic_inc(&r->refs); +} + +static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + return atomic_sub_and_test(i, &r->refs); +} + +static inline void refcount_sub(unsigned int i, refcount_t *r) +{ + atomic_sub(i, &r->refs); +} + +static inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + return atomic_dec_and_test(&r->refs); +} + +static inline void refcount_dec(refcount_t *r) +{ + atomic_dec(&r->refs); +} +#endif /* CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 0d905d8ec553..19df8422606c 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -11,6 +11,14 @@ struct timespec; struct compat_timespec; struct pollfd; +enum timespec_type { + TT_NONE = 0, + TT_NATIVE = 1, +#ifdef 
CONFIG_COMPAT + TT_COMPAT = 2, +#endif +}; + /* * System call restart block. */ @@ -29,10 +37,13 @@ struct restart_block { /* For nanosleep */ struct { clockid_t clockid; - struct timespec __user *rmtp; + enum timespec_type type; + union { + struct timespec __user *rmtp; #ifdef CONFIG_COMPAT - struct compat_timespec __user *compat_rmtp; + struct compat_timespec __user *compat_rmtp; #endif + }; u64 expires; } nanosleep; /* For poll */ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1abba5ce2a2f..44fd002f7cd5 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -37,6 +37,9 @@ struct rt_mutex { int line; void *magic; #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; struct rt_mutex_waiter; @@ -58,19 +61,33 @@ struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ , .name = #mutexname, .file = __FILE__, .line = __LINE__ -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__) + +# define rt_mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + __rt_mutex_init(mutex, __func__, &__key); \ +} while (0) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) # define rt_mutex_debug_task_free(t) do { } while (0) #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ + , .dep_map = { .name = #mutexname } +#else +#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) +#endif + #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .waiters = RB_ROOT \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) @@ -86,7 +103,7 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock) return lock->owner != NULL; } -extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..9c4ca7433d9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -421,7 +421,8 @@ struct sched_dl_entity { u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ - u64 dl_bw; /* dl_runtime / dl_deadline */ + u64 dl_bw; /* dl_runtime / dl_period */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, @@ -445,16 +446,33 @@ struct sched_dl_entity { * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. + * + * @dl_non_contending tells if the task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. 
*/ int dl_throttled; int dl_boosted; int dl_yielded; + int dl_non_contending; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; + + /* + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. + */ + struct hrtimer inactive_timer; }; union rcu_special { @@ -1096,8 +1114,6 @@ static inline struct pid *task_session(struct task_struct *task) * current. * task_xid_nr_ns() : id seen from the ns specified; * - * set_task_vxid() : assigns a virtual id to a task; - * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); @@ -1265,6 +1281,16 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +static inline bool is_percpu_thread(void) +{ +#ifdef CONFIG_SMP + return (current->flags & PF_NO_SETAFFINITY) && + (current->nr_cpus_allowed == 1); +#else + return true; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 34fe92ce1ebd..a55600ffdf4b 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init_late(void) -{ -} - static inline void sched_clock_tick(void) { } @@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void) { } -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +static inline void sched_clock_idle_wakeup_event(void) { } @@ -53,7 +49,6 @@ static inline u64 local_clock(void) return sched_clock(); } #else -extern void sched_clock_init_late(void); extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); @@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void); */ extern u64 __sched_clock_offset; - extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); +extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 4995b717500b..7d3f75db23e5 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -23,11 +23,11 @@ static inline void set_cpu_sd_state_idle(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON -void calc_load_enter_idle(void); -void calc_load_exit_idle(void); +void calc_load_nohz_start(void); +void calc_load_nohz_stop(void); #else -static inline void calc_load_enter_idle(void) { } -static inline void calc_load_exit_idle(void) { } +static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a978d7189cfd..f0f065c5afcf 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ 
-95,8 +95,6 @@ static inline void put_task_struct(struct task_struct *t) } struct task_struct *task_rcu_dereference(struct task_struct **ptask); -struct task_struct *try_get_task_struct(struct task_struct **ptask); - #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/include/linux/signal.h b/include/linux/signal.h index 1f5a16620693..a39feddd71ba 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -3,16 +3,13 @@ #include <linux/bug.h> #include <linux/signal_types.h> +#include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; -#ifndef HAVE_ARCH_COPY_SIGINFO - -#include <linux/string.h> - static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) { if (from->si_code < 0) @@ -22,7 +19,7 @@ static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); } -#endif +int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); /* * Define some primitives to manipulate sigset_t. diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 59248dcc6ef3..d9510e8522d4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -369,6 +369,26 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) +/** + * spin_unlock_wait - Interpose between successive critical sections + * @lock: the spinlock whose critical sections are to be interposed. + * + * Semantically this is equivalent to a spin_lock() immediately + * followed by a spin_unlock(). However, most architectures have + * more efficient implementations in which the spin_unlock_wait() + * cannot block concurrent lock acquisition, and in some cases + * where spin_unlock_wait() does not write to the lock variable. + * Nevertheless, spin_unlock_wait() can have high overhead, so if + * you feel the need to use it, please check to see if there is + * a better way to get your job done. + * + * The ordering guarantees provided by spin_unlock_wait() are: + * + * 1. All accesses preceding the spin_unlock_wait() happen before + * any accesses in later critical sections for this same lock. + * 2. All accesses following the spin_unlock_wait() happen after + * any accesses in earlier critical sections for this same lock. + */ static __always_inline void spin_unlock_wait(spinlock_t *lock) { raw_spin_unlock_wait(&lock->rlock); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4c1d5f7e62c4..39af9bc0f653 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,32 +60,15 @@ int init_srcu_struct(struct srcu_struct *sp); #include <linux/srcutiny.h> #elif defined(CONFIG_TREE_SRCU) #include <linux/srcutree.h> -#elif defined(CONFIG_CLASSIC_SRCU) -#include <linux/srcuclassic.h> -#else +#elif defined(CONFIG_SRCU) #error "Unknown SRCU implementation specified to kernel configuration" +#else +/* Dummy definition for things like notifiers. Actual use gets link error. */ +struct srcu_struct { }; #endif -/** - * call_srcu() - Queue a callback for invocation after an SRCU grace period - * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. - * @func: function to be invoked after the SRCU grace period - * - * The callback function will be invoked some time after a full SRCU - * grace period elapses, in other words after all pre-existing SRCU - * read-side critical sections have completed. 
However, the callback - * function might well execute concurrently with other SRCU read-side - * critical sections that started after call_srcu() was invoked. SRCU - * read-side critical sections are delimited by srcu_read_lock() and - * srcu_read_unlock(), and may be nested. - * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. - */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)); - void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h deleted file mode 100644 index 5753f7322262..000000000000 --- a/include/linux/srcuclassic.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion, - * classic v4.11 variant. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2017 - * - * Author: Paul McKenney <paulmck@us.ibm.com> - */ - -#ifndef _LINUX_SRCU_CLASSIC_H -#define _LINUX_SRCU_CLASSIC_H - -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - -struct srcu_struct { - unsigned long completed; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. 
If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) - -void synchronize_srcu_expedited(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); - -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->completed; - *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) - (*gpnum)++; -} - -#endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 42311ee0334f..cfbfc540cafc 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -27,15 +27,14 @@ #include <linux/swait.h> struct srcu_struct { - int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + short srcu_idx; /* Current reader array element. */ + u8 srcu_gp_running; /* GP workqueue running? */ + u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; /* Last srcu_read_unlock() wakes GP. */ - unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ - struct rcu_segcblist srcu_cblist; - /* Pending SRCU callbacks. */ - int srcu_idx; /* Current reader array element. */ - bool srcu_gp_running; /* GP workqueue running? */ - bool srcu_gp_waiting; /* GP waiting for readers? */ + struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ + struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */ struct work_struct srcu_work; /* For driving grace periods. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -47,7 +46,7 @@ void srcu_drive_gp(struct work_struct *wp); #define __SRCU_STRUCT_INIT(name) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_cb_tail = &name.srcu_cb_head, \ .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ __SRCU_DEP_MAP_INIT(name) \ } @@ -63,31 +62,29 @@ void srcu_drive_gp(struct work_struct *wp); void synchronize_srcu(struct srcu_struct *sp); -static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Can be invoked from irq/bh handlers, but the matching + * __srcu_read_unlock() must be in the same handler instance. Returns an + * index that must be passed to the matching srcu_read_unlock(). 
+ */ +static inline int __srcu_read_lock(struct srcu_struct *sp) { - synchronize_srcu(sp); -} + int idx; -static inline void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; } -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) { - return 0; + synchronize_srcu(sp); } -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) +static inline void srcu_barrier(struct srcu_struct *sp) { - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->srcu_gp_seq; - *gpnum = *completed; + synchronize_srcu(sp); } #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 32e86d85fd11..42973f787e7e 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -40,7 +40,7 @@ struct srcu_data { unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ /* Update-side state. */ - spinlock_t lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -58,7 +58,7 @@ struct srcu_data { * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { - spinlock_t lock; + raw_spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ @@ -78,7 +78,7 @@ struct srcu_struct { struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t gp_lock; /* protect ->srcu_cblist */ + raw_spinlock_t __private lock; /* Protect counters */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -109,7 +109,7 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ - .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } @@ -141,10 +141,5 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); - -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); #endif diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3cc9632dcc2a..3d60275e3ba9 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to - * grabbing every spinlock in the kernel. */ + * grabbing every spinlock in the kernel. + * + * Protects against CPU hotplug. 
+ */ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); +/** + * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * Same as above. Must be called from within a cpus_read_lock() protected + * region. Avoids nested calls to cpus_read_lock(). + */ +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); + int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -static inline int stop_machine(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { unsigned long flags; int ret; @@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data, return ret; } +static inline int stop_machine(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) +{ + return stop_machine_cpuslocked(fn, data, cpus); +} + static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ed60253abd0a..50a99a117da7 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -13,7 +13,7 @@ #include <linux/ktime.h> #include <linux/sunrpc/types.h> #include <linux/spinlock.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/workqueue.h> #include <linux/sunrpc/xdr.h> diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index f7043ccca81c..0a0a53daf2a2 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -51,7 +51,7 @@ struct tk_read_base { * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_time: Monotonic raw base time in timespec64 format + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval.
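As a rough illustration of the new stop_machine_cpuslocked() variant documented above: a caller that already holds the CPU-hotplug read lock passes its callback directly, avoiding the nested cpus_read_lock() that plain stop_machine() would otherwise take. The callback and helper below are made-up names, not part of this patch:

    #include <linux/compiler.h>
    #include <linux/cpu.h>
    #include <linux/stop_machine.h>

    static int set_flag_cb(void *data)
    {
            /* Every other online CPU is spinning in its stopper thread here. */
            WRITE_ONCE(*(int *)data, 1);
            return 0;
    }

    static int set_flag_stopped(int *flag)
    {
            int ret;

            cpus_read_lock();       /* take the hotplug read lock once */
            /* ... other work that needs hotplug held ... */
            ret = stop_machine_cpuslocked(set_flag_cb, flag, NULL); /* NULL: any online CPU */
            cpus_read_unlock();
            return ret;
    }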
@@ -93,7 +93,7 @@ struct timekeeper { unsigned int clock_was_set_seq; u8 cs_was_changed_seq; ktime_t next_leap_ktime; - struct timespec64 raw_time; + u64 raw_sec; /* The following members are for timekeeping internal use */ u64 cycle_interval; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index edf9b2cad277..f57076b958b7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -183,7 +183,7 @@ struct virqfd { void (*thread)(void *, void *); void *data; struct work_struct inject; - wait_queue_t wait; + wait_queue_entry_t wait; poll_table pt; struct work_struct shutdown; struct virqfd **pvirqfd; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d84ae90ccd5c..be3ab2d13adf 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif #endif #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ -#endif /* CONFIG_SMP */ NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ diff --git a/include/linux/wait.h b/include/linux/wait.h index db076ca7f11d..b289c96151ee 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -10,38 +10,30 @@ #include <asm/current.h> #include <uapi/linux/wait.h> -typedef struct __wait_queue wait_queue_t; -typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); -int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +typedef struct wait_queue_entry wait_queue_entry_t; -/* __wait_queue::flags */ +typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); +int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); + +/* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 -struct __wait_queue { +/* + * A single wait-queue entry structure: + */ +struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; - struct list_head task_list; -}; - -struct wait_bit_key { - void *flags; - int bit_nr; -#define WAIT_ATOMIC_T_BIT_NR -1 - unsigned long timeout; + struct list_head entry; }; -struct wait_bit_queue { - struct wait_bit_key key; - wait_queue_t wait; -}; - -struct __wait_queue_head { +struct wait_queue_head { spinlock_t lock; - struct list_head task_list; + struct list_head head; }; -typedef struct __wait_queue_head wait_queue_head_t; +typedef struct wait_queue_head wait_queue_head_t; struct task_struct; @@ -49,82 +41,76 @@ struct task_struct; * Macros for declaration and initialisaton of the datatypes */ -#define __WAITQUEUE_INITIALIZER(name, tsk) { \ - .private = tsk, \ - .func = default_wake_function, \ - .task_list = { NULL, NULL } } +#define __WAITQUEUE_INITIALIZER(name, tsk) { \ + .private = tsk, \ + .func = default_wake_function, \ + .entry = { NULL, NULL } } -#define DECLARE_WAITQUEUE(name, tsk) \ - wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk) +#define DECLARE_WAITQUEUE(name, tsk) \ + struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) -#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .task_list = { &(name).task_list, &(name).task_list } } +#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .head = { &(name).head, &(name).head } } #define 
DECLARE_WAIT_QUEUE_HEAD(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) - -#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ - { .flags = word, .bit_nr = bit, } + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) -#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ - { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } +extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); -extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *); - -#define init_waitqueue_head(q) \ - do { \ - static struct lock_class_key __key; \ - \ - __init_waitqueue_head((q), #q, &__key); \ +#define init_waitqueue_head(wq_head) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif -static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) +static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { - q->flags = 0; - q->private = p; - q->func = default_wake_function; + wq_entry->flags = 0; + wq_entry->private = p; + wq_entry->func = default_wake_function; } static inline void -init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) +init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { - q->flags = 0; - q->private = NULL; - q->func = func; + wq_entry->flags = 0; + wq_entry->private = NULL; + wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue - * @q: the waitqueue to test for waiters + * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * - * Use either while holding wait_queue_head_t::lock or when used for wakeups + * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like: * * CPU0 - waker CPU1 - waiter * * for (;;) { - * @cond = true; prepare_to_wait(&wq, &wait, state); + * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() - * if (waitqueue_active(wq)) if (@cond) - * wake_up(wq); break; + * if (waitqueue_active(wq_head)) if (@cond) + * wake_up(wq_head); break; * schedule(); * } - * finish_wait(&wq, &wait); + * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll @@ -133,20 +119,20 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. 
*/ -static inline int waitqueue_active(wait_queue_head_t *q) +static inline int waitqueue_active(struct wait_queue_head *wq_head) { - return !list_empty(&q->task_list); + return !list_empty(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes - * @wq: wait queue head + * @wq_head: wait queue head * - * Returns true if wq has waiting processes + * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ -static inline bool wq_has_sleeper(wait_queue_head_t *wq) +static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the @@ -156,63 +142,51 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq) * waiting side. */ smp_mb(); - return waitqueue_active(wq); + return waitqueue_active(wq_head); } -extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); -extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); -extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); +extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); -static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) +static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&new->task_list, &head->task_list); + list_add(&wq_entry->entry, &wq_head->head); } /* * Used for wake-one threads: */ static inline void -__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq_head, wq_entry); } -static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) +static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add_tail(&new->task_list, &head->task_list); + list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void -__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void -__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_del(&old->task_list); + list_del(&wq_entry->entry); } -typedef int wait_bit_action_f(struct wait_bit_key *, int mode); -void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); -void __wake_up_bit(wait_queue_head_t *, void *, int); -int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); -int 
__wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); -void wake_up_bit(void *, int); -void wake_up_atomic_t(atomic_t *); -int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); -int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long); -int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); -int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); -wait_queue_head_t *bit_waitqueue(void *, int); +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -228,28 +202,28 @@ wait_queue_head_t *bit_waitqueue(void *, int); /* * Wakeup macros to be used to report events to the targets. */ -#define wake_up_poll(x, m) \ +#define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) -#define wake_up_locked_poll(x, m) \ +#define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) -#define wake_up_interruptible_poll(x, m) \ +#define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) -#define wake_up_interruptible_sync_poll(x, m) \ +#define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) -#define ___wait_cond_timeout(condition) \ -({ \ - bool __cond = (condition); \ - if (__cond && !__ret) \ - __ret = 1; \ - __cond || !__ret; \ +#define ___wait_cond_timeout(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ }) -#define ___wait_is_interruptible(state) \ - (!__builtin_constant_p(state) || \ - state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ -extern void init_wait_entry(wait_queue_t *__wait, int flags); +extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret @@ -263,108 +237,108 @@ extern void init_wait_entry(wait_queue_t *__wait, int flags); * otherwise. */ -#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ -({ \ - __label__ __out; \ - wait_queue_t __wait; \ - long __ret = ret; /* explicit shadow */ \ - \ - init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ - for (;;) { \ - long __int = prepare_to_wait_event(&wq, &__wait, state);\ - \ - if (condition) \ - break; \ - \ - if (___wait_is_interruptible(state) && __int) { \ - __ret = __int; \ - goto __out; \ - } \ - \ - cmd; \ - } \ - finish_wait(&wq, &__wait); \ -__out: __ret; \ +#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + struct wait_queue_entry __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_entry(&__wq_entry, exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ +__out: __ret; \ }) -#define __wait_event(wq, condition) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __wait_event(wq_head, condition) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. + * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ -#define wait_event(wq, condition) \ -do { \ - might_sleep(); \ - if (condition) \ - break; \ - __wait_event(wq, condition); \ +#define wait_event(wq_head, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_event(wq_head, condition); \ } while (0) -#define __io_wait_event(wq, condition) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __io_wait_event(wq_head, condition) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ -#define io_wait_event(wq, condition) \ -do { \ - might_sleep(); \ - if (condition) \ - break; \ - __io_wait_event(wq, condition); \ +#define io_wait_event(wq_head, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __io_wait_event(wq_head, condition); \ } while (0) -#define __wait_event_freezable(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ +#define __wait_event_freezable(wq_head, condition) \ + ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule(); try_to_freeze()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The - * @condition is checked each time the waitqueue @wq is woken up. + * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. 
*/ -#define wait_event_freezable(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_freezable(wq, condition); \ - __ret; \ +#define wait_event_freezable(wq_head, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable(wq_head, condition); \ + __ret; \ }) -#define __wait_event_timeout(wq, condition, timeout) \ - ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_UNINTERRUPTIBLE, 0, timeout, \ +#define __wait_event_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. + * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -375,83 +349,83 @@ do { \ * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ -#define wait_event_timeout(wq, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ +#define wait_event_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_timeout(wq_head, condition, timeout); \ + __ret; \ }) -#define __wait_event_freezable_timeout(wq, condition, timeout) \ - ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ +#define __wait_event_freezable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret); try_to_freeze()) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. 
*/ -#define wait_event_freezable_timeout(wq, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_freezable_timeout(wq, condition, timeout); \ - __ret; \ +#define wait_event_freezable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ + __ret; \ }) -#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ +#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ -#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ -do { \ - if (condition) \ - break; \ - __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) -#define __wait_event_cmd(wq, condition, cmd1, cmd2) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. + * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ -#define wait_event_cmd(wq, condition, cmd1, cmd2) \ -do { \ - if (condition) \ - break; \ - __wait_event_cmd(wq, condition, cmd1, cmd2); \ +#define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) -#define __wait_event_interruptible(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ +#define __wait_event_interruptible(wq_head, condition) \ + ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. + * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -459,29 +433,29 @@ do { \ * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. 
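A minimal sketch of the waiter/waker pairing that the wait_event_interruptible() description above refers to; the wait-queue head, flag and function names are illustrative only and are not part of this patch:

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
	static bool my_event;

	static int my_waiter(void)
	{
		/* sleeps in TASK_INTERRUPTIBLE until my_event is true or a signal arrives */
		int ret = wait_event_interruptible(my_wq, my_event);

		if (ret)
			return ret;	/* -ERESTARTSYS: interrupted by a signal */
		/* ... consume the event ... */
		return 0;
	}

	static void my_waker(void)
	{
		my_event = true;	/* update the condition first ... */
		wake_up(&my_wq);	/* ... then wake the sleepers */
	}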
*/ -#define wait_event_interruptible(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_interruptible(wq, condition); \ - __ret; \ +#define wait_event_interruptible(wq_head, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_interruptible(wq_head, condition); \ + __ret; \ }) -#define __wait_event_interruptible_timeout(wq, condition, timeout) \ - ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ +#define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. + * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -493,50 +467,49 @@ do { \ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ -#define wait_event_interruptible_timeout(wq, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_interruptible_timeout(wq, \ - condition, timeout); \ - __ret; \ +#define wait_event_interruptible_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_interruptible_timeout(wq_head, \ + condition, timeout); \ + __ret; \ }) -#define __wait_event_hrtimeout(wq, condition, timeout, state) \ -({ \ - int __ret = 0; \ - struct hrtimer_sleeper __t; \ - \ - hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ - HRTIMER_MODE_REL); \ - hrtimer_init_sleeper(&__t, current); \ - if ((timeout) != KTIME_MAX) \ - hrtimer_start_range_ns(&__t.timer, timeout, \ - current->timer_slack_ns, \ - HRTIMER_MODE_REL); \ - \ - __ret = ___wait_event(wq, condition, state, 0, 0, \ - if (!__t.task) { \ - __ret = -ETIME; \ - break; \ - } \ - schedule()); \ - \ - hrtimer_cancel(&__t.timer); \ - destroy_hrtimer_on_stack(&__t.timer); \ - __ret; \ +#define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ +({ \ + int __ret = 0; \ + struct hrtimer_sleeper __t; \ + \ + hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \ + hrtimer_init_sleeper(&__t, current); \ + if ((timeout) != KTIME_MAX) \ + hrtimer_start_range_ns(&__t.timer, timeout, \ + current->timer_slack_ns, \ + HRTIMER_MODE_REL); \ + \ + __ret = ___wait_event(wq_head, condition, state, 0, 0, \ + if (!__t.task) { \ + __ret = -ETIME; \ + break; \ + } \ + schedule()); \ + \ + hrtimer_cancel(&__t.timer); \ + destroy_hrtimer_on_stack(&__t.timer); \ + __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition 
evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. + * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -544,25 +517,25 @@ do { \ * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ -#define wait_event_hrtimeout(wq, condition, timeout) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_hrtimeout(wq, condition, timeout, \ - TASK_UNINTERRUPTIBLE); \ - __ret; \ +#define wait_event_hrtimeout(wq_head, condition, timeout) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ + TASK_UNINTERRUPTIBLE); \ + __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. + * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -570,73 +543,73 @@ do { \ * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ -#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ -({ \ - long __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_hrtimeout(wq, condition, timeout, \ - TASK_INTERRUPTIBLE); \ - __ret; \ +#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ +({ \ + long __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq, condition, timeout, \ + TASK_INTERRUPTIBLE); \ + __ret; \ }) -#define __wait_event_interruptible_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ +#define __wait_event_interruptible_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) -#define wait_event_interruptible_exclusive(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_interruptible_exclusive(wq, condition);\ - __ret; \ +#define wait_event_interruptible_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_interruptible_exclusive(wq, condition); \ + __ret; \ }) -#define __wait_event_killable_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ +#define __wait_event_killable_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) -#define wait_event_killable_exclusive(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_killable_exclusive(wq, condition); \ - __ret; \ +#define wait_event_killable_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_killable_exclusive(wq, condition); \ + __ret; \ }) -#define __wait_event_freezable_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ 
+#define __wait_event_freezable_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule(); try_to_freeze()) -#define wait_event_freezable_exclusive(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_freezable_exclusive(wq, condition);\ - __ret; \ +#define wait_event_freezable_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable_exclusive(wq, condition); \ + __ret; \ }) -extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *); -extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); - -#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ -({ \ - int __ret; \ - DEFINE_WAIT(__wait); \ - if (exclusive) \ - __wait.flags |= WQ_FLAG_EXCLUSIVE; \ - do { \ - __ret = fn(&(wq), &__wait); \ - if (__ret) \ - break; \ - } while (!(condition)); \ - __remove_wait_queue(&(wq), &__wait); \ - __set_current_state(TASK_RUNNING); \ - __ret; \ +extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); +extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); + +#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ +({ \ + int __ret; \ + DEFINE_WAIT(__wait); \ + if (exclusive) \ + __wait.flags |= WQ_FLAG_EXCLUSIVE; \ + do { \ + __ret = fn(&(wq), &__wait); \ + if (__ret) \ + break; \ + } while (!(condition)); \ + __remove_wait_queue(&(wq), &__wait); \ + __set_current_state(TASK_RUNNING); \ + __ret; \ }) @@ -663,8 +636,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ -#define wait_event_interruptible_locked(wq, condition) \ - ((condition) \ +#define wait_event_interruptible_locked(wq, condition) \ + ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** @@ -690,8 +663,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ -#define wait_event_interruptible_locked_irq(wq, condition) \ - ((condition) \ +#define wait_event_interruptible_locked_irq(wq, condition) \ + ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** @@ -721,8 +694,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ -#define wait_event_interruptible_exclusive_locked(wq, condition) \ - ((condition) \ +#define wait_event_interruptible_exclusive_locked(wq, condition) \ + ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** @@ -752,12 +725,12 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ -#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ - ((condition) \ +#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ + ((condition) \ ? 
0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) -#define __wait_event_killable(wq, condition) \ +#define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** @@ -775,21 +748,21 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ -#define wait_event_killable(wq, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_event_killable(wq, condition); \ - __ret; \ +#define wait_event_killable(wq_head, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_killable(wq_head, condition); \ + __ret; \ }) -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ +#define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ spin_lock_irq(&lock)) /** @@ -797,7 +770,7 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * condition is checked under the lock. This * is expected to be called with the lock * taken. - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. @@ -806,7 +779,7 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. + * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -815,11 +788,11 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ -#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ +#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** @@ -827,14 +800,14 @@ do { \ * condition is checked under the lock. This * is expected to be called with the lock * taken. - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. + * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -842,26 +815,26 @@ do { \ * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. 
*/ -#define wait_event_lock_irq(wq, condition, lock) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, ); \ +#define wait_event_lock_irq(wq_head, condition, lock) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) -#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ +#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ + ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. @@ -870,7 +843,7 @@ do { \ * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is - * checked each time the waitqueue @wq is woken up. + * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -882,27 +855,27 @@ do { \ * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ -#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ -({ \ - int __ret = 0; \ - if (!(condition)) \ - __ret = __wait_event_interruptible_lock_irq(wq, \ - condition, lock, cmd); \ - __ret; \ +#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_interruptible_lock_irq(wq_head, \ + condition, lock, cmd); \ + __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is - * checked each time the waitqueue @wq is woken up. + * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -913,28 +886,28 @@ do { \ * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. 
*/ -#define wait_event_interruptible_lock_irq(wq, condition, lock) \ -({ \ - int __ret = 0; \ - if (!(condition)) \ - __ret = __wait_event_interruptible_lock_irq(wq, \ - condition, lock,); \ - __ret; \ +#define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_interruptible_lock_irq(wq_head, \ + condition, lock,); \ + __ret; \ }) -#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ - lock, timeout) \ - ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ - spin_unlock_irq(&lock); \ - __ret = schedule_timeout(__ret); \ +#define __wait_event_interruptible_lock_irq_timeout(wq_head, condition, \ + lock, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + spin_unlock_irq(&lock); \ + __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. @@ -942,7 +915,7 @@ do { \ * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is - * checked each time the waitqueue @wq is woken up. + * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -954,263 +927,42 @@ do { \ * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. 
*/ -#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ - timeout) \ -({ \ - long __ret = timeout; \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_interruptible_lock_irq_timeout( \ - wq, condition, lock, timeout); \ - __ret; \ +#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ + timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_interruptible_lock_irq_timeout( \ + wq_head, condition, lock, timeout); \ + __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ -void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); -void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); - -#define DEFINE_WAIT_FUNC(name, function) \ - wait_queue_t name = { \ - .private = current, \ - .func = function, \ - .task_list = LIST_HEAD_INIT((name).task_list), \ +void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); + +#define DEFINE_WAIT_FUNC(name, function) \ + struct wait_queue_entry name = { \ + .private = current, \ + .func = function, \ + .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define DEFINE_WAIT_BIT(name, word, bit) \ - struct wait_bit_queue name = { \ - .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ - .wait = { \ - .private = current, \ - .func = wake_bit_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ - }, \ - } - -#define init_wait(wait) \ - do { \ - (wait)->private = current; \ - (wait)->func = autoremove_wake_function; \ - INIT_LIST_HEAD(&(wait)->task_list); \ - (wait)->flags = 0; \ +#define init_wait(wait) \ + do { \ + (wait)->private = current; \ + (wait)->func = autoremove_wake_function; \ + INIT_LIST_HEAD(&(wait)->entry); \ + (wait)->flags = 0; \ } while (0) - -extern int bit_wait(struct wait_bit_key *, int); -extern int bit_wait_io(struct wait_bit_key *, int); -extern int bit_wait_timeout(struct wait_bit_key *, int); -extern int bit_wait_io_timeout(struct wait_bit_key *, int); - -/** - * wait_on_bit - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that waits on a bit. 
- * For instance, if one were to have waiters on a bitflag, one would - * call wait_on_bit() in threads waiting for the bit to clear. - * One uses wait_on_bit() where one is waiting for the bit to clear, - * but has no intention of setting it. - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, - bit_wait, - mode); -} - -/** - * wait_on_bit_io - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), but calls - * io_schedule() instead of schedule() for the actual waiting. - * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit_io(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, - bit_wait_io, - mode); -} - -/** - * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * @timeout: timeout, in jiffies - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), except also takes a - * timeout parameter. - * - * Returned value will be zero if the bit was cleared before the - * @timeout elapsed, or non-zero if the @timeout elapsed or process - * received a signal and the mode permitted wakeup on that signal. - */ -static inline int -wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, - unsigned long timeout) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_timeout(word, bit, - bit_wait_timeout, - mode, timeout); -} - -/** - * wait_on_bit_action - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared, and allow the waiting action to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. - * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, - unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, action, mode); -} - -/** - * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * There is a standard hashed waitqueue table for generic use. 
This - * is the part of the hashtable's accessor API that waits on a bit - * when one intends to set it, for instance, trying to lock bitflags. - * For instance, if one were to have waiters trying to set bitflag - * and waiting for it to clear before setting it, one would call - * wait_on_bit() in threads waiting to be able to set the bit. - * One uses wait_on_bit_lock() where one is waiting for the bit to - * clear with the intention of setting it, and when done, clearing it. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); -} - -/** - * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to atomically set it. This is similar - * to wait_on_bit(), but calls io_schedule() instead of schedule() - * for the actual waiting. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); -} - -/** - * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to set it, and allow the waiting action - * to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, - unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, action, mode); -} - -/** - * wait_on_atomic_t - Wait for an atomic_t to become 0 - * @val: The atomic value being waited on, a kernel virtual address - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for - * the purpose of getting a waitqueue, but we set the key to a bit number - * outside of the target 'word'. 
- */ -static inline -int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) -{ - might_sleep(); - if (atomic_read(val) == 0) - return 0; - return out_of_line_wait_on_atomic_t(val, action, mode); -} - #endif /* _LINUX_WAIT_H */ diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h new file mode 100644 index 000000000000..12b26660d7e9 --- /dev/null +++ b/include/linux/wait_bit.h @@ -0,0 +1,261 @@ +#ifndef _LINUX_WAIT_BIT_H +#define _LINUX_WAIT_BIT_H + +/* + * Linux wait-bit related types and methods: + */ +#include <linux/wait.h> + +struct wait_bit_key { + void *flags; + int bit_nr; +#define WAIT_ATOMIC_T_BIT_NR -1 + unsigned long timeout; +}; + +struct wait_bit_queue_entry { + struct wait_bit_key key; + struct wait_queue_entry wq_entry; +}; + +#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ + { .flags = word, .bit_nr = bit, } + +#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ + { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } + +typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); +int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); +int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); +void wake_up_bit(void *word, int bit); +void wake_up_atomic_t(atomic_t *p); +int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); +int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); +int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); +int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode); +struct wait_queue_head *bit_waitqueue(void *word, int bit); +extern void __init wait_bit_init(void); + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); + +#define DEFINE_WAIT_BIT(name, word, bit) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ + .wq_entry = { \ + .private = current, \ + .func = wake_bit_function, \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ + }, \ + } + +extern int bit_wait(struct wait_bit_key *key, int bit); +extern int bit_wait_io(struct wait_bit_key *key, int bit); +extern int bit_wait_timeout(struct wait_bit_key *key, int bit); +extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit); + +/** + * wait_on_bit - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that waits on a bit. + * For instance, if one were to have waiters on a bitflag, one would + * call wait_on_bit() in threads waiting for the bit to clear. + * One uses wait_on_bit() where one is waiting for the bit to clear, + * but has no intention of setting it. + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. 
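A minimal sketch of how the wait_on_bit() described above pairs with clear_bit()/wake_up_bit(); the flag word, bit number and function names are illustrative only and are not part of this patch:

	static unsigned long my_flags;		/* bit 0 used as a "busy" flag */
	#define MY_BUSY_BIT	0

	static int my_wait_until_idle(void)
	{
		/* sleep until bit 0 of my_flags is cleared; a fatal signal ends the wait */
		return wait_on_bit(&my_flags, MY_BUSY_BIT, TASK_KILLABLE);
	}

	static void my_finish_work(void)
	{
		clear_bit(MY_BUSY_BIT, &my_flags);
		smp_mb__after_atomic();		/* order the clear before the wakeup */
		wake_up_bit(&my_flags, MY_BUSY_BIT);
	}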
+ */ +static inline int +wait_on_bit(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait, + mode); +} + +/** + * wait_on_bit_io - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), but calls + * io_schedule() instead of schedule() for the actual waiting. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait_io, + mode); +} + +/** + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * @timeout: timeout, in jiffies + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), except also takes a + * timeout parameter. + * + * Returned value will be zero if the bit was cleared before the + * @timeout elapsed, or non-zero if the @timeout elapsed or process + * received a signal and the mode permitted wakeup on that signal. + */ +static inline int +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_timeout(word, bit, + bit_wait_timeout, + mode, timeout); +} + +/** + * wait_on_bit_action - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared, and allow the waiting action to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, action, mode); +} + +/** + * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that waits on a bit + * when one intends to set it, for instance, trying to lock bitflags. + * For instance, if one were to have waiters trying to set bitflag + * and waiting for it to clear before setting it, one would call + * wait_on_bit() in threads waiting to be able to set the bit. 
+ * One uses wait_on_bit_lock() where one is waiting for the bit to + * clear with the intention of setting it, and when done, clearing it. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); +} + +/** + * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to atomically set it. This is similar + * to wait_on_bit(), but calls io_schedule() instead of schedule() + * for the actual waiting. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); +} + +/** + * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to set it, and allow the waiting action + * to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, action, mode); +} + +/** + * wait_on_atomic_t - Wait for an atomic_t to become 0 + * @val: The atomic value being waited on, a kernel virtual address + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for + * the purpose of getting a waitqueue, but we set the key to a bit number + * outside of the target 'word'. 
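A minimal sketch of the reference-count style use of wait_on_atomic_t() described above; the counter, the action callback and the function names are illustrative only and are not part of this patch (the caller supplies its own sleep function here):

	static atomic_t my_users = ATOMIC_INIT(1);

	/* action callback: just sleep; a non-zero return would abort the wait */
	static int my_atomic_wait(atomic_t *p)
	{
		schedule();
		return 0;
	}

	static void my_put(void)
	{
		if (atomic_dec_and_test(&my_users))
			wake_up_atomic_t(&my_users);	/* counter reached 0, wake the waiters */
	}

	static void my_wait_for_last_put(void)
	{
		wait_on_atomic_t(&my_users, my_atomic_wait, TASK_UNINTERRUPTIBLE);
	}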
+ */ +static inline +int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) +{ + might_sleep(); + if (atomic_read(val) == 0) + return 0; + return out_of_line_wait_on_atomic_t(val, action, mode); +} + +#endif /* _LINUX_WAIT_BIT_H */ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..75e612a45824 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -62,7 +62,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; - wait_queue_t peer_wake; + wait_queue_entry_t peer_wake; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e3facb356838..91dc089d65b7 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -742,6 +742,7 @@ TRACE_EVENT(rcu_torture_read, * "OnlineQ": _rcu_barrier() found online CPU with callbacks. * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. + * "IRQNQ": An rcu_barrier_callback() callback found no callbacks. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. * "Inc2": _rcu_barrier() piggyback check counter incremented. diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 1abaf62c86fc..9c4eca6b374a 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -151,7 +151,18 @@ typedef struct siginfo { #define si_arch _sifields._sigsys._arch #endif -#ifndef __KERNEL__ +#ifdef __KERNEL__ +#define __SI_MASK 0xffff0000u +#define __SI_KILL (0 << 16) +#define __SI_TIMER (1 << 16) +#define __SI_POLL (2 << 16) +#define __SI_FAULT (3 << 16) +#define __SI_CHLD (4 << 16) +#define __SI_RT (5 << 16) +#define __SI_MESGQ (6 << 16) +#define __SI_SYS (7 << 16) +#define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) +#else /* __KERNEL__ */ #define __SI_KILL 0 #define __SI_TIMER 0 #define __SI_POLL 0 @@ -161,7 +172,7 @@ typedef struct siginfo { #define __SI_MESGQ 0 #define __SI_SYS 0 #define __SI_CODE(T,N) (N) -#endif +#endif /* __KERNEL__ */ /* * si_code values diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index aa63451ef20a..1953f8d6063b 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -26,7 +26,7 @@ #define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION /* - * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed + * The wait_queue_entry_token (autofs_wqt_t) is part of a structure which is passed * back to the kernel via ioctl from userspace. 
On architectures where 32- and * 64-bit userspace binaries can be executed it's important that the size of * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we @@ -49,7 +49,7 @@ struct autofs_packet_hdr { struct autofs_packet_missing { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 7c6da423d54e..65b72d0222e7 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -108,7 +108,7 @@ enum autofs_notify { /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; @@ -123,7 +123,7 @@ union autofs_packet_union { /* autofs v5 common packet struct */ struct autofs_v5_packet { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe019a720..e2a6c7b3510b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,5 +47,6 @@ * For the sched_{set,get}attr() calls */ #define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index e75e1b6ff27f..09299fcb842a 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -54,7 +54,11 @@ struct itimerval { #define CLOCK_BOOTTIME 7 #define CLOCK_REALTIME_ALARM 8 #define CLOCK_BOOTTIME_ALARM 9 -#define CLOCK_SGI_CYCLE 10 /* Hardware specific */ +/* + * The driver implementing this got removed. The clock ID is kept as a + * place holder. Do not reuse! + */ +#define CLOCK_SGI_CYCLE 10 #define CLOCK_TAI 11 #define MAX_CLOCKS 16 diff --git a/init/Kconfig b/init/Kconfig index 1d3475fc9496..bc4c180c66a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -472,354 +472,7 @@ config TASK_IO_ACCOUNTING endmenu # "CPU/Task time and stats accounting" -menu "RCU Subsystem" - -config TREE_RCU - bool - default y if !PREEMPT && SMP - help - This option selects the RCU implementation that is - designed for very large SMP system with hundreds or - thousands of CPUs. It also scales down nicely to - smaller systems. - -config PREEMPT_RCU - bool - default y if PREEMPT - help - This option selects the RCU implementation that is - designed for very large SMP systems with hundreds or - thousands of CPUs, but for which real-time response - is also required. It also scales down nicely to - smaller systems. - - Select this option if you are unsure. - -config TINY_RCU - bool - default y if !PREEMPT && !SMP - help - This option selects the RCU implementation that is - designed for UP systems from which real-time response - is not required. This option greatly reduces the - memory footprint of RCU. - -config RCU_EXPERT - bool "Make expert-level adjustments to RCU configuration" - default n - help - This option needs to be enabled if you wish to make - expert-level adjustments to RCU configuration. By default, - no such adjustments can be made, which has the often-beneficial - side-effect of preventing "make oldconfig" from asking you all - sorts of detailed questions about how you would like numerous - obscure RCU options to be set up. - - Say Y if you need to make expert-level adjustments to RCU. - - Say N if you are unsure. 
- -config SRCU - bool - default y - help - This option selects the sleepable version of RCU. This version - permits arbitrary sleeping or blocking within RCU read-side critical - sections. - -config CLASSIC_SRCU - bool "Use v4.11 classic SRCU implementation" - default n - depends on RCU_EXPERT && SRCU - help - This option selects the traditional well-tested classic SRCU - implementation from v4.11, as might be desired for enterprise - Linux distributions. Without this option, the shiny new - Tiny SRCU and Tree SRCU implementations are used instead. - At some point, it is hoped that Tiny SRCU and Tree SRCU - will accumulate enough test time and confidence to allow - Classic SRCU to be dropped entirely. - - Say Y if you need a rock-solid SRCU. - - Say N if you would like help test Tree SRCU. - -config TINY_SRCU - bool - default y if SRCU && TINY_RCU && !CLASSIC_SRCU - help - This option selects the single-CPU non-preemptible version of SRCU. - -config TREE_SRCU - bool - default y if SRCU && !TINY_RCU && !CLASSIC_SRCU - help - This option selects the full-fledged version of SRCU. - -config TASKS_RCU - bool - default n - select SRCU - help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. - -config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) - help - This option enables RCU CPU stall code that is common between - the TINY and TREE variants of RCU. The purpose is to allow - the tiny variants to disable RCU CPU stall warnings, while - making these warnings mandatory for the tree variants. - -config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU ) - -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - -config RCU_FANOUT - int "Tree-based hierarchical RCU fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 64 if 64BIT - default 32 if !64BIT - help - This option controls the fanout of hierarchical implementations - of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the fourth - root of NR_CPUS, which allows NR_CPUS to be insanely large. - The default value of RCU_FANOUT should be used for production - systems, but if you are stress-testing the RCU implementation - itself, small RCU_FANOUT values allow you to test large-system - code paths on small(er) systems. 
- - Select a specific number if testing RCU itself. - Take the default if unsure. - -config RCU_FANOUT_LEAF - int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 16 - help - This option controls the leaf-level fanout of hierarchical - implementations of RCU, and allows trading off cache misses - against lock contention. Systems that synchronize their - scheduling-clock interrupts for energy-efficiency reasons will - want the default because the smaller leaf-level fanout keeps - lock contention levels acceptably low. Very large systems - (hundreds or thousands of CPUs) will instead want to set this - value to the maximum value possible in order to reduce the - number of cache misses incurred during RCU's grace-period - initialization. These systems tend to run CPU-bound, and thus - are not helped by synchronized interrupts, and thus tend to - skew them, which reduces lock contention enough that large - leaf-level fanouts work well. That said, setting leaf-level - fanout to a large number will likely cause problematic - lock contention on the leaf-level rcu_node structures unless - you boot with the skew_tick kernel parameter. - - Select a specific number if testing RCU itself. - - Select the maximum permissible value for large systems, but - please understand that you may also need to set the skew_tick - kernel boot parameter to avoid contention on the rcu_node - structure's locks. - - Take the default if unsure. - -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - -config TREE_RCU_TRACE - def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) - select DEBUG_FS - help - This option provides tracing for the TREE_RCU and - PREEMPT_RCU implementations, permitting Makefile to - trivially select kernel/rcutree_trace.c. - -config RCU_BOOST - bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n - help - This option boosts the priority of preempted RCU readers that - block the current preemptible RCU grace period for too long. - This option also prevents heavy loads from blocking RCU - callback invocation for all flavors of RCU. - - Say Y here if you are working with real-time apps or heavy loads - Say N here if you are unsure. - -config RCU_KTHREAD_PRIO - int "Real-time priority to use for RCU worker threads" - range 1 99 if RCU_BOOST - range 0 99 if !RCU_BOOST - default 1 if RCU_BOOST - default 0 if !RCU_BOOST - depends on RCU_EXPERT - help - This option specifies the SCHED_FIFO priority value that will be - assigned to the rcuc/n and rcub/n threads and is also the value - used for RCU_BOOST (if enabled). 
If you are working with a - real-time application that has one or more CPU-bound threads - running at a real-time priority level, you should set - RCU_KTHREAD_PRIO to a priority higher than the highest-priority - real-time CPU-bound application thread. The default RCU_KTHREAD_PRIO - value of 1 is appropriate in the common case, which is real-time - applications that do not have any CPU-bound threads. - - Some real-time applications might not have a single real-time - thread that saturates a given CPU, but instead might have - multiple real-time threads that, taken together, fully utilize - that CPU. In this case, you should set RCU_KTHREAD_PRIO to - a priority higher than the lowest-priority thread that is - conspiring to prevent the CPU from running any non-real-time - tasks. For example, if one thread at priority 10 and another - thread at priority 5 are between themselves fully consuming - the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be - set to priority 6 or higher. - - Specify the real-time priority, or take the default if unsure. - -config RCU_BOOST_DELAY - int "Milliseconds to delay boosting after RCU grace-period start" - range 0 3000 - depends on RCU_BOOST - default 500 - help - This option specifies the time to wait after the beginning of - a given grace period before priority-boosting preempted RCU - readers blocking that grace period. Note that any RCU reader - blocking an expedited RCU grace period is boosted immediately. - - Accept the default if unsure. - -config RCU_NOCB_CPU - bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU - depends on RCU_EXPERT || NO_HZ_FULL - default n - help - Use this option to reduce OS jitter for aggressive HPC or - real-time workloads. It can also be used to offload RCU - callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. - - This option offloads callback invocation from the set of - CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuox/N") will be created to - invoke callbacks, where the "N" is the CPU being offloaded, - and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and - "s" for RCU-sched. Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. - Say N here if you are unsure. - -choice - prompt "Build-forced no-CBs CPUs" - default RCU_NOCB_CPU_NONE - depends on RCU_NOCB_CPU - help - This option allows no-CBs CPUs (whose RCU callbacks are invoked - from kthreads rather than from softirq context) to be specified - at build time. Additional no-CBs CPUs may be specified by - the rcu_nocbs= boot parameter. - -config RCU_NOCB_CPU_NONE - bool "No build_forced no-CBs CPUs" - help - This option does not force any of the CPUs to be no-CBs CPUs. - Only CPUs designated by the rcu_nocbs= boot parameter will be - no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU - kthreads whose names begin with "rcuo". All other CPUs will - invoke their own RCU callbacks in softirq context. - - Select this option if you want to choose no-CBs CPUs at - boot time, for example, to allow testing of different no-CBs - configurations without having to rebuild the kernel each time. 
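For illustration (CPU numbers hypothetical), the boot-time selection described in the help text above looks like this on an 8-CPU system:

    rcu_nocbs=1-7

CPUs 1-7 then have their RCU callbacks invoked by the per-CPU "rcuo" kthreads, which can in turn be confined to CPU 0 via affinity settings or cgroups.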
- -config RCU_NOCB_CPU_ZERO - bool "CPU 0 is a build_forced no-CBs CPU" - help - This option forces CPU 0 to be a no-CBs CPU, so that its RCU - callbacks are invoked by a per-CPU kthread whose name begins - with "rcuo". Additional CPUs may be designated as no-CBs - CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs. - All other CPUs will invoke their own RCU callbacks in softirq - context. - - Select this if CPU 0 needs to be a no-CBs CPU for real-time - or energy-efficiency reasons, but the real reason it exists - is to ensure that randconfig testing covers mixed systems. - -config RCU_NOCB_CPU_ALL - bool "All CPUs are build_forced no-CBs CPUs" - help - This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= - boot parameter will be ignored. All CPUs' RCU callbacks will - be executed in the context of per-CPU rcuo kthreads created for - this purpose. Assuming that the kthreads whose names start with - "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter - on the remaining CPUs, but might decrease memory locality during - RCU-callback invocation, thus potentially degrading throughput. - - Select this if all CPUs need to be no-CBs CPUs for real-time - or energy-efficiency reasons. - -endchoice - -endmenu # "RCU Subsystem" +source "kernel/rcu/Kconfig" config BUILD_BIN2C bool diff --git a/init/main.c b/init/main.c index f866510472d7..df58a416dd1d 100644 --- a/init/main.c +++ b/init/main.c @@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __ref rest_init(void) { + struct task_struct *tsk; int pid; rcu_scheduler_starting(); @@ -397,12 +398,32 @@ static noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - kernel_thread(kernel_init, NULL, CLONE_FS); + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PRREMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + complete(&kthreadd_done); /* @@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void) * init can allocate pages on any node */ set_mems_allowed(node_states[N_MEMORY]); - /* - * init can run on any cpu. 
- */ - set_cpus_allowed_ptr(current, cpu_all_mask); cad_pid = task_pid(current); diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..ebd8bdc3fd68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,100 +30,66 @@ #include <linux/uaccess.h> -static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { - memset(txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc->modes, &utp->modes) || - __get_user(txc->offset, &utp->offset) || - __get_user(txc->freq, &utp->freq) || - __get_user(txc->maxerror, &utp->maxerror) || - __get_user(txc->esterror, &utp->esterror) || - __get_user(txc->status, &utp->status) || - __get_user(txc->constant, &utp->constant) || - __get_user(txc->precision, &utp->precision) || - __get_user(txc->tolerance, &utp->tolerance) || - __get_user(txc->time.tv_sec, &utp->time.tv_sec) || - __get_user(txc->time.tv_usec, &utp->time.tv_usec) || - __get_user(txc->tick, &utp->tick) || - __get_user(txc->ppsfreq, &utp->ppsfreq) || - __get_user(txc->jitter, &utp->jitter) || - __get_user(txc->shift, &utp->shift) || - __get_user(txc->stabil, &utp->stabil) || - __get_user(txc->jitcnt, &utp->jitcnt) || - __get_user(txc->calcnt, &utp->calcnt) || - __get_user(txc->errcnt, &utp->errcnt) || - __get_user(txc->stbcnt, &utp->stbcnt)) - return -EFAULT; + struct compat_timex tx32; - return 0; -} - -static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) -{ - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc->modes, &utp->modes) || - __put_user(txc->offset, &utp->offset) || - __put_user(txc->freq, &utp->freq) || - __put_user(txc->maxerror, &utp->maxerror) || - __put_user(txc->esterror, &utp->esterror) || - __put_user(txc->status, &utp->status) || - __put_user(txc->constant, &utp->constant) || - __put_user(txc->precision, &utp->precision) || - __put_user(txc->tolerance, &utp->tolerance) || - __put_user(txc->time.tv_sec, &utp->time.tv_sec) || - __put_user(txc->time.tv_usec, &utp->time.tv_usec) 
|| - __put_user(txc->tick, &utp->tick) || - __put_user(txc->ppsfreq, &utp->ppsfreq) || - __put_user(txc->jitter, &utp->jitter) || - __put_user(txc->shift, &utp->shift) || - __put_user(txc->stabil, &utp->stabil) || - __put_user(txc->jitcnt, &utp->jitcnt) || - __put_user(txc->calcnt, &utp->calcnt) || - __put_user(txc->errcnt, &utp->errcnt) || - __put_user(txc->stbcnt, &utp->stbcnt) || - __put_user(txc->tai, &utp->tai)) + if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; - return 0; -} -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timespec64 new_ts; - struct timeval user_tv; - struct timezone new_tz; - - if (tv) { - if (compat_get_timeval(&user_tv, tv)) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); +int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) +{ + struct compat_timex tx32; + + memset(&tx32, 0, sizeof(struct compat_timex)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) + return -EFAULT; + return 0; } static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) @@ -213,141 +179,28 @@ int compat_convert_timespec(struct timespec __user **kts, return 0; } -static long compat_nanosleep_restart(struct restart_block *restart) -{ - struct compat_timespec __user *rmtp; - struct timespec rmt; - mm_segment_t oldfs; - long ret; - - restart->nanosleep.rmtp = (struct timespec __user *) &rmt; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep_restart(restart); - set_fs(oldfs); - - if (ret == -ERESTART_RESTARTBLOCK) { - rmtp = restart->nanosleep.compat_rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { - struct timespec tu, rmt; - struct timespec64 tu64; - mm_segment_t oldfs; - long ret; + struct compat_itimerval v32; - if (compat_get_timespec(&tu, rqtp)) + if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) return -EFAULT; - - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) - return -EINVAL; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu64, - rmtp ? (struct timespec __user *)&rmt : NULL, - HRTIMER_MODE_REL, CLOCK_MONOTONIC); - set_fs(oldfs); - - /* - * hrtimer_nanosleep() can only return 0 or - * -ERESTART_RESTARTBLOCK here because: - * - * - we call it with HRTIMER_MODE_REL and therefor exclude the - * -ERESTARTNOHAND return path. - * - * - we supply the rmtp argument from the task stack (due to - * the necessary compat conversion. So the update cannot - * fail, which excludes the -EFAULT return path as well. If - * it fails nevertheless we have a bigger problem and wont - * reach this place anymore. - * - * - if the return value is 0, we do not have to update rmtp - * because there is no remaining time. - * - * We check for -ERESTART_RESTARTBLOCK nevertheless if the - * core implementation decides to return random nonsense. 
- */ - if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart = ¤t->restart_block; - - restart->fn = compat_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - return ret; -} - -static inline long get_compat_itimerval(struct itimerval *o, - struct compat_itimerval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | - __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | - __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | - __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); -} - -static inline long put_compat_itimerval(struct compat_itimerval __user *o, - struct itimerval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | - __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | - __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | - __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); -} - -asmlinkage long sys_ni_posix_timers(void); - -COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) -{ - struct itimerval kit; - int error; - - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); - - error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) - error = -EFAULT; - return error; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; } -COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) +int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) { - struct itimerval kin, kout; - int error; - - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); + struct compat_itimerval v32; - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - if (put_compat_itimerval(out, &kout)) - return -EFAULT; - return 0; + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? 
-EFAULT : 0; } static compat_clock_t clock_t_to_compat_clock_t(clock_t x) @@ -689,193 +542,6 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } -COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, - struct compat_sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - struct sigevent __user *event = NULL; - - if (timer_event_spec) { - struct sigevent kevent; - - event = compat_alloc_user_space(sizeof(*event)); - if (get_compat_sigevent(&kevent, timer_event_spec) || - copy_to_user(event, &kevent, sizeof(*event))) - return -EFAULT; - } - - return sys_timer_create(which_clock, event, created_timer_id); -} - -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) -{ - long err; - mm_segment_t oldfs; - struct itimerspec newts, oldts; - - if (!new) - return -EINVAL; - if (get_compat_itimerspec(&newts, new)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_settime(timer_id, flags, - (struct itimerspec __user *) &newts, - (struct itimerspec __user *) &oldts); - set_fs(oldfs); - if (!err && old && put_compat_itimerspec(old, &oldts)) - return -EFAULT; - return err; -} - -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) -{ - long err; - mm_segment_t oldfs; - struct itimerspec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_gettime(timer_id, - (struct itimerspec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_itimerspec(setting, &ts)) - return -EFAULT; - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - if (compat_get_timespec(&ts, tp)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_settime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_gettime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) -{ - struct timex txc; - mm_segment_t oldfs; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); - set_fs(oldfs); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_getres(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && tp && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -static long compat_clock_nanosleep_restart(struct restart_block *restart) -{ - long err; - mm_segment_t oldfs; - struct timespec tu; - struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; - - restart->nanosleep.rmtp = (struct timespec __user *) &tu; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = 
clock_nanosleep_restart(restart); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&tu, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) -{ - long err; - mm_segment_t oldfs; - struct timespec in, out; - struct restart_block *restart; - - if (compat_get_timespec(&in, rqtp)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_nanosleep(which_clock, flags, - (struct timespec __user *) &in, - (struct timespec __user *) &out); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&out, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t->restart_block; - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes @@ -1035,64 +701,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_TIME - -/* compat_time_t is a 32 bit "long" and needs to get converted. */ - -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) -{ - compat_time_t i; - struct timeval tv; - - do_gettimeofday(&tv); - i = tv.tv_sec; - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ - -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) -{ - struct timex txc; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - ret = do_adjtimex(&txc); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/cpu.c b/kernel/cpu.c index cb5103413bd8..85840bd2df37 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,6 +27,7 @@ #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -65,6 +66,12 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) +static struct lock_class_key cpuhp_state_key; +static struct lockdep_map cpuhp_state_lock_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +#endif + /** * cpuhp_step - Hotplug state machine step * @name: Name of the step @@ -196,121 +203,41 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. +/* + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
* Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -static struct { - struct task_struct *active_writer; - /* wait queue to wake up the active_writer */ - wait_queue_head_t wq; - /* verifies that no writer will get active while readers are active */ - struct mutex lock; - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - atomic_t refcount; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} cpu_hotplug = { - .active_writer = NULL, - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), -#endif -}; - -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire_tryread() \ - lock_map_acquire_tryread(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) -#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); - -void get_online_cpus(void) +void cpus_read_lock(void) { - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); + percpu_down_read(&cpu_hotplug_lock); } -EXPORT_SYMBOL_GPL(get_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_lock); -void put_online_cpus(void) +void cpus_read_unlock(void) { - int refcount; - - if (cpu_hotplug.active_writer == current) - return; - - refcount = atomic_dec_return(&cpu_hotplug.refcount); - if (WARN_ON(refcount < 0)) /* try to fix things up */ - atomic_inc(&cpu_hotplug.refcount); - - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) - wake_up(&cpu_hotplug.wq); - - cpuhp_lock_release(); - + percpu_up_read(&cpu_hotplug_lock); } -EXPORT_SYMBOL_GPL(put_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_unlock); -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. 
- * - */ -void cpu_hotplug_begin(void) +void cpus_write_lock(void) { - DEFINE_WAIT(wait); - - cpu_hotplug.active_writer = current; - cpuhp_lock_acquire(); + percpu_down_write(&cpu_hotplug_lock); +} - for (;;) { - mutex_lock(&cpu_hotplug.lock); - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); - if (likely(!atomic_read(&cpu_hotplug.refcount))) - break; - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } - finish_wait(&cpu_hotplug.wq, &wait); +void cpus_write_unlock(void) +{ + percpu_up_write(&cpu_hotplug_lock); } -void cpu_hotplug_done(void) +void lockdep_assert_cpus_held(void) { - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); - cpuhp_lock_release(); + percpu_rwsem_assert_held(&cpu_hotplug_lock); } /* @@ -344,8 +271,6 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Notifier wrappers for transitioning to state machine */ - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -484,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; + lock_map_acquire(&cpuhp_state_lock_map); /* Single callback invocation for [un]install ? */ if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { @@ -510,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu) else if (st->state > st->target) ret = cpuhp_ap_offline(cpu, st); } + lock_map_release(&cpuhp_state_lock_map); st->result = ret; complete(&st->done); } @@ -524,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); + /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. @@ -567,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state state = st->state; trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); @@ -630,30 +562,6 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int dead_cpu) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (!p->on_rq) - continue; - /* - * We do the check with unlocked task_rq(p)->lock. - * Order the reading to do not warn about a task, - * which was running on this cpu in the past, and - * it's just been woken on another cpu. - */ - rmb(); - if (task_cpu(p) != dead_cpu) - continue; - - pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", - p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } - read_unlock(&tasklist_lock); -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -701,7 +609,7 @@ static int takedown_cpu(unsigned int cpu) /* * So now all preempt/rcu users must observe !cpu_active(). 
*/ - err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); + err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); @@ -773,7 +681,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; - cpu_hotplug_begin(); + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -811,7 +719,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, } out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -893,7 +801,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) struct task_struct *idle; int ret = 0; - cpu_hotplug_begin(); + cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; @@ -941,7 +849,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -1413,18 +1321,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, - bool invoke) +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, + bool invoke) { struct cpuhp_step *sp; int cpu; int ret; + lockdep_assert_cpus_held(); + sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; - get_online_cpus(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,13 +1363,23 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + return ret; +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** - * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state @@ -1468,25 +1388,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @multi_instance: State is set up for multiple instances which get * added afterwards. * + * The caller needs to hold cpus read locked while calling this function. * Returns: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN * 0 for all other states * On failure: proper (negative) error code */ -int __cpuhp_setup_state(enum cpuhp_state state, - const char *name, bool invoke, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; bool dynstate; + lockdep_assert_cpus_held(); + if (cpuhp_cb_check(state) || !name) return -EINVAL; - get_online_cpus(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1444,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. 
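A minimal sketch (caller names hypothetical, not part of this series) of how the new percpu-rwsem based hotplug protection is used from the read side, and how a *_cpuslocked style helper can assert that its caller provides it:

    static unsigned int hypothetical_count_online(void)
    {
            unsigned int cpu, cnt = 0;

            cpus_read_lock();               /* replaces get_online_cpus() */
            for_each_online_cpu(cpu)
                    cnt++;
            cpus_read_unlock();             /* replaces put_online_cpus() */
            return cnt;
    }

    static void hypothetical_cpuslocked_helper(void)
    {
            lockdep_assert_cpus_held();     /* caller must hold the hotplug lock */
            /* ... touch state that must not change across CPU hotplug ... */
    }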
@@ -1531,6 +1452,22 @@ out: return state; return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); + +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, + teardown, multi_instance); + cpus_read_unlock(); + return ret; +} EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, @@ -1544,7 +1481,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, if (!sp->multi_instance) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) @@ -1565,29 +1502,30 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** - * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * + * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ -void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,7 +1553,14 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + cpus_read_lock(); + __cpuhp_remove_state_cpuslocked(state, invoke); + cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/events/core.c b/kernel/events/core.c index 9afab55c68c2..1538df9b2b65 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static cpumask_var_t perf_online_mask; /* * perf event paranoia level: @@ -925,11 +926,6 @@ static inline int is_cgroup_event(struct perf_event *event) return 0; } -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - static inline void update_cgrp_time_from_event(struct perf_event *event) { } @@ -3825,14 +3821,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. 
- */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -5742,9 +5730,6 @@ static void perf_output_read_one(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -5789,6 +5774,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) +/* + * XXX PERF_SAMPLE_READ vs inherited events seems difficult. + * + * The problem is that its both hard and excessively expensive to iterate the + * child list, not to mention that its impossible to IPI the children running + * on another CPU, from interrupt/NMI context. + */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { @@ -7737,7 +7729,8 @@ static int swevent_hlist_get_cpu(int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && + cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -7758,7 +7751,7 @@ static int swevent_hlist_get(void) { int err, cpu, failed_cpu; - get_online_cpus(); + mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { @@ -7766,8 +7759,7 @@ static int swevent_hlist_get(void) goto fail; } } - put_online_cpus(); - + mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { @@ -7775,8 +7767,7 @@ fail: break; swevent_hlist_put_cpu(cpu); } - - put_online_cpus(); + mutex_unlock(&pmus_lock); return err; } @@ -8950,7 +8941,7 @@ perf_event_mux_interval_ms_store(struct device *dev, pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); @@ -8959,7 +8950,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpu_function_call(cpu, (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; @@ -9089,6 +9080,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); } @@ -9202,7 +9194,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { - struct pmu *pmu = NULL; + struct pmu *pmu; int idx; int ret; @@ -9471,9 +9463,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited events + * We currently do not support PERF_SAMPLE_READ on inherited events. + * See perf_output_read(). 
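A user-space sketch (values hypothetical, error handling trimmed) of the attribute combination the comment above is about; after this change such an event is refused at creation time:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_READ;  /* read counts in each sample */
            attr.inherit = 1;                     /* rejected together with PERF_SAMPLE_READ */

            if (syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0) < 0)
                    perror("perf_event_open");    /* expected to fail after this change */
            return 0;
    }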
*/ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) @@ -9486,9 +9479,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } pmu = perf_init_event(event); - if (!pmu) - goto err_ns; - else if (IS_ERR(pmu)) { + if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } @@ -9501,8 +9492,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, sizeof(unsigned long), GFP_KERNEL); - if (!event->addr_filters_offs) + if (!event->addr_filters_offs) { + err = -ENOMEM; goto err_per_task; + } /* force hw sync on the address filters */ event->addr_filters_gen = 1; @@ -9912,12 +9905,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - get_online_cpus(); - if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cpus; + goto err_task; /* * Reuse ptrace permission checks for now. @@ -10103,6 +10094,23 @@ SYSCALL_DEFINE5(perf_event_open, goto err_locked; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } + } + + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. @@ -10192,8 +10200,6 @@ SYSCALL_DEFINE5(perf_event_open, put_task_struct(task); } - put_online_cpus(); - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -10227,8 +10233,6 @@ err_alloc: err_cred: if (task) mutex_unlock(&task->signal->cred_guard_mutex); -err_cpus: - put_online_cpus(); err_task: if (task) put_task_struct(task); @@ -10283,6 +10287,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
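A kernel-side sketch (function name hypothetical) of the case the comment above guards: a counter bound to a specific CPU is now refused with -ENODEV while that CPU is offline, rather than being filtered by the removed cpu_online() check in find_get_context():

    static struct perf_event *hypothetical_cycles_on_cpu(int cpu)
    {
            struct perf_event_attr attr = {
                    .size   = sizeof(struct perf_event_attr),
                    .type   = PERF_TYPE_HARDWARE,
                    .config = PERF_COUNT_HW_CPU_CYCLES,
            };

            /* Returns ERR_PTR(-ENODEV) if @cpu is currently offline. */
            return perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
    }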
+ */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + if (!cpuctx->online) { + err = -ENODEV; + goto err_unlock; + } + } + if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; @@ -10950,6 +10969,8 @@ static void __init perf_event_init_all_cpus(void) struct swevent_htable *swhash; int cpu; + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); @@ -10965,7 +10986,7 @@ static void __init perf_event_init_all_cpus(void) } } -int perf_event_init_cpu(unsigned int cpu) +void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10978,7 +10999,6 @@ int perf_event_init_cpu(unsigned int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); - return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10996,19 +11016,22 @@ static void __perf_event_exit_context(void *__info) static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; - int idx; - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + mutex_lock(&pmus_lock); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; mutex_unlock(&ctx->mutex); } - srcu_read_unlock(&pmus_srcu, idx); + cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); } #else @@ -11016,6 +11039,29 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +int perf_event_init_cpu(unsigned int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); + cpumask_set_cpu(cpu, perf_online_mask); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); + } + mutex_unlock(&pmus_lock); + + return 0; +} + int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..c63226283aef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w) rcu_read_unlock(); } -struct task_struct *try_get_task_struct(struct task_struct **ptask) -{ - struct task_struct *task; - - rcu_read_lock(); - task = task_rcu_dereference(ptask); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - return task; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. 
Orphaned process groups are not to be affected @@ -1004,7 +991,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; + wait_queue_entry_t child_wait; int notask_error; }; @@ -1541,7 +1528,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; diff --git a/kernel/futex.c b/kernel/futex.c index b8ae87d227da..c934689043b2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -225,7 +225,7 @@ struct futex_pi_state { * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2c50f0977049..8a5b5970e6b0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -185,37 +185,64 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } +static void irq_state_clr_started(struct irq_desc *desc) +{ + irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); +} + +static void irq_state_set_started(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); +} + int irq_startup(struct irq_desc *desc, bool resend) { int ret = 0; - irq_state_clr_disabled(desc); desc->depth = 0; - irq_domain_activate_irq(&desc->irq_data); - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_masked(desc); - } else { + if (irqd_is_started(&desc->irq_data)) { irq_enable(desc); + } else { + irq_domain_activate_irq(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_disabled(desc); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + irq_state_set_started(desc); } + if (resend) check_irq_resend(desc); + return ret; } +static void __irq_disable(struct irq_desc *desc, bool mask); + void irq_shutdown(struct irq_desc *desc) { - irq_state_set_disabled(desc); - desc->depth = 1; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else if (desc->irq_data.chip->irq_disable) - desc->irq_data.chip->irq_disable(&desc->irq_data); - else - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (irqd_is_started(&desc->irq_data)) { + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) { + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_state_set_disabled(desc); + irq_state_set_masked(desc); + } else { + __irq_disable(desc, true); + } + irq_state_clr_started(desc); + } + /* + * This must be called even if the interrupt was never started up, + * because the activation can happen before the interrupt is + * available for request/startup. 
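A driver-side sketch (the "foo" names and device are hypothetical) of the request/enable pattern that the reworked startup/shutdown state tracking keeps consistent; with IRQ_NOAUTOEN the interrupt is only started up when enable_irq() is called:

    static int foo_request_irq(int irq, irq_handler_t handler, void *dev)
    {
            int ret;

            /* Keep the line off until the device is fully initialised. */
            irq_set_status_flags(irq, IRQ_NOAUTOEN);

            ret = request_irq(irq, handler, 0, "foo", dev); /* must not be IRQF_SHARED */
            if (ret)
                    return ret;

            /* ... program the device ... */

            enable_irq(irq);  /* first startup happens here, via irq_startup() */
            return 0;
    }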
It has it's own state tracking so + * it's safe to call it unconditionally. + */ irq_domain_deactivate_irq(&desc->irq_data); - irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -228,6 +255,17 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +static void __irq_disable(struct irq_desc *desc, bool mask) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } else if (mask) { + mask_irq(desc); + } +} + /** * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled @@ -250,13 +288,7 @@ void irq_enable(struct irq_desc *desc) */ void irq_disable(struct irq_desc *desc) { - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } else if (irq_settings_disable_unlazy(desc)) { - mask_irq(desc); - } + __irq_disable(desc, irq_settings_disable_unlazy(desc)); } void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) @@ -903,6 +935,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; + + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. + */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); + irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1613bfd48365..194c506d9d20 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -4,6 +4,8 @@ #include <linux/gfp.h> #include <linux/irq.h> +#include "internals.h" + /* * Device resource management aware IRQ request/free implementation. */ @@ -198,3 +200,87 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, return base; } EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); + +#ifdef CONFIG_GENERIC_IRQ_CHIP +/** + * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip + * for a managed device + * @dev: Device to allocate the generic chip for + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. 
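A sketch of a probe path using the new devres variants; foo_probe(), foo_get_regs(), foo_get_irq_base() and the register offset are hypothetical placeholders:

    static int foo_probe(struct platform_device *pdev)
    {
            struct irq_chip_generic *gc;
            void __iomem *base = foo_get_regs(pdev);      /* hypothetical helper */
            int irq_base = foo_get_irq_base(pdev);        /* hypothetical helper */

            gc = devm_irq_alloc_generic_chip(&pdev->dev, "foo", 1, irq_base,
                                             base, handle_level_irq);
            if (!gc)
                    return -ENOMEM;

            gc->chip_types[0].chip.irq_mask   = irq_gc_mask_set_bit;
            gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
            gc->chip_types[0].regs.mask       = 0x04;     /* hypothetical register */

            /* Eight interrupts starting at irq_base; torn down automatically. */
            return devm_irq_setup_generic_chip(&pdev->dev, gc, IRQ_MSK(8),
                                               IRQ_GC_INIT_MASK_CACHE, 0, 0);
    }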
The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, + unsigned int irq_base, void __iomem *reg_base, + irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = devm_kzalloc(dev, sz, GFP_KERNEL); + if (gc) + irq_init_generic_chip(gc, name, num_ct, + irq_base, reg_base, handler); + + return gc; +} +EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); + +struct irq_generic_chip_devres { + struct irq_chip_generic *gc; + u32 msk; + unsigned int clr; + unsigned int set; +}; + +static void devm_irq_remove_generic_chip(struct device *dev, void *res) +{ + struct irq_generic_chip_devres *this = res; + + irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set); +} + +/** + * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic + * chip for a managed device + * + * @dev: Device to setup the generic chip for + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. + */ +int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, + u32 msk, enum irq_gc_flags flags, + unsigned int clr, unsigned int set) +{ + struct irq_generic_chip_devres *dr; + + dr = devres_alloc(devm_irq_remove_generic_chip, + sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + irq_setup_generic_chip(gc, msk, flags, clr, set); + + dr->gc = gc; + dr->msk = msk; + dr->clr = clr; + dr->set = set; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip); +#endif /* CONFIG_GENERIC_IRQ_CHIP */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ee32870079c9..f7086b78ad6e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr) iowrite32be(val, addr); } -static void -irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, - int num_ct, unsigned int irq_base, - void __iomem *reg_base, irq_flow_handler_t handler) +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { raw_spin_lock_init(&gc->lock); gc->num_ct = num_ct; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bc226e783bd2..921a2419720c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -226,3 +226,14 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif + +#ifdef CONFIG_GENERIC_IRQ_CHIP +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler); +#else +static inline void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { } +#endif /* CONFIG_GENERIC_IRQ_CHIP */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 22e443133987..c5bbaed5a5e6 100644 --- 
a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -480,7 +480,8 @@ int __init early_irq_init(void) /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", + NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) nr_irqs = IRQ_BITMAP_BITS; @@ -516,7 +517,7 @@ int __init early_irq_init(void) init_irq_default_affinity(); - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); desc = irq_desc; count = ARRAY_SIZE(irq_desc); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31805f237396..70b9da72018b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_find_mapping); #ifdef CONFIG_IRQ_DOMAIN_DEBUG +static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_domain *domain; + struct irq_data *data; + + domain = desc->irq_data.domain; + data = &desc->irq_data; + + while (domain) { + unsigned int irq = data->irq; + unsigned long hwirq = data->hwirq; + struct irq_chip *chip; + bool direct; + + if (data == &desc->irq_data) + seq_printf(m, "%5d ", irq); + else + seq_printf(m, "%5d+ ", irq); + seq_printf(m, "0x%05lx ", hwirq); + + chip = irq_data_get_irq_chip(data); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); + + seq_printf(m, data ? "0x%p " : " %p ", + irq_data_get_irq_chip_data(data)); + + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); + seq_printf(m, "%s\n", domain->name); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + domain = domain->parent; + data = data->parent_data; +#else + domain = NULL; +#endif + } +} + static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void *data, **slot; + void **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -760,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private) mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { struct device_node *of_node; + const char *name; + int count = 0; + of_node = irq_domain_get_of_node(domain); + if (of_node) + name = of_node_full_name(of_node); + else if (is_fwnode_irqchip(domain->fwnode)) + name = container_of(domain->fwnode, struct irqchip_fwid, + fwnode)->name; + else + name = ""; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - of_node ? of_node_full_name(of_node) : ""); + name); } mutex_unlock(&irq_domain_mutex); @@ -782,30 +834,7 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); - domain = desc->irq_data.domain; - - if (domain) { - struct irq_chip *chip; - int hwirq = desc->irq_data.hwirq; - bool direct; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05x ", hwirq); - - chip = irq_desc_get_chip(desc); - seq_printf(m, "%-15s ", (chip && chip->name) ? 
chip->name : "none"); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, data ? "0x%p " : " %p ", data); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", desc->irq_data.domain->name); - } - + virq_debug_show_one(m, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 425170d4439b..4c34696ca575 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -533,9 +533,15 @@ void __enable_irq(struct irq_desc *desc) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); - irq_enable(desc); - check_irq_resend(desc); - /* fall-through */ + /* + * Call irq_startup() not irq_enable() here because the + * interrupt might be marked NOAUTOEN. So irq_startup() + * needs to be invoked when it gets enabled the first + * time. If it was already started up, then irq_startup() + * will invoke irq_enable() under the hood. + */ + irq_startup(desc, true); + break; } default: desc->depth--; @@ -1330,11 +1336,19 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (irq_settings_can_autoenable(desc)) + if (irq_settings_can_autoenable(desc)) { irq_startup(desc, true); - else + } else { + /* + * Shared interrupts do not go well with disabling + * auto enable. The sharing interrupt might request + * it while it's still disabled and then wait for + * interrupts forever. + */ + WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; + } /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ddc2f5427f75..fe4d48ec5bc4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + struct irq_domain *domain; + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + if (domain && info->chip && info->chip->name) + domain->name = info->chip->name; + + return domain; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -15,6 +15,7 @@ #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> +#include <linux/cpu.h> #ifdef HAVE_JUMP_LABEL @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) return; } + cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + cpus_read_lock(); /* * The negative 
count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); + cpus_read_unlock(); return; } @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); + cpus_read_unlock(); } static void jump_label_update_timeout(struct work_struct *work) @@ -334,6 +340,7 @@ void __init jump_label_init(void) if (static_key_initialized) return; + cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -353,6 +360,7 @@ void __init jump_label_init(void) } static_key_initialized = true; jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_MODULES @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, struct module *mod = data; int ret = 0; + cpus_read_lock(); + jump_label_lock(); + switch (val) { case MODULE_STATE_COMING: - jump_label_lock(); ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } - jump_label_unlock(); break; case MODULE_STATE_GOING: - jump_label_lock(); jump_label_del_module(mod); - jump_label_unlock(); break; case MODULE_STATE_LIVE: - jump_label_lock(); jump_label_invalidate_module_init(mod); - jump_label_unlock(); break; } + jump_label_unlock(); + cpus_read_unlock(); + return notifier_from_errno(ret); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index adfe3b4cfe05..6756d750b31b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() + * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. 
*/ - get_online_cpus(); + lockdep_assert_cpus_held(); + + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + /* See comment in do_optimize_kprobes() */ + lockdep_assert_cpus_held(); + /* Unoptimization must be done anytime */ if (list_empty(&unoptimizing_list)) return; - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) list_del_init(&op->list); } mutex_unlock(&text_mutex); - put_online_cpus(); } /* Reclaim all kprobes on the free_list */ @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); + cpus_read_lock(); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { - get_online_cpus(); + lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); - put_online_cpus(); if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called */ + cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_SYSCTL @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) if (kprobes_allow_optimization) goto out; + cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -833,6 +837,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } + cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) return; } + cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) arm_kprobe_ftrace(kp); return; } - /* - * Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). 
- */ + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* Disarm a kprobe with text_mutex */ @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) disarm_kprobe_ftrace(kp); return; } - /* Ditto */ + + cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; + cpus_read_lock(); + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); - /* - * Get online CPUs to avoid text_mutex deadlock.with stop machine, - * which is invoked by unoptimize_kprobe() in add_new_kprobe() - */ - get_online_cpus(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) out: mutex_unlock(&text_mutex); - put_online_cpus(); jump_label_unlock(); + cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) goto out; } - mutex_lock(&text_mutex); /* Avoiding text modification */ + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); + cpus_read_unlock(); if (ret) goto out; @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) /* Try to optimize kprobe */ try_to_optimize_kprobe(p); - out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0e31bfee25c..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("======================================================\n"); pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); pr_warn("------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); + pr_warn("\nwhich lock already depends on the new lock.\n\n"); + pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); pr_warn("-----------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct 
task_struct *curr, curr->softirqs_enabled); print_lock(next); - printk("\nand this task is already holding:\n"); + pr_warn("\nand this task is already holding:\n"); print_lock(prev); - printk("which would create a new lock dependency:\n"); + pr_warn("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(KERN_CONT " ->"); + pr_cont(" ->"); print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); + pr_cont("\n"); - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); + pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - printk("\nto a %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); + pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); + pr_warn("..."); print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - printk("\nother info that might help us debug this:\n\n"); + pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); + pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nthe dependencies between the lock to be acquired"); + pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) return 0; print_shortest_lock_dependencies(forwards_entry, next_root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { - printk("\n"); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); pr_warn("----------------------------\n"); - printk("%s/%d: ", current->comm, task_pid_nr(current)); - printk("Hash chain already cached but the contents 
don't match!\n"); + pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); + pr_warn("Hash chain already cached but the contents don't match!\n"); - printk("Held locks:"); + pr_warn("Held locks:"); print_chain_keys_held_locks(curr, hlock_next); - printk("Locks in cached chain:"); + pr_warn("Locks in cached chain:"); print_chain_keys_chain(chain); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } #endif @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); pr_warn("--------------------------------\n"); - printk("inconsistent {%s} -> {%s} usage.\n", + pr_warn("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, trace_softirqs_enabled(curr)); print_lock(this); - printk("{%s} state was registered at:\n", usage_str[prev_bit]); + pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_usage_bug_scenario(this); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", + pr_warn("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); /* Find a middle lock (if one exists) */ depth = get_lock_depth(other); do { if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); + pr_warn("lockdep:%s bad path found in chain graph\n", __func__); break; } middle = entry; @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + 
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) return 0; print_shortest_lock_dependencies(other, root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); pr_warn("----------------------------------\n"); - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); + pr_warn("\nbut this task is not holding:\n"); + pr_warn("%s\n", hlock->nest_lock->name); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); pr_warn("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", + pr_warn("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no more locks to release!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); pr_warn("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", + pr_warn("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no locks held!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, 
mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) int unlock = 1; if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); + pr_warn("INFO: lockdep is turned off.\n"); return; } - printk("\nShowing all locks held in the system:\n"); + pr_warn("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); + pr_warn("hm, tasklist_lock locked, retrying... "); if (count) { count--; - printk(" #%d", 10-count); + pr_cont(" #%d", 10-count); mdelay(200); goto retry; } - printk(" ignoring it.\n"); + pr_cont(" ignoring it.\n"); unlock = 0; } else { if (count != 10) - printk(KERN_CONT " locked it.\n"); + pr_cont(" locked it.\n"); } do_each_thread(g, p) { @@ -4445,7 +4445,7 @@ retry: unlock = 1; } while_each_thread(g, p); - printk("\n"); + pr_warn("\n"); pr_warn("=============================================\n\n"); if (unlock) @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n"); + pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); pr_warn("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", + pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } @@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); pr_warn("-----------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("%s:%d %s!\n", file, line, s); + pr_warn("\nother info that might help us debug this:\n\n"); + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() @@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * rcu_read_lock_bh() and so on from extended quiescent states. 
*/ if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); + pr_warn("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 58e366ad36f4..ac35e648b0e5 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lock->name = name; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif } diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index b585af9a1b50..5078c6ddf4a5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -11,7 +11,7 @@ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 28cd09e635ed..78069895032a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { + int ret; + might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { + int ret; + might_sleep(); - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + int ret; + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; } 
EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1561,6 +1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { + mutex_release(&lock->dep_map, 1, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); @@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock) lock->magic = NULL; #endif } - EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** @@ -1632,14 +1653,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); * * Initializing of a locked rt lock is not allowed */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) +void __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name); + if (name && key) + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -1660,7 +1683,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + __rt_mutex_init(lock, NULL, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 6607802efa8b..5c253caffe91 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -17,7 +17,7 @@ #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_init(m, n, k) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -934,29 +934,18 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - -/** * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. 
* * @wq: workqueue to use for the allocated padata instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Must be called from a cpus_read_lock() protected region */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; struct parallel_data *pd = NULL; @@ -965,7 +954,6 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - get_online_cpus(); if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { @@ -989,14 +977,12 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; - put_online_cpus(); - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif return pinst; @@ -1005,12 +991,27 @@ err_free_masks: free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); - put_online_cpus(); err: return NULL; } /** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * + * Must be called from a cpus_read_lock() protected region + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + lockdep_assert_cpus_held(); + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** * padata_free - free a padata instance * * @padata_inst: padata instance to free diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8603a48fcdea..fc47863f629c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1175,7 +1175,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..be90c945063f --- /dev/null +++ b/kernel/rcu/Kconfig @@ -0,0 +1,242 @@ +# +# RCU-related configuration options +# + +menu "RCU Subsystem" + +config TREE_RCU + bool + default y if !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config PREEMPT_RCU + bool + default y if PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool + default y if !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. 
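As an illustrative aside (not part of the patch): the three flavor options above — TREE_RCU, PREEMPT_RCU and TINY_RCU — are mutually exclusive and are derived purely from CONFIG_PREEMPT and CONFIG_SMP, so exactly one of them ends up enabled in any given build. A minimal plain-C sketch of that selection logic, mirroring the "default y if ..." lines above:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the "default y if ..." lines of the three flavor options above. */
static const char *rcu_flavor(bool preempt, bool smp)
{
        if (preempt)
                return "PREEMPT_RCU";   /* default y if PREEMPT */
        if (smp)
                return "TREE_RCU";      /* default y if !PREEMPT && SMP */
        return "TINY_RCU";              /* default y if !PREEMPT && !SMP */
}

int main(void)
{
        printf("%s\n", rcu_flavor(true, true));         /* PREEMPT_RCU */
        printf("%s\n", rcu_flavor(false, true));        /* TREE_RCU */
        printf("%s\n", rcu_flavor(false, false));       /* TINY_RCU */
        return 0;
}
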
+ +config RCU_EXPERT + bool "Make expert-level adjustments to RCU configuration" + default n + help + This option needs to be enabled if you wish to make + expert-level adjustments to RCU configuration. By default, + no such adjustments can be made, which has the often-beneficial + side-effect of preventing "make oldconfig" from asking you all + sorts of detailed questions about how you would like numerous + obscure RCU options to be set up. + + Say Y if you need to make expert-level adjustments to RCU. + + Say N if you are unsure. + +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + +config TINY_SRCU + bool + default y if SRCU && TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if SRCU && !TINY_RCU + help + This option selects the full-fledged version of SRCU. + +config TASKS_RCU + bool + default n + select SRCU + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || PREEMPT_RCU ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config RCU_NEED_SEGCBLIST + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. 
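To make the "at least the fourth root of NR_CPUS" constraint above concrete, here is a rough capacity estimate (plain C, illustrative only, not part of the patch). It assumes one leaf level of RCU_FANOUT_LEAF (see the next entry, default 16) plus interior levels of RCU_FANOUT, which approximates the geometry computed by rcu_init_levelspread(); the four-level limit is what motivates the fourth-root wording above.

#include <stdio.h>

/*
 * Rough rcu_node tree capacity: one leaf level of fanout_leaf plus
 * (levels - 1) interior levels of fanout. Illustrative only; the
 * kernel computes the real geometry in rcu_init_levelspread().
 */
static unsigned long tree_capacity(int levels, unsigned long fanout,
                                   unsigned long fanout_leaf)
{
        unsigned long cap = fanout_leaf;
        int i;

        for (i = 1; i < levels; i++)
                cap *= fanout;
        return cap;
}

int main(void)
{
        int levels;

        /* Defaults on 64BIT: RCU_FANOUT=64, RCU_FANOUT_LEAF=16. */
        for (levels = 1; levels <= 4; levels++)
                printf("%d level(s): up to %lu CPUs\n",
                       levels, tree_capacity(levels, 64, 16));
        return 0;
}

With the default values this prints 16, 1024, 65536 and 4194304, so the defaults comfortably cover even very large systems.
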
+ +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. + + Take the default if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP && RCU_EXPERT + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. 
+ + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug @@ -0,0 +1,82 @@ +# +# RCU-related debugging configuration options +# + +menu "RCU Debugging" + +config PROVE_RCU + def_bool PROVE_LOCKING + +config TORTURE_TEST + tristate + default n + +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to be built into + the kernel. + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_STALL_COMMON + range 3 300 + default 21 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on DEBUG_KERNEL + default y if TREE_RCU + select TRACE_CLOCK + help + This option enables additional tracepoints for ftrace-style + event tracing. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_EQS_DEBUG + bool "Provide debugging asserts for adding NO_HZ support to an arch" + depends on DEBUG_KERNEL + help + This option provides consistency checks in RCU's handling of + NO_HZ. These checks have proven quite helpful in detecting + bugs in arch-specific NO_HZ code. 
+ + Say N here if you need ultimate kernel/user switch latencies + Say Y if you are unsure + +endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 23803c7d5180..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,13 +3,11 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_CLASSIC_SRCU) += srcu.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 73e16ec4054b..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + void rcu_early_boot_tests(void); void rcu_test_sync_prims(void); @@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) cpu <= rnp->grphi; \ cpu = cpumask_next((cpu), cpu_possible_mask)) +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. 
*/ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + +#ifdef CONFIG_TINY_RCU +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +#else /* #ifdef CONFIG_TINY_RCU */ +void rcu_request_urgent_qs_task(struct task_struct *t); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); +void rcutorture_record_test_transition(void); +void rcutorture_record_progress(unsigned long vernum); +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#ifdef CONFIG_RCU_TRACE +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif +#endif + +#ifdef CONFIG_TINY_SRCU + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_idx; + *gpnum = *completed; +} + +#elif defined(CONFIG_TREE_SRCU) + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#endif + +#ifdef CONFIG_TINY_RCU + +/* + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. + */ +static inline unsigned long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods completed. + */ +static inline unsigned long rcu_batches_completed_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods completed. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + +/* + * Return the number of expedited grace periods completed. 
+ */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + +static inline void show_rcu_gp_kthreads(void) +{ +} + +#else /* #ifdef CONFIG_TINY_RCU */ +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); +unsigned long srcu_batches_completed(struct srcu_struct *sp); +void show_rcu_gp_kthreads(void); +void rcu_force_quiescent_state(void); +void rcu_bh_force_quiescent_state(void); +void rcu_sched_force_quiescent_state(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +#ifdef CONFIG_RCU_NOCB_CPU +bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -48,6 +48,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); @@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
%s\n", perf_type, s); } while (0) +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nreaders, 0, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, shutdown, !IS_ENABLED(MODULE), + "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); static int rcu_perf_writer_state; #define RTWS_INIT 0 -#define RTWS_EXP_SYNC 1 -#define RTWS_SYNC 2 -#define RTWS_IDLE 2 -#define RTWS_STOPPING 3 +#define RTWS_ASYNC 1 +#define RTWS_BARRIER 2 +#define RTWS_EXP_SYNC 3 +#define RTWS_SYNC 4 +#define RTWS_IDLE 5 +#define RTWS_STOPPING 6 #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -114,6 +123,8 @@ struct rcu_perf_ops { unsigned long (*started)(void); unsigned long (*completed)(void); unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); const char *name; @@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = { .started = rcu_batches_started, .completed = rcu_batches_completed, .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .name = "rcu" @@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_bh, + .gp_barrier = rcu_barrier_bh, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, .name = "rcu_bh" @@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void) return srcu_batches_completed(srcu_ctlp); } +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + static void srcu_perf_synchronize(void) { synchronize_srcu(srcu_ctlp); @@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = { .started = NULL, .completed = srcu_perf_completed, .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, .sync = srcu_perf_synchronize, .exp_sync = srcu_perf_synchronize_expedited, .name = "srcu" }; +static struct srcu_struct srcud; + +static void srcu_sync_perf_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_perf_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_perf_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_perf_init, + .cleanup = srcu_sync_perf_cleanup, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = 
NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcud" +}; + /* * Definitions for sched perf testing. */ @@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = { .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_sched, + .gp_barrier = rcu_barrier_sched, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, .name = "sched" @@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = { .readunlock = tasks_perf_read_unlock, .started = rcu_no_completed, .completed = rcu_no_completed, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .name = "tasks" @@ -344,6 +404,15 @@ rcu_perf_reader(void *arg) } /* + * Callback function for asynchronous grace periods from rcu_perf_writer(). + */ +static void rcu_perf_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + +/* * RCU perf writer kthread. Repeatedly does a grace period. */ static int @@ -352,6 +421,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct rcu_head *rhp = NULL; struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; @@ -380,9 +450,27 @@ rcu_perf_writer(void *arg) } do { + if (writer_holdoff) + udelay(writer_holdoff); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_exp) { + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + rcu_perf_writer_state = RTWS_ASYNC; + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_perf_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { @@ -429,6 +517,10 @@ rcu_perf_writer(void *arg) i++; rcu_perf_wait_shutdown(); } while (!torture_must_stop()); + if (gp_async) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + } rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); @@ -452,6 +544,17 @@ rcu_perf_cleanup(void) u64 *wdp; u64 *wdpp; + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. 
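The gp_async path in the writer loop above posts callbacks with cur_ops->async() until a per-CPU in-flight count reaches gp_async_max, then falls back to cur_ops->gp_barrier() to drain before posting more. A minimal user-space sketch of that throttling idea follows; post_cb(), cb_barrier() and the single global counter are names invented for the example, not kernel interfaces, and the real code uses a per-CPU atomic rather than a plain int.

    #include <stdio.h>
    #include <stdlib.h>

    struct cb { struct cb *next; void (*func)(struct cb *); };

    static struct cb *pending;                /* callbacks not yet "invoked" */
    static int n_inflight;                    /* stands in for n_async_inflight */
    static const int max_inflight = 4;        /* stands in for gp_async_max */

    static void post_cb(struct cb *c, void (*func)(struct cb *))
    {
        c->func = func;
        c->next = pending;
        pending = c;
        n_inflight++;
    }

    static void cb_barrier(void)              /* drain: "wait" for all callbacks */
    {
        while (pending) {
            struct cb *c = pending;

            pending = c->next;
            c->func(c);
        }
    }

    static void free_cb(struct cb *c)
    {
        n_inflight--;
        free(c);
    }

    int main(void)
    {
        for (int i = 0; i < 10; i++) {
            struct cb *c = malloc(sizeof(*c));

            if (!c)
                break;
            if (n_inflight < max_inflight) {
                post_cb(c, free_cb);          /* fast path: under the limit */
            } else {
                cb_barrier();                 /* throttle: drain, then post */
                post_cb(c, free_cb);
            }
        }
        cb_barrier();
        printf("callbacks still in flight: %d\n", n_inflight);
        return 0;
    }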
+ */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + if (torture_cleanup_begin()) return; @@ -554,7 +657,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, RCUPERF_TASKS_OPS }; @@ -624,16 +727,6 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } - if (rcu_gp_is_normal() && gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ae6e574d4cf5..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,8 @@ #include <linux/torture.h> #include <linux/vmalloc.h> +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); @@ -562,31 +564,19 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) #ifdef CONFIG_TREE_SRCU idx = srcu_ctlp->srcu_idx & 0x1; -#else /* #ifdef CONFIG_TREE_SRCU */ - idx = srcu_ctlp->completed & 0x1; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; -#ifdef CONFIG_TREE_SRCU struct srcu_data *counts; counts = per_cpu_ptr(srcu_ctlp->sda, cpu); u0 = counts->srcu_unlock_count[!idx]; u1 = counts->srcu_unlock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - struct srcu_array *counts; - - counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - u0 = counts->unlock_count[!idx]; - u1 = counts->unlock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -594,13 +584,8 @@ static void srcu_torture_stats(void) */ smp_rmb(); -#ifdef CONFIG_TREE_SRCU l0 = counts->srcu_lock_count[!idx]; l1 = counts->srcu_lock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - l0 = counts->lock_count[!idx]; - l1 = counts->lock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; @@ -609,7 +594,7 @@ static void srcu_torture_stats(void) pr_cont("\n"); #elif defined(CONFIG_TINY_SRCU) idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", torture_type, TORTURE_FLAG, idx, READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index dea03614263f..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney <paulmck@us.ibm.com> - * Lai Jiangshan <laijs@cn.fujitsu.com> - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include <linux/export.h> -#include <linux/mutex.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/rcupdate_wait.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/delay.h> -#include <linux/srcu.h> - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. 
- * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[idx]); - } - return sum; -} - -/* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->unlock_count[idx]); - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be zero. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long unlocks; - - unlocks = srcu_readers_unlock_idx(sp, idx); - - /* - * Make sure that a lock is always counted if the corresponding unlock - * is counted. Needs to be a smp_mb() as the read side may contain a - * read from a variable that is written to before the synchronize_srcu() - * in the write side. In this case smp_mb()s A and B act like the store - * buffering pattern. - * - * This smp_mb() also pairs with smp_mb() C to prevent accesses after the - * synchronize_srcu() from being executed before the grace period ends. - */ - smp_mb(); /* A */ - - /* - * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does not - * mean that there are no more readers, as one could have read the - * current index but not have incremented the lock counter yet. - * - * Possible bug: There is no guarantee that there haven't been ULONG_MAX - * increments of ->lock_count[] since the unlocks were counted, meaning - * that this could return true even if there are still active readers. - * Since there are no memory barriers around srcu_flip(), the CPU is not - * required to increment ->completed before running - * srcu_readers_unlock_idx(), which means that there could be an - * arbitrarily large number of critical sections that execute after - * srcu_readers_unlock_idx() but use the old value of ->completed. - */ - return srcu_readers_lock_idx(sp, idx) == unlocks; -} - -/** - * srcu_readers_active - returns true if there are readers. and false - * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. 
- */ -static bool srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this only after you are finished using a given srcu_struct - * that was initialized via init_srcu_struct(). This code does some - * probabalistic checking, spotting late uses of srcu_read_lock(), - * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). - * If any such late uses are detected, the per-CPU memory associated with - * the srcu_struct is simply leaked and WARN_ON() is invoked. If the - * caller frees the srcu_struct itself, a use-after-free crash will likely - * ensue, but at least there will be a warning printed. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->completed) & 0x1; - this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. 
- */ -static void srcu_flip(struct srcu_struct *sp) -{ - WRITE_ONCE(sp->completed, sp->completed + 1); - - /* - * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). - */ - smp_mb(); /* D */ /* Pairs with C. */ -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing SRCU read-side critical section. On systems with - * more than one CPU, this means that when "func()" is invoked, each CPU - * is guaranteed to have executed a full memory barrier since the end of - * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing - * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() - * but before the beginning of that SRCU read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting SRCU callback function "func()", then both CPU A and CPU - * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". - * This guarantee applies even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * Of course, these guarantees apply only for invocations of call_srcu(), - * srcu_read_lock(), and srcu_read_unlock() that are all passed the same - * srcu_struct structure. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - rcu_callback_t func) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || - lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) { - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ - } - -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section, - * as long as the resulting graph of srcu_structs is acyclic. - * - * There are memory-ordering constraints implied by synchronize_srcu(). - * On systems with more than one CPU, when synchronize_srcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section - * whose beginning preceded the call to synchronize_srcu(). In addition, - * each CPU having an SRCU read-side critical section that extends beyond - * the return from synchronize_srcu() is guaranteed to have executed a - * full memory barrier after the beginning of synchronize_srcu() and before - * the beginning of that SRCU read-side critical section. Note that these - * guarantees include CPUs that are offline, idle, or executing in user mode, - * as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_srcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_srcu(). This guarantee applies even if CPU A and CPU B - * are the same CPU, but again only if the system has more than one CPU. - * - * Of course, these memory-ordering guarantees apply only when - * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are - * passed the same srcu_struct structure. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). 
- */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. 
Note that needed memory barriers have been executed - * in this task's context by srcu_readers_active_idx_check(). - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. - */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - queue_delayed_work(system_power_efficient_wq, - &sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32798eb14853..1a1c1047d2ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_lock_nesting[0] = 0; sp->srcu_lock_nesting[1] = 0; init_swait_queue_head(&sp->srcu_wq); - sp->srcu_gp_seq = 0; - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; sp->srcu_gp_running = false; sp->srcu_gp_waiting = false; sp->srcu_idx = 0; @@ -88,30 +88,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); flush_work(&sp->srcu_work); - WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); - WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); + WARN_ON(sp->srcu_cb_head); + WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Can be invoked from irq/bh handlers, but the matching - * __srcu_read_unlock() must be in the same handler instance. Returns an - * index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* * Removes the count for the old reader from the appropriate element of * the srcu_struct. 
*/ @@ -133,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); void srcu_drive_gp(struct work_struct *wp) { int idx; - struct rcu_cblist ready_cbs; - struct srcu_struct *sp; + struct rcu_head *lh; struct rcu_head *rhp; + struct srcu_struct *sp; sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) return; /* Already running or nothing to do. */ - /* Tag recently arrived callbacks and wait for readers. */ + /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(sp->srcu_gp_running, true); - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); + local_irq_disable(); + lh = sp->srcu_cb_head; + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + local_irq_enable(); idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ - rcu_seq_end(&sp->srcu_gp_seq); - - /* Update callback list based on GP, and invoke ready callbacks. */ - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - rcu_cblist_init(&ready_cbs); - local_irq_disable(); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - } - local_irq_disable(); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); + + /* Invoke the callbacks we removed above. */ + while (lh) { + rhp = lh; + lh = lh->next; + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); } - WRITE_ONCE(sp->srcu_gp_running, false); /* - * If more callbacks, reschedule ourselves. This can race with - * a call_srcu() at interrupt level, but the ->srcu_gp_running - * checks will straighten that out. + * Enable rescheduling, and if there are more callbacks, + * reschedule ourselves. This can race with a call_srcu() + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. */ - if (!rcu_segcblist_empty(&sp->srcu_cblist)) + WRITE_ONCE(sp->srcu_gp_running, false); + if (READ_ONCE(sp->srcu_cb_head)) schedule_work(&sp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -187,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - head->func = func; + rhp->func = func; + rhp->next = NULL; local_irq_save(flags); - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + *sp->srcu_cb_tail = rhp; + sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); if (!READ_ONCE(sp->srcu_gp_running)) schedule_work(&sp->srcu_work); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 157654fa436a..d0ca524bf042 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,9 +40,15 @@ #include "rcu.h" #include "rcu_segcblist.h" -ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. 
*/ +/* Holdoff in nanoseconds for auto-expediting. */ +#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) +static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); +/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ +static ulong counter_wrap_check = (ULONG_MAX >> 2); +module_param(counter_wrap_check, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_init(&snp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_init(&sdp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * to each update-side SRCU primitive. Use sp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ @@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * not mean that there are no more readers, as one could have read * the current index but not have incremented the lock counter yet. 
* - * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were - * counted, meaning that this could return true even if there are - * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->srcu_idx - * before running srcu_readers_unlock_idx(), which means that there - * could be an arbitrarily large number of critical sections that - * execute after srcu_readers_unlock_idx() but use the old value - * of ->srcu_idx. + * So suppose that the updater is preempted here for so long + * that more than ULONG_MAX non-nested readers come and go in + * the meantime. It turns out that this cannot result in overflow + * because if a reader modifies its unlock count after we read it + * above, then that reader's next load of ->srcu_idx is guaranteed + * to get the new value, which will cause it to operate on the + * other bank of counters, where it cannot contribute to the + * overflow of these counters. This means that there is a maximum + * of 2*NR_CPUS increments, which cannot overflow given current + * systems, especially not on 64-bit systems. + * + * OK, how about nesting? This does impose a limit on nesting + * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, + * especially on 64-bit systems. */ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -400,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), - "Invoked srcu_gp_start() without ->gp_lock!"); + lockdep_assert_held(&sp->lock); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -489,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + int cpu; + unsigned long flags; unsigned long gpseq; int idx; int idxnext; unsigned long mask; + struct srcu_data *sdp; struct srcu_node *snp; /* Prevent more than one additional grace period. */ mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -508,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -516,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_irq(&snp->lock); + raw_spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -526,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq(&snp->lock); - if (cbs) { - smp_mb(); /* GP end before CB invocation. */ + raw_spin_unlock_irq_rcu_node(snp); + if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); - } + + /* Occasionally prevent srcu_data counter wrap. 
*/ + if (!(gpseq & counter_wrap_check)) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + sdp = per_cpu_ptr(sp->sda, cpu); + raw_spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + } } /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); } } @@ -567,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -600,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { - smp_mb(); /* CBs after GP! */ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -622,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. 
Pair with load @@ -645,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -671,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { + /* + * Ensure that if this updater saw a given reader's increment + * from __srcu_read_lock(), that reader was using an old value + * of ->srcu_idx. Also ensure that if a given reader sees the + * new value of ->srcu_idx, this updater's earlier scans cannot + * have seen that reader's increments (which is OK, because this + * grace period need not wait on that reader). + */ + smp_mb(); /* E */ /* Pairs with B and C. */ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* @@ -745,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) } /* + * SRCU callback function to leak a callback. + */ +static void srcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating * grace-period processing if it is not already running. @@ -782,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, struct srcu_data *sdp; check_init_srcu_struct(sp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ + WRITE_ONCE(rhp->func, srcu_leak_callback); + WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); + return; + } rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - spin_lock(&sdp->lock); + raw_spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -799,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) srcu_funnel_exp_start(sp, sdp->mynode, s); } +/** + * call_srcu() - Queue a callback for invocation after an SRCU grace period + * @sp: srcu_struct in queue the callback + * @head: structure to be used for queueing the SRCU callback. + * @func: function to be invoked after the SRCU grace period + * + * The callback function will be invoked some time after a full SRCU + * grace period elapses, in other words after all pre-existing SRCU + * read-side critical sections have completed. However, the callback + * function might well execute concurrently with other SRCU read-side + * critical sections that started after call_srcu() was invoked. SRCU + * read-side critical sections are delimited by srcu_read_lock() and + * srcu_read_unlock(), and may be nested. + * + * The callback will be invoked from process context, but must nevertheless + * be fast and must not block. 
+ */ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { @@ -953,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) + &sdp->srcu_barrier_head, 0)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); - spin_unlock_irq(&sdp->lock); + } + raw_spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1008,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1067,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - spin_lock_irq(&sdp->lock); - smp_mb(); /* Old grace periods before callback invocation! */ + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + debug_rcu_head_unqueue(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1092,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1111,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. 
*/ @@ -1121,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); @@ -1152,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Hierarchical SRCU implementation.\n"); + if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) + pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + return 0; +} +early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index e5385731e391..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,15 +35,26 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/trace_events.h> #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ -struct rcu_ctrlblk; -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp); +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; #include "tiny_plugin.h" @@ -59,19 +70,6 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL(rcu_barrier_sched); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool notrace __rcu_is_watching(void) -{ - return true; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers @@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -125,7 +122,6 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -140,10 +136,8 @@ void rcu_check_callbacks(int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. 
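The rcu_ctrlblk structure moved into tiny.c above keeps all callbacks on one singly linked list with two tail markers: curtail is where new callbacks are appended, while donetail marks the end of the callbacks whose grace period has already elapsed; a quiescent state simply advances donetail to curtail. A self-contained sketch of that splice logic (enqueue(), note_quiescent_state() and process_callbacks() are illustrative names, not the kernel functions):

    #include <stdio.h>

    struct cb { struct cb *next; void (*func)(struct cb *); };

    static struct cb *cblist;                 /* list of pending callbacks */
    static struct cb **donetail = &cblist;    /* ->next of last "done" callback */
    static struct cb **curtail = &cblist;     /* ->next of last callback */

    static void enqueue(struct cb *c, void (*func)(struct cb *))
    {
        c->func = func;
        c->next = NULL;
        *curtail = c;
        curtail = &c->next;
    }

    static void note_quiescent_state(void)
    {
        donetail = curtail;           /* everything queued so far is now "done" */
    }

    static void process_callbacks(void)
    {
        struct cb *list = cblist;     /* detach the done segment */

        cblist = *donetail;
        *donetail = NULL;
        if (curtail == donetail)
            curtail = &cblist;
        donetail = &cblist;

        while (list) {
            struct cb *next = list->next;

            list->func(list);
            list = next;
        }
    }

    static void show(struct cb *c) { printf("invoked %p\n", (void *)c); }

    int main(void)
    {
        struct cb a, b;

        enqueue(&a, show);
        note_quiescent_state();
        enqueue(&b, show);            /* b arrived after the quiescent state */
        process_callbacks();          /* invokes only a */
        note_quiescent_state();
        process_callbacks();          /* now invokes b */
        return 0;
    }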
*/ - RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(rn, list); + __rcu_reclaim("", list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) @@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) - rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 371034e77f87..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -22,36 +22,6 @@ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -#include <linux/kthread.h> -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include <linux/kernel_stat.h> @@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void) } #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -/* - * Dump statistics for TINY_RCU, such as they are. 
- */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutiny_trace_init); - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = READ_ONCE(rcp->jiffies_stall); - if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - } else if (ULONG_CMP_GE(j, js)) { - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); - } -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e354e475e645..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ -#ifdef CONFIG_RCU_KTHREAD_PRIO -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. 
*/ -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; -module_param(gp_preinit_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ -static const int gp_preinit_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT -static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; -module_param(gp_init_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -static const int gp_init_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; -module_param(gp_cleanup_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ -static const int gp_cleanup_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static int gp_preinit_delay; +module_param(gp_preinit_delay, int, 0444); +static int gp_init_delay; +module_param(gp_init_delay, int, 0444); +static int gp_cleanup_delay; +module_param(gp_cleanup_delay, int, 0444); /* * Number of grace periods between delays, normalized by the duration of @@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -265,6 +248,7 @@ void rcu_sched_qs(void) void rcu_bh_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), @@ -286,10 +270,6 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; /* @@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt) barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(); + rcu_preempt_note_context_switch(preempt); /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) goto out; @@ -534,9 +514,12 @@ void rcu_all_qs(void) } EXPORT_SYMBOL_GPL(rcu_all_qs); -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +static long blimit = DEFAULT_RCU_BLIMIT; +#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +static long qhimark = DEFAULT_RCU_QHIMARK; +#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. 
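The three limits just defined drive callback-invocation throttling: normally at most blimit callbacks are invoked per rcu_do_batch() pass, but once more than qhimark callbacks are pending the limit is ignored until the backlog drops back to qlowmark, and the module_param() lines just below allow all three to be adjusted at boot. A hedged sketch of that policy; the helper is illustrative, not the kernel's code:

#include <limits.h>

/*
 * Illustrative throttling policy only.  Returns how many callbacks the
 * next batch may invoke, given the current queue length.
 */
static long example_batch_limit(long qlen, long cur_limit)
{
	if (qlen > DEFAULT_RCU_QHIMARK)
		return LONG_MAX;		/* backlog too deep: ignore blimit */
	if (cur_limit == LONG_MAX && qlen <= DEFAULT_RCU_QLOMARK)
		return DEFAULT_RCU_BLIMIT;	/* backlog drained: back to normal */
	return cur_limit;
}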
*/ +static long qlowmark = DEFAULT_RCU_QLOMARK; module_param(blimit, long, 0444); module_param(qhimark, long, 0444); @@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); return READ_ONCE(*fp); } @@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -864,7 +847,6 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -914,7 +896,6 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); rdtp->dynticks_nesting--; } - rcu_sysidle_enter(1); } /* @@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -995,7 +977,6 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -1047,7 +1028,6 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(oldval, true); - rcu_sysidle_exit(1); } /* @@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void) } /** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - -/** * rcu_is_watching - see if RCU thinks that the current CPU is idle * - * If the current CPU is in its idle loop and is neither in an interrupt + * Return true if RCU is watching the running CPU, which means that this + * CPU can safely enter RCU read-side critical sections. In other words, + * if the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. 
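With __rcu_is_watching() folded into its caller, rcu_is_watching() is the remaining query for code that might run while RCU is not watching the CPU (idle loop, offline entry, NMI). A minimal usage sketch; example_probe() and its data pointer are hypothetical:

#include <linux/rcupdate.h>

/* Hypothetical hook that may be invoked from the idle loop. */
static void example_probe(void *data)
{
	if (!rcu_is_watching())
		return;		/* RCU read-side critical sections are unsafe here */

	rcu_read_lock();
	/* ... safely dereference RCU-protected data ... */
	rcu_read_unlock();
}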
*/ bool notrace rcu_is_watching(void) @@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = __rcu_is_watching(); + ret = !rcu_dynticks_curr_cpu_in_eqs(); preempt_enable_notrace(); return ret; } @@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, @@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; bool *rnhqp; @@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { + lockdep_assert_held(&rnp->lock); + /* * If RCU is idle, we just wait for the next grace period. * But we can only be sure that RCU is idle if we are looking @@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + lockdep_assert_held(&rnp->lock); + /* * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. @@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; + lockdep_assert_held(&rnp->lock); + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && !unlikely(READ_ONCE(rdp->gpwrap))) { @@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) */ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - bool isidle = false; - unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = true; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); + force_qs_rnp(rsp, dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. 
*/ - isidle = true; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2341,6 +2308,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + lockdep_assert_held(&rcu_get_root(rsp)->lock); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; + lockdep_assert_held(&rnp->lock); + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { @@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; + lockdep_assert_held(&rnp->lock); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2599,6 +2571,8 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs do not have orphanable callbacks. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; @@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs are handled specially. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) { int cpu; unsigned long flags; @@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + if (f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } } @@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ + /* + * Probable double call_rcu(), so leak the callback. + * Use rcu:rcu_callback trace event to find the previous + * time callback was passed to __call_rcu(). 
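The lockdep_assert_held() calls sprinkled through the functions above turn their "caller must hold rnp->lock" (or ->orphan_lock) comments into checks that CONFIG_PROVE_LOCKING can enforce at runtime. The same idiom in generic form; the structure and function below are hypothetical:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct example_node {
	raw_spinlock_t lock;
	unsigned long qsmask;
};

/* Caller must hold enp->lock; lockdep complains (once) if it does not. */
static void example_clear_mask(struct example_node *enp, unsigned long mask)
{
	lockdep_assert_held(&enp->lock);
	enp->qsmask &= ~mask;
}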
+ */ + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } head->func = func; @@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, local_irq_restore(flags); } -/* - * Queue an RCU-sched callback for invocation after a grace period. +/** + * call_rcu_sched() - Queue an RCU for invocation after sched grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_sched() assumes + * that the read-side critical sections end on enabling of preemption + * or on voluntary preemption. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. + * + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { @@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * Queue an RCU callback for invocation after a quicker grace period. +/** + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { @@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. 
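The new kerneldoc for call_rcu_sched() and call_rcu_bh() is largely about what counts as a read-side critical section for each flavor. For the sched flavor, the two forms it lists are equivalent, as in this sketch; both reader functions are hypothetical:

#include <linux/rcupdate.h>
#include <linux/preempt.h>

/* Explicit sched-flavor markers... */
static void example_reader_marked(void)
{
	rcu_read_lock_sched();
	/* ... access data whose reclamation is deferred by call_rcu_sched() ... */
	rcu_read_unlock_sched();
}

/* ...or, equivalently, anything that disables preemption. */
static void example_reader_implicit(void)
{
	preempt_disable();
	/* ... the same data may be accessed here ... */
	preempt_enable();
}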
*/ void synchronize_sched(void) { @@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); + rdp->barrier_head.func = rcu_barrier_callback; + debug_rcu_head_queue(&rdp->barrier_head); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + atomic_inc(&rsp->barrier_cpu_count); + } else { + debug_rcu_head_unqueue(&rdp->barrier_head); + _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + } } /* @@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ba38262c3554..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -45,14 +45,6 @@ struct rcu_dynticks { bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -160,19 +152,6 @@ struct rcu_node { /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -312,9 +291,9 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. 
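The reworked rcu_barrier_func() above entrains the barrier callback directly into each CPU's segmented callback list and only counts CPUs where that entraining succeeds. What rcu_barrier() buys its callers is unchanged, and the classic use is module unload, as in this hypothetical exit handler:

#include <linux/module.h>
#include <linux/rcupdate.h>

static void __exit example_exit(void)
{
	/*
	 * Wait for every callback already posted with call_rcu() to be
	 * invoked before this module's text (including the callback
	 * function itself) goes away.  A grace period alone would not
	 * suffice; rcu_barrier() waits for the callbacks themselves.
	 */
	rcu_barrier();
}
module_exit(example_exit);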
*/ -#define RCU_NOGP_WAKE_NOT 0 -#define RCU_NOGP_WAKE 1 -#define RCU_NOGP_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_NOT 0 +#define RCU_NOCB_WAKE 1 +#define RCU_NOCB_WAKE_FORCE 2 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(void); +static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); @@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(int irq); -static void rcu_sysidle_exit(int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); @@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ #endif /* #ifndef RCU_TREE_NONCORE */ - -#ifdef CONFIG_RCU_TRACE -/* Read out queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ -#ifdef CONFIG_RCU_NOCB_CPU - *ql = atomic_long_read(&rdp->nocb_q_count); - *qll = atomic_long_read(&rdp->nocb_q_count_lazy); -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - *ql = 0; - *qll = 0; -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -} -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Wrappers for the rcu_node::lock acquire and release. - * - * Because the rcu_nodes form a tree, the tree traversal locking will observe - * different lock values, this in turn means that an UNLOCK of one level - * followed by a LOCK of another level does not imply a full memory barrier; - * and most importantly transitivity is lost. - * - * In order to restore full ordering between tree levels, augment the regular - * lock acquire functions with smp_mb__after_unlock_lock(). - * - * As ->lock of struct rcu_node is a __private field, therefore one should use - * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. 
- */ -static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); -} - -static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} - -static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); -} - -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) - -static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) -{ - bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); - - if (locked) - smp_mb__after_unlock_lock(); - return locked; -} diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e513b4ab1197..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * * Caller must hold the rcu_state's exp_mutex. */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
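The comment and wrappers removed above describe the ordering problem they solve: an UNLOCK of one rcu_node lock followed by a LOCK of another does not, by itself, provide full ordering on all architectures, so the lock side adds smp_mb__after_unlock_lock(). A sketch of the usage pattern, assuming the wrappers remain available to RCU-internal code from a common header; example_propagate() is hypothetical:

/* Sketch only: hand a quiescent-state report from a child rcu_node to its parent. */
static void example_propagate(struct rcu_node *child, struct rcu_node *parent)
{
	raw_spin_lock_rcu_node(child);		/* acquire ->lock, then full barrier */
	/* ... record the quiescent state in the child ... */
	raw_spin_unlock_rcu_node(child);

	raw_spin_lock_rcu_node(parent);		/* barrier restores transitivity */
	/* ... child-side updates are ordered before anything done here ... */
	raw_spin_unlock_rcu_node(parent);
}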
*/ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (gp_preinit_delay) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (gp_init_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (gp_cleanup_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU @@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -286,12 +313,14 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. 
*/ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. 
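The guarantees spelled out in the new call_rcu() kerneldoc are what make the usual publish-then-defer-free update pattern safe. A minimal sketch; the configuration structure, lock, and function names are illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_conf {
	int threshold;
	struct rcu_head rh;
};

static struct example_conf __rcu *example_cur;
static DEFINE_SPINLOCK(example_lock);

static void example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_conf, rh));
}

static void example_update(struct example_conf *newc)
{
	struct example_conf *oldc;

	spin_lock(&example_lock);
	oldc = rcu_dereference_protected(example_cur,
					 lockdep_is_held(&example_lock));
	rcu_assign_pointer(example_cur, newc);	/* publish the new version */
	spin_unlock(&example_lock);

	if (oldc)
		call_rcu(&oldc->rh, example_free_cb);	/* freed after all readers */
}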
For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { @@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } @@ -835,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. 
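The RCU_LOCKDEP_WARN(!irqs_disabled(), ...) calls added above make the interrupts-disabled requirement of these idle-path helpers self-checking under lockdep. The same idiom applied to a hypothetical helper:

#include <linux/rcupdate.h>
#include <linux/irqflags.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_idle_count);

/* Hypothetical idle-path helper that, like rcu_prepare_for_idle(), needs irqs off. */
static void example_prepare_for_idle(void)
{
	RCU_LOCKDEP_WARN(!irqs_disabled(),
			 "example_prepare_for_idle() invoked with irqs enabled!!!");
	__this_cpu_inc(example_idle_count);	/* per-CPU state, hence irqs off */
}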
*/ @@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2023,6 +2054,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. 
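When __call_rcu_nocb_enqueue() above cannot issue the leader wakeup immediately, it records the urgency in ->nocb_defer_wakeup using the RCU_NOCB_WAKE_* values and lets a later, safe context perform the wakeup; the real consumer, do_nocb_deferred_wakeup(), appears just below. A simplified stand-in for that consumer:

/* Simplified stand-in for do_nocb_deferred_wakeup(); not the kernel's code. */
static void example_consume_deferred_wakeup(struct rcu_data *rdp)
{
	int ndw = READ_ONCE(rdp->nocb_defer_wakeup);

	if (ndw == RCU_NOCB_WAKE_NOT)
		return;					/* nothing was deferred */
	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
	wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);	/* force only if urgent */
}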
*/ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); @@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). 
The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. 
- */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. 
- */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. 
- */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 6cea17a1ea30..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Read-Copy Update tracing for hierarchical implementation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2008 - * Author: Paul E. 
McKenney - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> -#include <linux/prefetch.h> - -#define RCU_TREE_NONCORE -#include "tree.h" -#include "rcu.h" - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d bseq: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->barrier_sequence); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), - rdp->core_needs_qs); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - rcu_dynticks_snap(rdp->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); - ql += rcu_segcblist_n_cbs(&rdp->cblist); - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], - ".R"[!rcu_segcblist_segempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)], - ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], - ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - int cpu; - struct rcu_state *rsp = (struct rcu_state *)m->private; - struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->exp_workdone0); - s1 += atomic_long_read(&rdp->exp_workdone1); - s2 += atomic_long_read(&rdp->exp_workdone2); - s3 += atomic_long_read(&rdp->exp_workdone3); - } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, - atomic_read(&rsp->expedited_need_qs), - rsp->expedited_sequence / 2); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - 
rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->gp_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), - rsp->orphan_done.len_lazy, - rsp->orphan_done.len); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - completed = READ_ONCE(rsp->completed); - gpnum = READ_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_core_needs_qs, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_nocb_defer_wakeup, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 273e869ca21d..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,7 +62,9 @@ #define MODULE_PARAM_PREFIX "rcupdate." 
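For context on the rcupdate hunk that follows: because this file defines MODULE_PARAM_PREFIX as "rcupdate.", the module_param() knobs touched here are exposed under that prefix, so they can be set at boot time with, for example, rcupdate.rcu_expedited=1 or rcupdate.rcu_normal_after_boot=1 on the kernel command line (parameter names taken from the hunk below; which of them exist depends on CONFIG_TINY_RCU).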
#ifndef CONFIG_TINY_RCU +extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); +extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); @@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; + int j; /* Initialize and register callbacks for each flavor specified. */ for (i = 0; i < n; i++) { @@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } init_rcu_head_on_stack(&rs_array[i].head); init_completion(&rs_array[i].completion); - (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); } /* Wait for all callbacks to be invoked. */ @@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, (crcu_array[i] == call_rcu || crcu_array[i] == call_rcu_bh)) continue; - wait_for_completion(&rs_array[i].completion); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + wait_for_completion(&rs_array[i].completion); destroy_rcu_head_on_stack(&rs_array[i].head); } } @@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; -/* - * Post an RCU-tasks callback. First call must be from process context - * after the scheduler if fully operational. +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { @@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. 
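An aside on the __wait_rcu_gp() hunks above: the added j-loops are a first-occurrence test, so that a grace-period flavor listed more than once in crcu_array[] has its callback posted, and waited for, only once; queueing the same rcu_head twice would corrupt the callback lists. A minimal standalone sketch of the idiom (not a kernel API, just the pattern):

	/* Non-zero if slot i is the first occurrence of fn[i] in fn[0..i]. */
	static int first_occurrence(void (*const fn[])(void), int i)
	{
		int j;

		for (j = 0; j < i; j++)
			if (fn[j] == fn[i])
				return 0;	/* an earlier slot already holds this function */
		return 1;
	}

__wait_rcu_gp() open-codes this test twice, once before posting the callback and once before waiting on the corresponding completion, so both sides skip exactly the same duplicate slots.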
+ */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PROVE_RCU /* @@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests); #else void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any significant non-default boot-time settings. + */ +void __init rcupdate_announce_bootup_oddness(void) +{ + if (rcu_normal) + pr_info("\tNo expedited grace period (rcu_normal).\n"); + else if (rcu_normal_after_boot) + pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); + else if (rcu_expedited) + pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); + if (rcu_cpu_stall_suppress) + pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); + if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) + pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); + rcu_tasks_bootup_oddness(); +} + +#endif /* #ifndef CONFIG_TINY_RCU */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..53f0164ed362 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-y += idle_task.o fair.o rt.o deadline.o +obj-y += wait.o wait_bit.o swait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/tick.h> +#include <linux/init.h> /* * Scheduler clock - returns current time in nanosec units. @@ -124,14 +125,27 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); + /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, @@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. 
+ * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -183,7 +211,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -197,7 +229,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account @@ -347,21 +382,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); + scd = this_scd(); + __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; + + if (!sched_clock_stable()) + return; + /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. + * Called under watchdog_lock. * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. 
*/ - scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); - - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* @@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); - touch_softlockup_watchdog_sched(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, if (!x->done) { DECLARE_WAITQUEUE(wait, current); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 326d4f88e2b1..04727421f56b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,7 @@ #include <uapi/linux/sched/types.h> #include <linux/sched/loadavg.h> #include <linux/sched/hotplug.h> +#include <linux/wait_bit.h> #include <linux/cpuset.h> #include <linux/delayacct.h> #include <linux/init_task.h> @@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - /* * __normal_prio - return the priority that is based on the static prio */ @@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. 
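Worked example for the clock-offset bookkeeping in the sched/clock.c hunks above (illustrative numbers only): suppose that at the unstable->stable switch the raw clock reads tick_raw = 1,000,000 ns while the GTOD-based tick reads tick_gtod = 1,500,000 ns and __gtod_offset is still 0. __set_sched_clock_stable() then computes __sched_clock_offset = (1,500,000 + 0) - 1,000,000 = 500,000 ns, so the stable clock value tick_raw + __sched_clock_offset carries on exactly where the unstable path left off. Later, whenever the watchdog re-confirms stability, sched_clock_tick_stable() refreshes __gtod_offset = (clock + __sched_clock_offset) - gtod, for instance (2,000,000 + 500,000) - 2,400,000 = 100,000 ns, which keeps the two views aligned in case the clock has to be marked unstable after all.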
+ */ + old_stop->sched_class = &rt_sched_class; + } +} + #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, @@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } @@ -2159,9 +2151,11 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; } /* @@ -2193,6 +2187,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2430,7 +2425,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; + return BW_UNIT; /* * Doing this here saves a lot of checks in all @@ -2440,7 +2435,7 @@ unsigned long to_ratio(u64 period, u64 runtime) if (period == 0) return 0; - return div64_u64(runtime << 20, period); + return div64_u64(runtime << BW_SHIFT, period); } #ifdef CONFIG_SMP @@ -2451,7 +2446,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->rd->dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; @@ -2469,7 +2464,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->dl.dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { return 1; } @@ -2482,9 +2477,6 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. - * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2509,15 +2501,29 @@ static int dl_overflow(struct task_struct *p, int policy, cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { - __dl_add(dl_b, new_bw); + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. 
+ */ + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ err = 0; } raw_spin_unlock(&dl_b->lock); @@ -3687,7 +3693,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { return try_to_wake_up(curr->private, mode, wake_flags); @@ -4026,26 +4032,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - - /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. - * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. - */ + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } /* @@ -4197,8 +4184,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { @@ -4211,7 +4198,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* @@ -5530,7 +5518,7 @@ int task_can_attach(struct task_struct *p, * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ - __dl_add(dl_b, p->dl.dl_bw); + __dl_add(dl_b, p->dl.dl_bw, cpus); } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); @@ -5874,15 +5862,9 @@ int sched_cpu_deactivate(unsigned int cpu) * users of this state to go away such that all new such users will * observe it. * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * * Do sync before park smpboot threads to take care the rcu boost case. 
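Worked example for the to_ratio()/dl_density changes above (illustrative parameters, with BW_SHIFT = 20 as implied by the old "1ULL << 20" constant that BW_UNIT replaces): a task with dl_runtime = 10 ms, dl_deadline = 40 ms and dl_period = 100 ms gets dl_bw = (10,000,000 << 20) / 100,000,000 ~= 0.10 * 2^20 = 104,857 and dl_density = (10,000,000 << 20) / 40,000,000 = 0.25 * 2^20 = 262,144, i.e. utilization and density stored as 20-bit fixed-point fractions of one CPU.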
*/ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + synchronize_rcu_mult(call_rcu, call_rcu_sched); if (!sched_smp_initialized) return 0; @@ -5958,7 +5940,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5968,7 +5949,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -5984,7 +5965,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -6000,7 +5980,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -6026,28 +6005,13 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); -#define WAIT_TABLE_BITS 8 -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - unsigned long val = (unsigned long)word << shift | bit; - - return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); -} -EXPORT_SYMBOL(bit_waitqueue); - void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; sched_clock_init(); - - for (i = 0; i < WAIT_TABLE_SIZE; i++) - init_waitqueue_head(bit_wait_table + i); + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); @@ -6199,7 +6163,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); @@ -6251,8 +6214,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; @@ -6777,6 +6742,19 @@ static int sched_dl_global_validate(void) return ret; } +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); + } +} + static void sched_dl_do_global(void) { u64 new_bw = -1; @@ -6802,6 +6780,7 @@ static void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..e12f85975857 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,222 @@ static inline 
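Worked example for init_dl_rq_bw_ratio() above (assuming the usual defaults of sched_rt_runtime_us = 950000 and sched_rt_period_us = 1000000, i.e. Umax = 0.95, and the 2^20/2^8 scaling described in the GRUB comment further down): bw_ratio = to_ratio(950000, 1000000) >> (20 - 8) = ((1,000,000 << 20) / 950,000) >> 12 ~= (1 / 0.95) * 2^8 ~= 269, and extra_bw = to_ratio(1000000, 950000) = (950,000 << 20) / 1,000,000 ~= 0.95 * 2^20 ~= 996,147. In other words, bw_ratio caches 1/Umax in 8-bit fixed point, while extra_bw starts out at Umax in 20-bit fixed point.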
int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static inline +void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw += dl_bw; + SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +static inline +void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + if (dl_rq->running_bw > old) + dl_rq->running_bw = 0; +} + +static inline +void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw += dl_bw; + SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ +} + +static inline +void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + if (dl_rq->this_bw > old) + dl_rq->this_bw = 0; + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + struct rq *rq; + + if (task_on_rq_queued(p)) + return; + + rq = task_rq(p); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(new_bw, &rq->dl); +} + +/* + * The utilization of a task cannot be immediately removed from + * the rq active utilization (running_bw) when the task blocks. + * Instead, we have to wait for the so called "0-lag time". + * + * If a task blocks before the "0-lag time", a timer (the inactive + * timer) is armed, and running_bw is decreased when the timer + * fires. + * + * If the task wakes up again before the inactive timer fires, + * the timer is cancelled, whereas if the task wakes up after the + * inactive timer fired (and running_bw has been decreased) the + * task's utilization has to be added to running_bw again. + * A flag in the deadline scheduling entity (dl_non_contending) + * is used to avoid race conditions between the inactive timer handler + * and task wakeups. + * + * The following diagram shows how running_bw is updated. A task is + * "ACTIVE" when its utilization contributes to running_bw; an + * "ACTIVE contending" task is in the TASK_RUNNING state, while an + * "ACTIVE non contending" task is a blocked task for which the "0-lag time" + * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" + * time already passed, which does not contribute to running_bw anymore. 
+ * +------------------+ + * wakeup | ACTIVE | + * +------------------>+ contending | + * | add_running_bw | | + * | +----+------+------+ + * | | ^ + * | dequeue | | + * +--------+-------+ | | + * | | t >= 0-lag | | wakeup + * | INACTIVE |<---------------+ | + * | | sub_running_bw | | + * +--------+-------+ | | + * ^ | | + * | t < 0-lag | | + * | | | + * | V | + * | +----+------+------+ + * | sub_running_bw | ACTIVE | + * +-------------------+ | + * inactive timer | non contending | + * fired +------------------+ + * + * The task_non_contending() function is invoked when a task + * blocks, and checks if the 0-lag time already passed or + * not (in the first case, it directly updates running_bw; + * in the second case, it arms the inactive timer). + * + * The task_contending() function is invoked when a task wakes + * up, and checks if the task is still in the "ACTIVE non contending" + * state or not (in the second case, it updates running_bw). + */ +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_clear_params(p); + raw_spin_unlock(&dl_b->lock); + } + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se, int flags) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (flags & ENQUEUE_MIGRATED) + add_rq_bw(dl_se->dl_bw, dl_rq); + + if (dl_se->dl_non_contending) { + dl_se->dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. 
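Worked example for the 0-lag computation in task_non_contending() above (illustrative numbers): a task with dl_runtime = 10 ms and dl_period = 100 ms blocks with runtime = 2 ms left and an absolute deadline at t = 50 ms. Its 0-lag time is deadline - runtime * dl_period / dl_runtime = 50 ms - (2 * 100 / 10) ms = 30 ms. If rq_clock() currently reads 25 ms, zerolag_time ends up at +5 ms, so the inactive timer is armed for 5 ms and running_bw keeps the task's utilization until it fires; if rq_clock() already reads 35 ms, zerolag_time is negative and running_bw is decreased right away, with no timer.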
+ */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -83,6 +299,10 @@ void init_dl_rq(struct dl_rq *dl_rq) #else init_dl_bw(&dl_rq->dl_bw); #endif + + dl_rq->running_bw = 0; + dl_rq->this_bw = 0; + init_dl_rq_bw_ratio(dl_rq); } #ifdef CONFIG_SMP @@ -484,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. + * + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. 
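Before the constrained-deadline discussion continues below, a worked example for update_dl_revised_wakeup() above (illustrative numbers): a constrained task with dl_runtime = 5 ms and dl_deadline = 10 ms has dl_density = 0.5 in fixed point. If it wakes up 4 ms before its still-pending absolute deadline (laxity = 4 ms) and the overflow check fires, its remaining runtime is clamped to (dl_density * laxity) >> BW_SHIFT = 0.5 * 4 ms = 2 ms, so it can never consume more than the runtime/deadline density it was admitted with.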
+ * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -500,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -593,10 +892,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -723,6 +1020,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } @@ -735,6 +1034,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* + * This function implements the GRUB accounting rule: + * according to the GRUB reclaiming algorithm, the runtime is + * not decreased as "dq = -dt", but as + * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * where u is the utilization of the task, Umax is the maximum reclaimable + * utilization, Uinact is the (per-runqueue) inactive utilization, computed + * as the difference between the "total runqueue utilization" and the + * runqueue active utilization, and Uextra is the (per runqueue) extra + * reclaimable utilization. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations + * multiplied by 2^BW_SHIFT, the result has to be shifted right by + * BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. 
+ */ +u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +{ + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ + u64 u_act; + u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + + /* + * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, + * we compare u_inact + rq->dl.extra_bw with + * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because + * u_inact + rq->dl.extra_bw can be larger than + * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative + * leading to wrong results) + */ + if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) + u_act = u_act_min; + else + u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + + return (delta * u_act) >> BW_SHIFT; +} + +/* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). */ @@ -776,6 +1116,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; throttle: @@ -815,6 +1157,56 @@ throttle: } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -946,10 +1338,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) { + task_contending(dl_se, flags); update_dl_entity(dl_se, pi_se); - else if (flags & ENQUEUE_REPLENISH) + } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } __enqueue_dl_entity(dl_se); } @@ -959,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -995,17 +1384,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. 
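Worked example for grub_reclaim() above (illustrative numbers, using the 2^20/2^8 scaling spelled out in its comment): on a runqueue with this_bw = 0.6, running_bw = 0.4 and extra_bw = 0.05, u_inact = 0.2; for a task with dl_bw = 0.25 and 1/Umax = 1/0.95, u_act_min ~= 0.263. Since u_inact + extra_bw = 0.25 does not exceed 1 - u_act_min ~= 0.737, the task is charged u_act = 1 - 0.2 - 0.05 = 0.75 of the elapsed time, i.e. for delta = 1 ms of wall-clock execution only 0.75 ms is subtracted from its runtime. This scaling is applied by update_curr_dl() further down, but only for tasks that set SCHED_FLAG_RECLAIM.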
*/ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + add_running_bw(p->dl.dl_bw, &rq->dl); + } + /* - * If p is throttled, we do nothing. In fact, if it exhausted + * If p is throttled, we do not enqueue it. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl, flags); + return; + } enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1023,6 +1427,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); + + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(p->dl.dl_bw, &rq->dl); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(p); } /* @@ -1100,6 +1521,37 @@ out: return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + struct rq *rq; + + if (p->state != TASK_WAKING) + return; + + rq = task_rq(p); + /* + * Since p->state == TASK_WAKING, set_task_cpu() has been called + * from try_to_wake_up(). Hence, p->pi_lock is locked, but + * rq->lock is not... So, lock it + */ + raw_spin_lock(&rq->lock); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_unlock(&rq->lock); +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1255,19 +1707,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void task_dead_dl(struct task_struct *p) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - /* - * Since we are TASK_DEAD we won't slip out of the domain! - */ - raw_spin_lock_irq(&dl_b->lock); - /* XXX we should retain the bw until 0-lag */ - dl_b->total_bw -= p->dl.dl_bw; - raw_spin_unlock_irq(&dl_b->lock); -} - static void set_curr_task_dl(struct rq *rq) { struct task_struct *p = rq->curr; @@ -1533,7 +1972,7 @@ retry: * then possible that next_task has migrated. 
*/ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. @@ -1551,7 +1990,11 @@ retry: } deactivate_task(rq, next_task, 0); + sub_running_bw(next_task->dl.dl_bw, &rq->dl); + sub_rq_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); + add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1639,7 +2082,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + sub_running_bw(p->dl.dl_bw, &src_rq->dl); + sub_rq_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_rq_bw(p->dl.dl_bw, &this_rq->dl); + add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1695,7 +2142,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw); + __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -1737,13 +2184,26 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + if (!task_on_rq_queued(p)) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1762,11 +2222,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. 
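The migration hunks above all follow the same pattern: whenever a deadline task changes runqueue, its dl_bw is explicitly moved from the source rq's accounting to the destination's, preserving the invariant (checked by the SCHED_WARN_ON()s added earlier in deadline.c) that running_bw never exceeds this_bw. A distilled fragment of that pattern, with src_rq/dst_rq as illustrative names for the runqueues the actual hunks call rq/later_rq and src_rq/this_rq:

	deactivate_task(src_rq, p, 0);
	sub_running_bw(p->dl.dl_bw, &src_rq->dl);
	sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
	set_task_cpu(p, dst_rq->cpu);
	add_rq_bw(p->dl.dl_bw, &dst_rq->dl);
	add_running_bw(p->dl.dl_bw, &dst_rq->dl);
	activate_task(dst_rq, p, 0);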
*/ - if (!task_on_rq_queued(p)) - return; + if (!task_on_rq_queued(p)) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + return; + } /* * If p is boosted we already updated its params in * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), @@ -1836,6 +2300,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, @@ -1845,7 +2310,6 @@ const struct sched_class dl_sched_class = { .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, - .task_dead = task_dead_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c77e4b1d51c0..694c258b8771 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); @@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } @@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. 
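A note on the ___update_load_avg() hunk above: the divisor changes from the constant LOAD_AVG_MAX to LOAD_AVG_MAX - 1024 + sa->period_contrib, which appears to account for only period_contrib out of the current 1024-unit PELT period having been folded into the sums so far; dividing by the full constant slightly under-estimates the resulting averages. As a rough illustration, with period_contrib = 512 the divisor becomes LOAD_AVG_MAX - 512 instead of LOAD_AVG_MAX.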
+ */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. 
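A note on the for_each_cpu_wrap() removal above: the local helper and its extra 'wrap' cookie go away, and the call sites further down switch to a three-argument for_each_cpu_wrap(cpu, mask, start), presumably now provided by generic cpumask code; the behaviour described in the removed comment (start the scan at @start and wrap around, so that concurrent scanners do not all converge on the same idle CPU) is unchanged. A fragment-level sketch mirroring the select_idle_core()/select_idle_cpu() call sites, with 'cpus' and 'target' as in those call sites:

	int cpu;

	for_each_cpu_wrap(cpu, cpus, target) {
		if (idle_cpu(cpu))
			break;	/* first idle CPU at or after 'target', wrapping around */
	}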
- */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5721,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5809,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. 
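The cpumask_next_wrap()/for_each_cpu_wrap() pair removed here is replaced by a generic variant: the callers below keep using for_each_cpu_wrap(), now with only three arguments and no local 'wrap' variable. The point of the construct, start the scan at an arbitrary CPU and wrap around exactly once so that concurrent wakeups do not all converge on the same low-numbered CPUs, can be sketched in user space over a plain bitmask; the helper below is an illustration of the idea, not the kernel implementation:

  /* wrap_scan.c - user-space sketch of a wrap-around bitmask scan. */
  #include <stdio.h>

  #define NBITS 16

  /* Return the next set bit after 'n', scanning [start..NBITS) first and
   * then [0..start), or -1 once the full mask has been visited. */
  static int next_wrap(unsigned int mask, int n, int start, int *wrapped)
  {
          for (;;) {
                  int next = n + 1;

                  while (next < NBITS && !(mask & (1u << next)))
                          next++;

                  if (*wrapped)
                          return next >= start ? -1 : next;

                  if (next < NBITS)
                          return next;

                  *wrapped = 1;   /* fall back to the beginning of the mask */
                  n = -1;
          }
  }

  int main(void)
  {
          unsigned int mask = 0xB6B6;     /* arbitrary set of "CPUs" */
          int start = 7, wrapped = 0;

          for (int cpu = next_wrap(mask, start - 1, start, &wrapped);
               cpu >= 0;
               cpu = next_wrap(mask, cpu, start, &wrapped))
                  printf("visit cpu %d\n", cpu);
          return 0;
  }

Running it visits every set bit exactly once, beginning at 'start' and only then wrapping back to the low bits.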
*/ - if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) @@ -6168,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6177,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) @@ -6970,10 +6972,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6983,7 +7003,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6997,6 +7017,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -7229,7 +7256,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. 
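The new SIS_PROP branch above bounds the idle-CPU scan by how long this CPU has recently been idle: the budget is the domain weight times avg_idle/512, divided by the average cost of scanning one CPU, with a floor of four CPUs, and the loop below it gives up once that budget is spent. Just the arithmetic, as a self-contained sketch (the 512 divisor, the +1 on avg_cost and the floor of 4 are read off the hunk above; the sample numbers are made up):

  /* sis_prop.c - sketch of the proportional scan budget in select_idle_cpu(). */
  #include <stdio.h>
  #include <stdint.h>
  #include <limits.h>

  static int scan_budget(uint64_t avg_idle_ns, uint64_t avg_scan_cost_ns,
                         unsigned int span_weight, int sis_prop)
  {
          uint64_t avg_idle = avg_idle_ns / 512;       /* large fuzz factor */
          uint64_t avg_cost = avg_scan_cost_ns + 1;    /* avoid division by 0 */
          int nr = INT_MAX;

          if (sis_prop) {
                  uint64_t span_avg = (uint64_t)span_weight * avg_idle;

                  if (span_avg > 4 * avg_cost)
                          nr = (int)(span_avg / avg_cost);
                  else
                          nr = 4;                      /* always look at a few */
          }
          return nr;
  }

  int main(void)
  {
          /* A recently idle LLC domain of 16 CPUs vs. a saturated one. */
          printf("idle rq:      scan up to %d CPUs\n",
                 scan_budget(2000000, 4000, 16, 1));
          printf("saturated rq: scan up to %d CPUs\n",
                 scan_budget(10000, 4000, 16, 1));
          return 0;
  }

With these made-up figures a recently idle runqueue is allowed to probe 15 of the 16 CPUs, while a saturated one falls back to the minimum of 4.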
*/ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7408,7 +7435,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7572,7 +7599,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7927,7 +7954,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8033,7 +8060,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* @@ -8043,11 +8069,9 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; @@ -8083,7 +8107,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -8659,6 +8683,10 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!is_housekeeping_cpu(cpu)) + return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -9523,10 +9551,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls @@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ef63adce0c9c..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void do_idle(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * load-average relies on per-cpu sampling from the tick, it is affected by * NO_HZ. * - * The basic idea is to fold the nr_active delta into a global idle-delta upon + * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * entering NO_HZ state such that we can include this as an 'extra' cpu delta * when we read the global state. * @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * - When we go NO_HZ idle during the window, we can negate our sample * contribution, causing under-accounting. * - * We avoid this by keeping two idle-delta counters and flipping them + * We avoid this by keeping two NO_HZ-delta counters and flipping them * when the window starts, thus separating old and new NO_HZ load. * * The only trick is the slight shift in index flip for read vs write. @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * r:0 0 1 1 0 0 1 1 0 * w:0 1 1 0 0 1 1 0 0 * - * This ensures we'll fold the old idle contribution in this window while + * This ensures we'll fold the old NO_HZ contribution in this window while * accumlating the new one. * - * - When we wake up from NO_HZ idle during the window, we push up our + * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which + * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. + * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_idle[2]; +static atomic_long_t calc_load_nohz[2]; static int calc_load_idx; static inline int calc_load_write_idx(void) @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) /* * If the folding window started, make sure we start writing in the - * next idle-delta. + * next NO_HZ-delta. */ if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_enter_idle(void) +void calc_load_nohz_start(void) { struct rq *this_rq = this_rq(); long delta; /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. 
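The calc_load_nohz[] rename keeps the underlying trick intact: two delta buckets plus an index that flips once per sample window, so a CPU that enters NO_HZ after the window has opened writes into the slot that the next sample will read. A toy user-space model of just that indexing (the jiffies/LOAD_FREQ window test is reduced to a flag, and there is no atomicity here):

  /* nohz_fold.c - toy model of the two-bucket NO_HZ delta folding. */
  #include <stdio.h>

  static long nohz_delta[2];
  static int  load_idx;          /* flipped once per sample window   */
  static int  window_open;       /* stands in for the jiffies check  */

  static int write_idx(void) { return (load_idx + window_open) & 1; }
  static int read_idx(void)  { return load_idx & 1; }

  static void cpu_enters_nohz(long pending)   /* calc_load_nohz_start() */
  {
          nohz_delta[write_idx()] += pending;
  }

  static long fold_nohz(void)                 /* calc_load_nohz_fold()  */
  {
          long delta = nohz_delta[read_idx()];

          nohz_delta[read_idx()] = 0;
          return delta;
  }

  int main(void)
  {
          cpu_enters_nohz(+2);            /* before the window opens   */
          window_open = 1;                /* sample window has started */
          cpu_enters_nohz(-1);            /* lands in the *next* slot  */

          printf("folded now:         %ld\n", fold_nohz());  /* sees +2 */
          load_idx++;                     /* flip at window end        */
          window_open = 0;
          printf("folded next window: %ld\n", fold_nohz());  /* sees -1 */
          return 0;
  }

The delta folded before the flip only contains what was queued before the window opened, which is exactly the property the read/write index table above describes.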
+ * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. */ delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); + atomic_long_add(delta, &calc_load_nohz[idx]); } } -void calc_load_exit_idle(void) +void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_fold_idle(void) +static long calc_load_nohz_fold(void) { int idx = calc_load_read_idx(); long delta = 0; - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); + if (atomic_long_read(&calc_load_nohz[idx])) + delta = atomic_long_xchg(&calc_load_nohz[idx], 0); return delta; } @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, /* * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. + * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into + * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold + * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. @@ -330,7 +330,7 @@ static void calc_global_nohz(void) } /* - * Flip the idle index... + * Flip the NO_HZ index... * * Make sure we first write the new time then flip the index, so that * calc_load_write_idx() will see the new time when it reads the new @@ -341,7 +341,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_fold_idle(void) { return 0; } +static inline long calc_load_nohz_fold(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. */ - delta = calc_load_fold_idle(); + delta = calc_load_nohz_fold(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + * In case we went to NO_HZ for multiple LOAD_FREQ intervals + * catch up in bulk. */ calc_global_nohz(); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..581d5c7a5264 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { @@ -1819,7 +1830,7 @@ retry: * pushing. 
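For context, the numbers all of this folding feeds are ordinary fixed-point exponential moving averages, updated roughly every 5 seconds (LOAD_FREQ). A stand-alone calc_load() with the customary 11-bit fixed point and 1/5/15-minute decay constants; the EXP_* values below are the well-known ones from the scheduler headers and should be read as assumptions of this sketch rather than as part of the diff:

  /* loadavg_ema.c - fixed-point EMA as used for the 1/5/15 minute loads. */
  #include <stdio.h>

  #define FSHIFT  11                      /* bits of fixed-point precision */
  #define FIXED_1 (1 << FSHIFT)
  #define EXP_1   1884                    /* 1/exp(5s/1min)  in fixed point */
  #define EXP_5   2014                    /* 1/exp(5s/5min)                 */
  #define EXP_15  2037                    /* 1/exp(5s/15min)                */

  static unsigned long calc_load(unsigned long load, unsigned long exp,
                                 unsigned long active)
  {
          unsigned long newload = load * exp + active * (FIXED_1 - exp);

          /* Round towards the new sample, as the kernel version does. */
          if (active >= load)
                  newload += FIXED_1 - 1;
          return newload / FIXED_1;
  }

  int main(void)
  {
          unsigned long avenrun[3] = { 0, 0, 0 };
          unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks */

          for (int tick = 0; tick < 24; tick++) { /* 24 x 5s = 2 minutes */
                  avenrun[0] = calc_load(avenrun[0], EXP_1,  active);
                  avenrun[1] = calc_load(avenrun[1], EXP_5,  active);
                  avenrun[2] = calc_load(avenrun[2], EXP_15, active);
          }
          printf("loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
                 avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                 avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                 avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
          return 0;
  }

calc_load_n(), referenced in the hunk above, is the same update applied n times at once (the decay factor raised to the n-th power), which is what the bulk catch-up after a long NO_HZ period relies on.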
*/ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..e0329d10bdb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -39,9 +39,9 @@ #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void) } extern struct dl_bw *dl_bw_of(int i); +extern int dl_bw_cpus(int i); struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -244,6 +249,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -558,6 +564,30 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. 
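The new per-runqueue running_bw/this_bw/extra_bw fields hold bandwidth in the same fixed-point format as the admission-control code: runtime/period scaled by 2^BW_SHIFT, with BW_SHIFT = 20 defined a little further down in this patch. A sketch of how a task's bandwidth is computed and how the reworked __dl_add() spreads its per-CPU share across a root domain; the structure and helper names below are illustrative only:

  /* dl_bw.c - sketch of deadline-bandwidth bookkeeping in BW_SHIFT fixed point. */
  #include <stdio.h>
  #include <stdint.h>

  #define BW_SHIFT 20
  #define BW_UNIT  (1ULL << BW_SHIFT)

  /* runtime/period as a BW_SHIFT fixed-point fraction, like to_ratio(). */
  static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
  {
          return (runtime_ns << BW_SHIFT) / period_ns;
  }

  struct rq_dl { int64_t extra_bw; };

  /* Mirror of the new __dl_add(): account the task globally and subtract
   * its per-CPU share from every runqueue's reclaimable (extra) bandwidth. */
  static void dl_add(uint64_t *total_bw, struct rq_dl *rqs, int cpus, uint64_t tsk_bw)
  {
          *total_bw += tsk_bw;
          for (int i = 0; i < cpus; i++)
                  rqs[i].extra_bw -= (int64_t)(tsk_bw / cpus);
  }

  int main(void)
  {
          struct rq_dl rqs[4] = { { BW_UNIT }, { BW_UNIT }, { BW_UNIT }, { BW_UNIT } };
          uint64_t total_bw = 0;

          /* A 10ms/100ms deadline task admitted on a 4-CPU root domain. */
          uint64_t tsk_bw = to_ratio(100000000ULL, 10000000ULL);

          dl_add(&total_bw, rqs, 4, tsk_bw);
          printf("task bw     = %llu/%llu of a CPU\n",
                 (unsigned long long)tsk_bw, (unsigned long long)BW_UNIT);
          printf("extra_bw[0] = %lld (per-CPU reclaimable bandwidth left)\n",
                 (long long)rqs[0].extra_bw);
          return 0;
  }

__dl_clear() is the mirror image: it subtracts the task from total_bw and hands tsk_bw/cpus back to every runqueue's extra_bw, which feeds the GRUB reclaiming mentioned in the comment above.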
+ */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -606,11 +636,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -1025,7 +1053,11 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - unsigned long cpumask[0]; /* iteration mask */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1046,16 +1078,15 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } @@ -1066,7 +1097,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); @@ -1422,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1486,7 +1521,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1928,6 +1968,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..79895aec281e 100644 --- 
a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_span(group))); + + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { + printk(KERN_CONT " mask=%*pbl", + cpumask_pr_args(group_balance_mask(group))); + } + + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); + + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_span(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); @@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -477,46 +496,214 @@ enum s_alloc { }; /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the balance mask. * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. + * The balance mask are all those CPUs that could actually end up at this + * group. 
See build_balance_mask(). * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. + * Also see should_we_balance(). */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +int group_balance_cpu(struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + return cpumask_first(group_balance_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group balance mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + +/* + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. 
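The hop-level construction described above is easy to recompute for the 4-node ring: for every node, the NUMA-l span is simply the set of nodes whose distance does not exceed the l-th entry of the sorted distance list. A purely illustrative sketch that reproduces the NUMA-0/1/2 rows from the distance table:

  /* numa_levels.c - recompute the per-node spans for the 4-node ring above. */
  #include <stdio.h>

  #define NR_NODES 4

  static const int dist[NR_NODES][NR_NODES] = {
          { 10, 20, 30, 20 },
          { 20, 10, 20, 30 },
          { 30, 20, 10, 20 },
          { 20, 30, 20, 10 },
  };

  int main(void)
  {
          const int levels[] = { 10, 20, 30 };    /* unique distances, sorted */

          for (int l = 0; l < 3; l++) {
                  printf("NUMA-%d:", l);
                  for (int n = 0; n < NR_NODES; n++) {
                          unsigned int span = 0;

                          /* Nodes reachable from n within levels[l] distance. */
                          for (int m = 0; m < NR_NODES; m++)
                                  if (dist[n][m] <= levels[l])
                                          span |= 1u << m;

                          printf("  node%d:%#x", n, span);
                  }
                  printf("\n");
          }
          return 0;
  }

The NUMA-1 line it prints (0xb, 0x7, 0xe, 0xd) is the 0-1,3 / 0-2 / 1-3 / 0,2-3 row quoted in the comment.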
+ * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). + */ +static void +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +{ + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + cpumask_clear(mask); + + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) + continue; + + cpumask_set_cpu(i, mask); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(mask)); } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_span(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg) +{ + struct cpumask *mask = sched_domains_tmpmask2; + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + int cpu; + + build_balance_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + cpumask_copy(group_balance_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. 
+ */ + sg_span = sched_group_span(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) @@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. + */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; @@ -589,23 +759,106 @@ fail: return -ENOMEM; } -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. 
+ * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. 
+ */ + +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_span(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } @@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) @@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; @@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. 
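The rewritten build_sched_groups() walks the domain span starting at the local CPU (for_each_cpu_wrap), takes the child-domain span of each not-yet-covered CPU as one group, and links the groups into a circular list with the local group first, which is what the new "sd->groups = first" relies on. The skeleton below reproduces that shape for the MC level of the 8-CPU example above, with bitmasks standing in for cpumasks; it is a simplification, not the kernel code:

  /* group_ring.c - simplified shape of build_sched_groups() for one domain. */
  #include <stdio.h>

  #define NR_CPUS 8

  struct group { unsigned int span; struct group *next; };

  static struct group pool[NR_CPUS];

  /* Build the circular group list for a domain spanning 'span', as seen
   * from 'cpu'. child_span[i] is the span of CPU i's child domain. */
  static struct group *build_groups(unsigned int span, int cpu,
                                    const unsigned int child_span[NR_CPUS])
  {
          struct group *first = NULL, *last = NULL;
          unsigned int covered = 0;
          int n = 0;

          for (int k = 0; k < NR_CPUS; k++) {
                  int i = (cpu + k) % NR_CPUS;            /* wrap-around walk */

                  if (!(span & (1u << i)) || (covered & (1u << i)))
                          continue;

                  struct group *sg = &pool[n++];

                  sg->span = child_span[i];
                  covered |= sg->span;

                  if (!first)
                          first = sg;
                  if (last)
                          last->next = sg;
                  last = sg;
          }
          if (last)
                  last->next = first;                     /* close the ring */
          return first;
  }

  int main(void)
  {
          /* SMT pairs as child domains of the MC level: {0,1} {2,3} {4,5} {6,7} */
          const unsigned int smt[NR_CPUS] = { 0x03, 0x03, 0x0c, 0x0c,
                                              0x30, 0x30, 0xc0, 0xc0 };
          struct group *g, *start;

          start = g = build_groups(0x0f /* MC span of CPU 2: 0-3 */, 2, smt);
          do {
                  printf("group span %#x\n", g->span);
                  g = g->next;
          } while (g != start);
          return 0;
  }

Starting the walk at the requesting CPU is what guarantees that the first group of each domain is the one containing that CPU, its child domain, per the [*] note above.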
*/ -int init_sched_domains(const struct cpumask *cpu_map) +int sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); + zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..17f11c6b0a9f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -12,44 +12,44 @@ #include <linux/hash.h> #include <linux/kthread.h> -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); + spin_lock_init(&wq_head->lock); + lockdep_set_class_and_name(&wq_head->lock, key, name); + INIT_LIST_HEAD(&wq_head->head); } EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __remove_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { - wait_queue_t *curr, *next; + wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, /** * __wake_up - wake up threads blocked on a waitqueue. 
- * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function @@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets @@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!q)) + if (unlikely(!wq_head)) return; if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) { - __wake_up_sync_key(q, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -170,48 +170,48 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). 
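Behind all the renaming, __wake_up_common() still walks the list and stops early once it has woken the requested number of exclusive waiters; non-exclusive entries are always woken. A user-space model of just that walk (the flag value and the early-exit rule mirror the code above; locking, list removal and the scheduler wakeup itself are left out):

  /* wake_common.c - user-space model of the __wake_up_common() walk. */
  #include <stdio.h>

  #define WQ_FLAG_EXCLUSIVE 0x01

  struct wq_entry {
          unsigned int flags;
          const char *name;
          int (*func)(struct wq_entry *e);        /* returns 1 if it woke someone */
          struct wq_entry *next;
  };

  static int wake_one(struct wq_entry *e)
  {
          printf("waking %s\n", e->name);
          return 1;
  }

  /* Wake everything on the list, but at most nr_exclusive exclusive waiters. */
  static void wake_up_common(struct wq_entry *head, int nr_exclusive)
  {
          for (struct wq_entry *curr = head; curr; curr = curr->next) {
                  unsigned int flags = curr->flags;

                  if (curr->func(curr) &&
                      (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                          break;
          }
  }

  int main(void)
  {
          struct wq_entry c = { WQ_FLAG_EXCLUSIVE, "excl-B", wake_one, NULL };
          struct wq_entry b = { WQ_FLAG_EXCLUSIVE, "excl-A", wake_one, &c };
          struct wq_entry a = { 0,                 "non-exclusive", wake_one, &b };

          /* Non-exclusive waiters are always woken; only one exclusive one is. */
          wake_up_common(&a, 1);
          return 0;
  }

This is why callers can mix exclusive and non-exclusive waiters on one wait_queue_head and still get wake-one semantics for the exclusive ones.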
*/ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_t *wait, int flags) +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { - wait->flags = flags; - wait->private = current; - wait->func = autoremove_wake_function; - INIT_LIST_HEAD(&wait->task_list); + wq_entry->flags = flags; + wq_entry->private = current; + wq_entry->func = autoremove_wake_function; + INIT_LIST_HEAD(&wq_entry->entry); } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* * Exclusive waiter must not fail if it was selected by wakeup, @@ -219,24 +219,24 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) * * The caller will recheck the condition and return success if * we were already woken up, we can not miss the event because - * wakeup locks/unlocks the same q->lock. + * wakeup locks/unlocks the same wq_head->lock. * * But we need to ensure that set-condition + wakeup after that * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wait->task_list); + list_del_init(&wq_entry->entry); ret = -ERESTARTSYS; } else { - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); + if (list_empty(&wq_entry->entry)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); else - __add_wait_queue(q, wait); + __add_wait_queue(wq_head, wq_entry); } set_current_state(state); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); return ret; } @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); * condition in the caller before they add the wait * entry to the wake queue. 
*/ -int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + if (likely(list_empty(&wait->entry))) + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) } EXPORT_SYMBOL(do_wait_intr); -int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + if (likely(list_empty(&wait->entry))) + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor + * @wq_head: waitqueue waited on + * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) * have _one_ other CPU that looks at or modifies * the list). */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); + if (!list_empty_careful(&wq_entry->entry)) { + spin_lock_irqsave(&wq_head->lock, flags); + list_del_init(&wq_entry->entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - int ret = default_wake_function(wait, mode, sync, key); + int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wait->task_list); + list_del_init(&wq_entry->entry); return ret; } EXPORT_SYMBOL(autoremove_wake_function); @@ -334,24 +334,24 @@ static inline bool is_kthread_should_stop(void) /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * - * add_wait_queue(&wq, &wait); + * add_wait_queue(&wq_head, &wait); * for (;;) { * if (condition) * break; * * p->state = mode; condition = true; * smp_mb(); // A smp_wmb(); // C - * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; * schedule() try_to_wake_up(); * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; * smp_mb() // B smp_wmb(); // C - * wait->flags |= WQ_FLAG_WOKEN; + * wq_entry->flags |= WQ_FLAG_WOKEN; * } - * remove_wait_queue(&wq, &wait); + * remove_wait_queue(&wq_head, &wait); * */ -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -359,7 +359,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must * also observe all state before the wakeup. 
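The A/B/C barrier diagram above can be exercised with C11 atomics; the sketch below keeps only the essence of the protocol (the waker publishes the condition before setting WQ_FLAG_WOKEN, the waiter clears the flag only after acting on the condition), with seq_cst atomics standing in for smp_mb()/smp_wmb()/smp_store_mb() and a spin loop standing in for schedule_timeout(). It is a model of the ordering argument, not of the kernel API:

  /* woken_flag.c - user-space model of the wait_woken()/woken_wake_function()
   * WQ_FLAG_WOKEN handshake. C11 seq_cst atomics stand in for the kernel's
   * explicit barriers, and spinning stands in for schedule(). */
  #include <stdatomic.h>
  #include <pthread.h>
  #include <stdio.h>

  static atomic_int woken;        /* models wq_entry->flags & WQ_FLAG_WOKEN */
  static atomic_int condition;    /* the data the waiter is actually after  */

  static void *waker(void *arg)
  {
          (void)arg;
          atomic_store(&condition, 1);    /* C: publish the condition first */
          atomic_store(&woken, 1);        /* ... then set WQ_FLAG_WOKEN     */
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, waker, NULL);

          /* wait_woken(): sleep only if WQ_FLAG_WOKEN is not yet set ... */
          while (!atomic_load(&woken))
                  ;                               /* schedule_timeout()     */

          /* ... and because the flag was set after the condition, the
           * condition is guaranteed to be visible here. */
          printf("condition = %d\n", atomic_load(&condition));

          /* B: consume the wakeup before the next loop iteration would sleep. */
          atomic_store(&woken, 0);

          pthread_join(t, NULL);
          return 0;
  }

Compile with -pthread; the point is that observing woken == 1 guarantees the waiter also observes condition == 1, which is the same claim the comment makes about WQ_FLAG_WOKEN.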
*/ - if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); @@ -369,13 +369,13 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -385,267 +385,8 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) * and is paired with smp_store_mb() in wait_woken(). */ smp_wmb(); /* C */ - wait->flags |= WQ_FLAG_WOKEN; + wq_entry->flags |= WQ_FLAG_WOKEN; - return default_wake_function(wait, mode, sync, key); + return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key, mode); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, - unsigned mode, unsigned long timeout) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - for (;;) { - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - ret = action(&q->key, mode); - /* - * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wq->lock, - * but test_and_set_bit() implies mb() which pairs with - * smp_mb__after_atomic() before wake_up_page(). 
- */ - if (ret) - finish_wait(wq, &q->wait); - } - if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { - if (!ret) - finish_wait(wq, &q->wait); - return 0; - } else if (ret) { - return ret; - } - } -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. 
- */ -static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - val = q->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - wait_queue_head_t *wq = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); - - return __wait_on_atomic_t(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); - -__sched int bit_wait(struct wait_bit_key *word, int mode) -{ - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait); - -__sched int bit_wait_io(struct wait_bit_key *word, int mode) -{ - io_schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait_io); - -__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_timeout); - -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c new file mode 100644 index 000000000000..f8159698aa4d --- /dev/null +++ b/kernel/sched/wait_bit.c @@ -0,0 +1,286 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include <linux/wait_bit.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> +#include <linux/hash.h> + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 
5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wq_entry, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + +int __sched +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + for (;;) { + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wwq_head->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). 
+ */ + if (ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + } + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + if (!ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + return 0; + } else if (ret) { + return ret; + } + } +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. 
+ */ +static __sched +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wq_entry = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); + + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word, int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word, int mode) +{ + io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} diff --git a/kernel/signal.c b/kernel/signal.c index 9b28954e058f..35a570f71f07 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -39,6 +39,7 @@ #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> +#include <linux/posix-timers.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -637,7 +638,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * about to disable them again anyway. 
*/ spin_unlock(&tsk->sighand->siglock); - do_schedule_next_timer(info); + posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); } #endif diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -30,6 +30,7 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); + return -ENOMEM; + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } @@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask, cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, cfd->cpumask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ if (unlikely(!cpumask_weight(cfd->cpumask))) return; + cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; - llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { for_each_cpu(cpu, cfd->cpumask) { diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp .active_cpus = cpus, }; + lockdep_assert_cpus_held(); + if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) int ret; /* No CPUs can come up or down during this. */ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); + cpus_read_lock(); + ret = stop_machine_cpuslocked(fn, data, cpus); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. 
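
The kernel/stop_machine.c hunk just above splits stop_machine() into a locking wrapper and the new stop_machine_cpuslocked(), which uses lockdep_assert_cpus_held() to insist that the caller already holds the CPU hotplug lock taken via cpus_read_lock(). A rough sketch of the two calling conventions follows; the callback, its data and the example_* names are hypothetical and not part of this series:

    #include <linux/cpu.h>
    #include <linux/stop_machine.h>

    /* Hypothetical callback: runs while every online CPU is parked in the stopper. */
    static int example_stop_fn(void *data)
    {
            int *val = data;

            *val = 1;       /* state that must change atomically system-wide */
            return 0;
    }

    /* Caller that does not hold the hotplug lock: stop_machine() takes it itself. */
    static int example_update(int *val)
    {
            return stop_machine(example_stop_fn, val, NULL);
    }

    /*
     * Caller that is already inside a cpus_read_lock()ed region, e.g. deep in a
     * hotplug callback path, uses the _cpuslocked variant instead.
     */
    static int example_update_cpuslocked(int *val)
    {
            lockdep_assert_cpus_held();     /* same check stop_machine_cpuslocked() performs */
            return stop_machine_cpuslocked(example_stop_fn, val, NULL);
    }
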
-config NO_HZ_FULL_SYSIDLE - bool "Detect full-system idle state for full dynticks system" - depends on NO_HZ_FULL - default n - help - At least one CPU must keep the scheduling-clock tick running for - timekeeping purposes whenever there is a non-idle CPU, where - "non-idle" also includes dynticks CPUs as long as they are - running non-idle tasks. Because the underlying adaptive-tick - support cannot distinguish between all CPUs being idle and - all CPUs each running a single task in dynticks mode, the - underlying support simply ensures that there is always a CPU - handling the scheduling-clock tick, whether or not all CPUs - are idle. This Kconfig option enables scalable detection of - the all-CPUs-idle state, thus allowing the scheduling-clock - tick to be disabled when all CPUs are idle. Note that scalable - detection of the all-CPUs-idle state means that larger systems - will be slower to declare the all-CPUs-idle state. - - Say Y if you would like to help debug all-CPUs-idle detection. - - Say N if you are unsure. - -config NO_HZ_FULL_SYSIDLE_SMALL - int "Number of CPUs above which large-system approach is used" - depends on NO_HZ_FULL_SYSIDLE - range 1 NR_CPUS - default 8 - help - The full-system idle detection mechanism takes a lazy approach - on large systems, as is required to attain decent scalability. - However, on smaller systems, scalability is not anywhere near as - large a concern as is energy efficiency. The sysidle subsystem - therefore uses a fast but non-scalable algorithm for small - systems and a lazier but scalable algorithm for large systems. - This Kconfig parameter defines the number of CPUs in the largest - system that will be considered to be "small". - - The default value will be fine in most cases. Battery-powered - systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger - numbers of CPUs, and (3) are suffering from battery-lifetime - problems due to long sysidle latencies might wish to experiment - with larger values for this Kconfig parameter. On the other - hand, they might be even better served by disabling NO_HZ_FULL - entirely, given that NO_HZ_FULL is intended for HPC and - real-time workloads that at present do not tend to be run on - battery-powered systems. - - Take the default if you are unsure. 
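
The kernel/smp.c hunk earlier in this series sends the cross-call IPI only to the CPUs collected in the new cfd->cpumask_ipi, i.e. to CPUs whose call_single_queue was empty when the csd was queued; a CPU with a non-empty queue already has a pending IPI or flush. The optimization rests on llist_add() returning true only when it added to a previously empty list. A stripped-down sketch of that idiom, with hypothetical names (remote_work, queue_remote_work):

    #include <linux/llist.h>

    struct remote_work {
            struct llist_node llnode;
            void (*func)(void *info);
            void *info;
    };

    static LLIST_HEAD(remote_queue);        /* per-CPU in the real code */

    /* Returns true when the producer must send a kick (an IPI) to the consumer. */
    static bool queue_remote_work(struct remote_work *work)
    {
            /*
             * llist_add() reports whether the list was empty before this add;
             * only the first entry queued on an empty list needs a notification.
             */
            return llist_add(&work->llnode, &remote_queue);
    }
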
- config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ee2f4202d82a..c991cf212c6d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -27,6 +27,9 @@ #include <linux/posix-timers.h> #include <linux/workqueue.h> #include <linux/freezer.h> +#include <linux/compat.h> + +#include "posix-timers.h" #define CREATE_TRACE_POINTS #include <trace/events/alarmtimer.h> @@ -45,11 +48,13 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) /* freezer information to handle clock_nanosleep triggered wakeups */ static enum alarmtimer_type freezer_alarmtype; static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +#endif static struct wakeup_source *ws; @@ -307,38 +312,6 @@ static int alarmtimer_resume(struct device *dev) } #endif -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) -{ - struct alarm_base *base; - unsigned long flags; - ktime_t delta; - - switch(type) { - case ALARM_REALTIME: - base = &alarm_bases[ALARM_REALTIME]; - type = ALARM_REALTIME_FREEZER; - break; - case ALARM_BOOTTIME: - base = &alarm_bases[ALARM_BOOTTIME]; - type = ALARM_BOOTTIME_FREEZER; - break; - default: - WARN_ONCE(1, "Invalid alarm type: %d\n", type); - return; - } - - delta = ktime_sub(absexp, base->gettime()); - - spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta || (delta < freezer_delta)) { - freezer_delta = delta; - freezer_expires = absexp; - freezer_alarmtype = type; - } - spin_unlock_irqrestore(&freezer_delta_lock, flags); -} - - /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -488,6 +461,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward_now); +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} /** * clock2alarm - helper that converts from clockid to alarmtypes @@ -511,22 +516,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { - unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); + it.alarm.alarmtimer); enum alarmtimer_restart result = ALARMTIMER_NORESTART; + unsigned long flags; + int si_private = 0; spin_lock_irqsave(&ptr->it_lock, flags); - if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (IS_ENABLED(CONFIG_POSIX_TIMERS) && - posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; - } - /* Re-add periodic timers */ - if (ptr->it.alarm.interval) { - ptr->it_overrun += alarm_forward(alarm, now, - ptr->it.alarm.interval); + 
ptr->it_active = 0; + if (ptr->it_interval) + si_private = ++ptr->it_requeue_pending; + + if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + /* + * Handle ignored signals and rearm the timer. This will go + * away once we handle ignored signals proper. + */ + ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ++ptr->it_requeue_pending; + ptr->it_active = 1; result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); @@ -535,6 +544,72 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, } /** + * alarm_timer_rearm - Posix timer callback for rearming timer + * @timr: Pointer to the posixtimer data struct + */ +static void alarm_timer_rearm(struct k_itimer *timr) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); + alarm_start(alarm, alarm->node.expires); +} + +/** + * alarm_timer_forward - Posix timer callback for forwarding timer + * @timr: Pointer to the posixtimer data struct + * @now: Current time to forward the timer against + */ +static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return (int) alarm_forward(alarm, timr->it_interval, now); +} + +/** + * alarm_timer_remaining - Posix timer callback to retrieve remaining time + * @timr: Pointer to the posixtimer data struct + * @now: Current time to calculate against + */ +static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return ktime_sub(now, alarm->node.expires); +} + +/** + * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer + * @timr: Pointer to the posixtimer data struct + */ +static int alarm_timer_try_to_cancel(struct k_itimer *timr) +{ + return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); +} + +/** + * alarm_timer_arm - Posix timer callback to arm a timer + * @timr: Pointer to the posixtimer data struct + * @expires: The new expiry time + * @absolute: Expiry value is absolute time + * @sigev_none: Posix timer does not deliver signals + */ +static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + struct alarm_base *base = &alarm_bases[alarm->type]; + + if (!absolute) + expires = ktime_add_safe(expires, base->gettime()); + if (sigev_none) + alarm->node.expires = expires; + else + alarm_start(&timr->it.alarm.alarmtimer, expires); +} + +/** * alarm_clock_getres - posix getres interface * @which_clock: clockid * @tp: timespec to fill @@ -591,97 +666,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) } /** - * alarm_timer_get - posix timer_get interface - * @new_timer: k_itimer pointer - * @cur_setting: itimerspec data to fill - * - * Copies out the current itimerspec data - */ -static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec64 *cur_setting) -{ - ktime_t relative_expiry_time = - alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - - if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); - } else { - cur_setting->it_value.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - } - - cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); -} - -/** - * alarm_timer_del - posix timer_del interface - * @timr: k_itimer pointer to be deleted - * - * Cancels any programmed alarms for the given timer. 
- */ -static int alarm_timer_del(struct k_itimer *timr) -{ - if (!rtcdev) - return -ENOTSUPP; - - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - return 0; -} - -/** - * alarm_timer_set - posix timer_set interface - * @timr: k_itimer pointer to be deleted - * @flags: timer flags - * @new_setting: itimerspec to be used - * @old_setting: itimerspec being replaced - * - * Sets the timer to new_setting, and starts the timer. - */ -static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - ktime_t exp; - - if (!rtcdev) - return -ENOTSUPP; - - if (flags & ~TIMER_ABSTIME) - return -EINVAL; - - if (old_setting) - alarm_timer_get(timr, old_setting); - - /* If the timer was already set, cancel it */ - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - /* start the timer */ - timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); - - /* - * Rate limit to the tick as a hot fix to prevent DOS. Will be - * mopped up later. - */ - if (timr->it.alarm.interval < TICK_NSEC) - timr->it.alarm.interval = TICK_NSEC; - - exp = timespec64_to_ktime(new_setting->it_value); - /* Convert (if necessary) to absolute time */ - if (flags != TIMER_ABSTIME) { - ktime_t now; - - now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add_safe(now, exp); - } - - alarm_start(&timr->it.alarm.alarmtimer, exp); - return 0; -} - -/** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired * @@ -705,8 +689,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, * * Sets the alarm timer and sleeps until it is fired or interrupted. */ -static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, + enum alarmtimer_type type) { + struct restart_block *restart; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); @@ -719,36 +705,25 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) __set_current_state(TASK_RUNNING); - return (alarm->data == NULL); -} - - -/** - * update_rmtp - Update remaining timespec value - * @exp: expiration time - * @type: timer type - * @rmtp: user pointer to remaining timepsec value - * - * Helper function that fills in rmtp value with time between - * now and the exp value - */ -static int update_rmtp(ktime_t exp, enum alarmtimer_type type, - struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = ktime_sub(exp, alarm_bases[type].gettime()); - - if (rem <= 0) + if (!alarm->data) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; + if (freezing(current)) + alarmtimer_freezerset(absexp, type); + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(absexp, alarm_bases[type].gettime()); - return 1; + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; } /** @@ -760,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { enum alarmtimer_type type = restart->nanosleep.clockid; - ktime_t exp; - struct timespec __user *rmtp; + ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - int ret = 0; - exp = 
restart->nanosleep.expires; alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - return ret; + return alarmtimer_do_nsleep(&alarm, exp, type); } /** @@ -798,11 +753,10 @@ out: * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsreq, - struct timespec __user *rmtp) + const struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); - struct restart_block *restart; + struct restart_block *restart = ¤t->restart_block; struct alarm alarm; ktime_t exp; int ret = 0; @@ -825,35 +779,36 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = ktime_add(now, exp); } - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); + ret = alarmtimer_do_nsleep(&alarm, exp, type); + if (ret != -ERESTART_RESTARTBLOCK) + return ret; /* abs timers don't set remaining time or restart */ - if (flags == TIMER_ABSTIME) { - ret = -ERESTARTNOHAND; - goto out; - } + if (flags == TIMER_ABSTIME) + return -ERESTARTNOHAND; - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp; - restart->nanosleep.rmtp = rmtp; - ret = -ERESTART_RESTARTBLOCK; - -out: return ret; } +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = common_timer_set, + .timer_del = common_timer_del, + .timer_get = common_timer_get, + .timer_arm = alarm_timer_arm, + .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, + .timer_remaining = alarm_timer_remaining, + .timer_try_to_cancel = alarm_timer_try_to_cancel, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { @@ -879,23 +834,9 @@ static int __init alarmtimer_init(void) struct platform_device *pdev; int error = 0; int i; - struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .nsleep = alarm_timer_nsleep, - }; alarmtimer_rtc_timer_init(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); - } - /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & 
CLOCK_SOURCE_IS_CONTINUOUS)) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ac053bb5296e..81da124f1115 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -51,6 +51,7 @@ #include <linux/sched/debug.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -1439,8 +1440,29 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +{ + switch(restart->nanosleep.type) { +#ifdef CONFIG_COMPAT + case TT_COMPAT: + if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + return -EFAULT; + break; +#endif + case TT_NATIVE: + if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + return -EFAULT; + break; + default: + BUG(); + } + return -ERESTART_RESTARTBLOCK; +} + static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { + struct restart_block *restart; + hrtimer_init_sleeper(t, current); do { @@ -1457,53 +1479,38 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod __set_current_state(TASK_RUNNING); - return t->task == NULL; -} - -static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = hrtimer_expires_remaining(timer); - if (rem <= 0) + if (!t->task) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + ktime_t rem = hrtimer_expires_remaining(&t->timer); + struct timespec rmt; - return 1; + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; } -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; + int ret; hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: + ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } -long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1517,7 +1524,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) + ret = do_nanosleep(&t, mode); + if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ @@ -1526,19 +1534,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, goto out; } - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; - 
restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); - - ret = -ERESTART_RESTARTBLOCK; out: destroy_hrtimer_on_stack(&t.timer); return ret; @@ -1557,8 +1556,31 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} + +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 tu64; + struct timespec tu; + + if (compat_get_timespec(&tu, rqtp)) + return -EFAULT; + + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) + return -EINVAL; + + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#endif /* * Functions related to boot-time initialization: diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 087d6a1279b8..2ef98a02376a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -15,6 +15,7 @@ #include <linux/posix-timers.h> #include <linux/hrtimer.h> #include <trace/events/timer.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) +{ + struct itimerval kit; + int error = do_getitimer(which, &kit); + + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} +#endif + /* * The timer is automagically restarted, when interval != 0 @@ -138,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_ns(&value->it_interval); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + nval = ktime_to_ns(timeval_to_ktime(value->it_value)); + ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); spin_lock_irq(&tsk->sighand->siglock); @@ -294,3 +312,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return -EFAULT; return 0; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else { + memset(&kin, 0, sizeof(kin)); + } + + error = do_setitimer(which, &kin, out ? 
&kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} +#endif diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 31d588d37a17..17cdc554c9fe 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -25,6 +25,8 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> +#include "posix-timers.h" + static void delete_clock(struct kref *kref); /* @@ -82,38 +84,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) return result; } -static int posix_clock_fasync(int fd, struct file *fp, int on) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = 0; - - if (!clk) - return -ENODEV; - - if (clk->ops.fasync) - err = clk->ops.fasync(clk, fd, fp, on); - - put_posix_clock(clk); - - return err; -} - -static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENODEV; - - if (!clk) - return -ENODEV; - - if (clk->ops.mmap) - err = clk->ops.mmap(clk, vma); - - put_posix_clock(clk); - - return err; -} - static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { @@ -199,8 +169,6 @@ static const struct file_operations posix_clock_file_operations = { .unlocked_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, - .fasync = posix_clock_fasync, - .mmap = posix_clock_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = posix_clock_compat_ioctl, #endif @@ -359,88 +327,9 @@ out: return err; } -static int pc_timer_create(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_create) - err = cd.clk->ops.timer_create(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_timer_delete(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_delete) - err = cd.clk->ops.timer_delete(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - - if (get_clock_desc(id, &cd)) - return; - - if (cd.clk->ops.timer_gettime) - cd.clk->ops.timer_gettime(cd.clk, kit, ts); - - put_clock_desc(&cd); -} - -static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec64 *ts, struct itimerspec64 *old) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_settime) - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -struct k_clock clock_posix_dynamic = { +const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, .clock_adj = pc_clock_adjtime, - .timer_create = pc_timer_create, - .timer_set = pc_timer_settime, - .timer_del = pc_timer_delete, - .timer_get = pc_timer_gettime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d2a1e6dd0291..60cb24ac9ebc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -12,6 +12,11 @@ #include <trace/events/timer.h> 
#include <linux/tick.h> #include <linux/workqueue.h> +#include <linux/compat.h> + +#include "posix-timers.h" + +static void posix_cpu_timer_rearm(struct k_itimer *timer); /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -322,6 +327,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) return -EINVAL; + new_timer->kclock = &clock_posix_cpu; + INIT_LIST_HEAD(&new_timer->it.cpu.entry); rcu_read_lock(); @@ -524,7 +531,8 @@ static void cpu_timer_fire(struct k_itimer *timer) * reload the timer. But we need to keep it * ticking in case the signal is deliverable next time. */ - posix_cpu_timer_schedule(timer); + posix_cpu_timer_rearm(timer); + ++timer->it_requeue_pending; } } @@ -572,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec64_to_ns(&new->it_value); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -712,10 +724,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp */ itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); - if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + if (!timer->it.cpu.expires) return; - } /* * Sample the clock to take the difference with the expiry time. @@ -739,7 +749,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -976,10 +985,10 @@ static void check_process_timers(struct task_struct *tsk, } /* - * This is called from the signal code (via do_schedule_next_timer) + * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. */ -void posix_cpu_timer_schedule(struct k_itimer *timer) +static void posix_cpu_timer_rearm(struct k_itimer *timer) { struct sighand_struct *sighand; unsigned long flags; @@ -995,12 +1004,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) - goto out; + return; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (!sighand) - goto out; + return; } else { /* * Protect arm_timer() and timer sampling in case of call to @@ -1013,11 +1022,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * We can't even collect a sample any more. 
*/ timer->it.cpu.expires = 0; - goto out; + return; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - unlock_task_sighand(p, &flags); - /* Optimizations: if the process is dying, no need to rearm */ - goto out; + /* If the process is dying, no need to rearm */ + goto unlock; } cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1029,12 +1037,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); +unlock: unlock_task_sighand(p, &flags); - -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; } /** @@ -1227,9 +1231,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct itimerspec64 *it) + const struct timespec64 *rqtp) { + struct itimerspec64 it; struct k_itimer timer; + u64 expires; int error; /* @@ -1243,12 +1249,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec64 zero_it; + struct restart_block *restart; - memset(it, 0, sizeof *it); - it->it_value = *rqtp; + memset(&it, 0, sizeof(it)); + it.it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1277,8 +1284,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - *rqtp = ns_to_timespec64(timer.it.cpu.expires); - error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + expires = timer.it.cpu.expires; + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* * Timer is now unarmed, deletion can not fail. @@ -1298,7 +1305,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_unlock_irq(&timer.it_lock); } - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* * It actually did fire already. */ @@ -1306,6 +1313,17 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } error = -ERESTART_RESTARTBLOCK; + /* + * Report back to the user the time still remaining. + */ + restart = ¤t->restart_block; + restart->nanosleep.expires = expires; + if (restart->nanosleep.type != TT_NONE) { + struct timespec ts; + + ts = timespec64_to_timespec(it.it_value); + error = nanosleep_copyout(restart, &ts); + } } return error; @@ -1314,11 +1332,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct timespec __user *rmtp) + const struct timespec64 *rqtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec64 it; - struct timespec ts; int error; /* @@ -1329,23 +1345,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) return -EINVAL; - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + error = do_cpu_nanosleep(which_clock, flags, rqtp); if (error == -ERESTART_RESTARTBLOCK) { if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. 
- */ - ts = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) - return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; - restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1353,28 +1361,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct itimerspec64 it; struct timespec64 t; - struct timespec tmp; - int error; t = ns_to_timespec64(restart_block->nanosleep.expires); - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - struct timespec __user *rmtp = restart_block->nanosleep.rmtp; - /* - * Report back to the user the time still remaining. - */ - tmp = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) - return -EFAULT; - - restart_block->nanosleep.expires = timespec64_to_ns(&t); - } - return error; - + return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) @@ -1396,14 +1387,9 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, - struct timespec __user *rmtp) -{ - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); -} -static long process_cpu_nsleep_restart(struct restart_block *restart_block) + const struct timespec64 *rqtp) { - return -EINVAL; + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) @@ -1421,36 +1407,27 @@ static int thread_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } -struct k_clock clock_posix_cpu = { +const struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, .clock_set = posix_cpu_clock_set, .clock_get = posix_cpu_clock_get, .timer_create = posix_cpu_timer_create, .nsleep = posix_cpu_nsleep, - .nsleep_restart = posix_cpu_nsleep_restart, .timer_set = posix_cpu_timer_set, .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, +}; - return 0; -} -__initcall(init_posix_cpu_timers); +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index c0cd53eb018a..38f3b20efa29 100644 --- 
a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,6 +17,7 @@ #include <linux/ktime.h> #include <linux/timekeeping.h> #include <linux/posix-timers.h> +#include <linux/compat.h> asmlinkage long sys_ni_posix_timers(void) { @@ -27,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void) } #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) SYS_NI(timer_create); SYS_NI(timer_gettime); @@ -39,6 +41,12 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif +COMPAT_SYS_NI(timer_create); +COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC @@ -110,22 +118,106 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) - return -EINVAL; - return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); + break; default: return -EINVAL; } + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #ifdef CONFIG_COMPAT -long clock_nanosleep_restart(struct restart_block *restart_block) +COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec64 new_tp64; + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct compat_timespec __user *,tp) { - return hrtimer_nanosleep_restart(restart_block); + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; + default: return -EINVAL; + } + + kernel_tp = timespec64_to_timespec(kernel_tp64); + if (compat_put_timespec(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (compat_put_timespec(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 t64; + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return 
-EINVAL; + } + + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4d7b2ce09c27..82d67be7d9d1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,8 +49,10 @@ #include <linux/workqueue.h> #include <linux/export.h> #include <linux/hashtable.h> +#include <linux/compat.h> #include "timekeeping.h" +#include "posix-timers.h" /* * Management arrays for POSIX timers. Timers are now kept in static hash table @@ -69,6 +71,10 @@ static struct kmem_cache *posix_timers_cache; static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); +static const struct k_clock * const posix_clocks[]; +static const struct k_clock *clockid_to_kclock(const clockid_t id); +static const struct k_clock clock_realtime, clock_monotonic; + /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. @@ -124,22 +130,6 @@ static DEFINE_SPINLOCK(hash_lock); * have is CLOCK_REALTIME and its high res counter part, both of * which we beg off on and pass to do_sys_settimeofday(). */ - -static struct k_clock posix_clocks[MAX_CLOCKS]; - -/* - * These ones are defined below. - */ -static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, - struct timespec __user *rmtp); -static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec64 *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec64 *, struct itimerspec64 *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -285,91 +275,23 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_tai = { - .clock_getres = 
posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - posix_timers_register_clock(CLOCK_TAI, &clock_tai); - posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); return 0; } - __initcall(init_posix_timers); -static void schedule_next_timer(struct k_itimer *timr) +static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - if (timr->it.real.interval == 0) + if (!timr->it_interval) return; timr->it_overrun += (unsigned int) hrtimer_forward(timer, timer->base->get_time(), - timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; + timr->it_interval); hrtimer_restart(timer); } @@ -384,24 +306,27 @@ static void schedule_next_timer(struct k_itimer *timr) * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ -void do_schedule_next_timer(struct siginfo *info) +void posixtimer_rearm(struct siginfo *info) { struct k_itimer *timr; unsigned long flags; timr = lock_timer(info->si_tid, &flags); + if (!timr) + return; - if (timr && timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); + if (timr->it_requeue_pending == info->si_sys_private) { + timr->kclock->timer_rearm(timr); + + timr->it_active = 1; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; info->si_overrun += timr->it_overrun_last; } - if (timr) - unlock_timer(timr, flags); + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr, int si_private) @@ -410,12 +335,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private) int shared, ret = -1; /* * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). + * dequeue_signal()->posixtimer_rearm(). * * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). + * si_sys_private it calls posixtimer_rearm(). * We re-queue ->sigq and drop ->it_lock(). - * do_schedule_next_timer() locks the timer + * posixtimer_rearm() locks the timer * and re-schedules it while ->sigq is pending. * Not really bad, but not that we want. */ @@ -431,7 +356,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) /* If we failed to send the signal the timer stops. 
*/ return ret > 0; } -EXPORT_SYMBOL_GPL(posix_timer_event); /* * This function gets called when a POSIX.1b interval timer expires. It @@ -450,7 +374,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - if (timr->it.real.interval != 0) + timr->it_active = 0; + if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { @@ -459,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * we will not get a call back to restart it AND * it should be restarted. */ - if (timr->it.real.interval != 0) { + if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* @@ -488,15 +413,16 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { ktime_t kj = NSEC_PER_SEC / HZ; - if (timr->it.real.interval < kj) + if (timr->it_interval < kj) now = ktime_add(now, kj); } #endif timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, - timr->it.real.interval); + timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; + timr->it_active = 1; } } @@ -521,30 +447,6 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; @@ -581,17 +483,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static struct k_clock *clockid_to_kclock(const clockid_t id) -{ - if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? - &clock_posix_dynamic : &clock_posix_cpu; - - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) - return NULL; - return &posix_clocks[id]; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -599,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer) } /* Create a POSIX.1b interval timer. 
*/ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) +static int do_timer_create(clockid_t which_clock, struct sigevent *event, + timer_t __user *created_timer_id) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - sigevent_t event; int it_id_set = IT_ID_NOT_SET; if (!kc) @@ -629,31 +517,28 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; + new_timer->kclock = kc; new_timer->it_overrun = -1; - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } + if (event) { rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(&event)); + new_timer->it_pid = get_pid(good_sigevent(event)); rcu_read_unlock(); if (!new_timer->it_pid) { error = -EINVAL; goto out; } + new_timer->it_sigev_notify = event->sigev_notify; + new_timer->sigq->info.si_signo = event->sigev_signo; + new_timer->sigq->info.si_value = event->sigev_value; } else { - memset(&event.sigev_value, 0, sizeof(event.sigev_value)); - event.sigev_notify = SIGEV_SIGNAL; - event.sigev_signo = SIGALRM; - event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->sigq->info.si_signo = SIGALRM; + memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq->info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->sigq->info.si_signo = event.sigev_signo; - new_timer->sigq->info.si_value = event.sigev_value; new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; @@ -684,6 +569,36 @@ out: return error; } +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (copy_from_user(&event, timer_event_spec, sizeof (event))) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (get_compat_sigevent(&event, timer_event_spec)) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} +#endif + /* * Locking issues: We need to protect the result of the id look up until * we get the timer locked down so it is not deleted under us. The @@ -717,6 +632,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; } +static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return __hrtimer_expires_remaining_adjusted(timer, now); +} + +static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return (int)hrtimer_forward(timer, now, timr->it_interval); +} + /* * Get the time remaining on a POSIX.1b interval timer. 
This function * is ALWAYS called with spin_lock_irq on the timer, thus it must not @@ -733,55 +662,61 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * it is the same as a requeue pending timer WRT to what we should * report. */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { + const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; + struct timespec64 ts64; + bool sig_none; - memset(cur_setting, 0, sizeof(*cur_setting)); - - iv = timr->it.real.interval; + sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + iv = timr->it_interval; /* interval timer ? */ - if (iv) + if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; + } else if (!timr->it_active) { + /* + * SIGEV_NONE oneshot timers are never queued. Check them + * below. + */ + if (!sig_none) + return; + } - now = timer->base->get_time(); + /* + * The timespec64 based conversion is suboptimal, but it's not + * worth to implement yet another callback. + */ + kc->clock_get(timr->it_clock, &ts64); + now = timespec64_to_ktime(ts64); /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. + * When a requeue is pending or this is a SIGEV_NONE timer move the + * expiry time forward by intervals, so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + timr->it_overrun += kc->timer_forward(timr, now); - remaining = __hrtimer_expires_remaining_adjusted(timer, now); + remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when * it is expired ! */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + if (!sig_none) cur_setting->it_value.tv_nsec = 1; - } else + } else { cur_setting->it_value = ktime_to_timespec64(remaining); + } } /* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) +static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct itimerspec64 cur_setting64; - struct itimerspec cur_setting; struct k_itimer *timr; - struct k_clock *kc; + const struct k_clock *kc; unsigned long flags; int ret = 0; @@ -789,20 +724,49 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + memset(setting, 0, sizeof(*setting)); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting64); + kc->timer_get(timr, setting); unlock_timer(timr, flags); + return ret; +} - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; +/* Get the time remaining on a POSIX.1b interval timer. 
*/ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (put_compat_itimerspec(setting, &cur_setting)) + ret = -EFAULT; + } return ret; } +#endif /* * Get the number of overruns of a POSIX.1b interval timer. This is to @@ -810,7 +774,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, * accumulating overruns on the next timer. The overrun is frozen when * the signal is delivered, either at the notify time (if the info block * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is + * the call back to posixtimer_rearm(). So all we need to do is * to pick up the frozen overrun. */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) @@ -829,117 +793,183 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } -/* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. */ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) +static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; + mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + /* + * Posix magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they become CLOCK_MONOTONIC based under the + * hood. See hrtimer_init(). Update timr->kclock, so the generic + * functions which use timr->kclock->clock_get() work. + * + * Note: it_clock stays unmodified, because the next timer_set() might + * use ABSTIME, so it needs to switch back. + */ + if (timr->it_clock == CLOCK_REALTIME) + timr->kclock = absolute ? &clock_realtime : &clock_monotonic; + + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + if (!absolute) + expires = ktime_add_safe(expires, timer->base->get_time()); + hrtimer_set_expires(timer, expires); + + if (!sigev_none) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static int common_hrtimer_try_to_cancel(struct k_itimer *timr) +{ + return hrtimer_try_to_cancel(&timr->it.real.timer); +} + +/* Set a POSIX.1b interval timer. */ +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) +{ + const struct k_clock *kc = timr->kclock; + bool sigev_none; + ktime_t expires; + if (old_setting) common_timer_get(timr, old_setting); - /* disable the timer */ - timr->it.real.interval = 0; + /* Prevent rearming by clearing the interval */ + timr->it_interval = 0; /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. + * Careful here. 
On SMP systems the timer expiry function could be + * active and spinning on timr->it_lock. */ - if (hrtimer_try_to_cancel(timer) < 0) + if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + timr->it_active = 0; + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; - /* switch off the timer when it_value is zero */ + /* Switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); - - /* Convert interval */ - timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); - - /* SIGEV_NONE timers are not queued ! See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - hrtimer_add_expires(timer, timer->base->get_time()); - } - return 0; - } + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); + expires = timespec64_to_ktime(new_setting->it_value); + sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; - hrtimer_start_expires(timer, mode); + kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); + timr->it_active = !sigev_none; return 0; } -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) +static int do_timer_settime(timer_t timer_id, int flags, + struct itimerspec64 *new_spec64, + struct itimerspec64 *old_spec64) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec, old_spec; + const struct k_clock *kc; struct k_itimer *timr; unsigned long flag; - struct k_clock *kc; int error = 0; - if (!new_setting) + if (!timespec64_valid(&new_spec64->it_interval) || + !timespec64_valid(&new_spec64->it_value)) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - - if (!timespec64_valid(&new_spec64.it_interval) || - !timespec64_valid(&new_spec64.it_value)) - return -EINVAL; + if (old_spec64) + memset(old_spec64, 0, sizeof(*old_spec64)); retry: timr = lock_timer(timer_id, &flag); if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec64, rtn); + error = kc->timer_set(timr, flags, new_spec64, old_spec64); unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + old_spec64 = NULL; // We already got the old time... goto retry; } - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; + return error; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? 
&old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new_setting) + return -EINVAL; + + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old_setting) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + } + return error; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old ? &old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new) + return -EINVAL; + if (get_compat_itimerspec(&new_spec, new)) + return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (put_compat_itimerspec(old, &old_spec)) + error = -EFAULT; + } return error; } +#endif -static int common_timer_del(struct k_itimer *timer) +int common_timer_del(struct k_itimer *timer) { - timer->it.real.interval = 0; + const struct k_clock *kc = timer->kclock; - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + timer->it_interval = 0; + if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; + timer->it_active = 0; return 0; } static inline int timer_delete_hook(struct k_itimer *timer) { - struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = timer->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; @@ -1018,7 +1048,7 @@ void exit_itimers(struct signal_struct *sig) SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp64; struct timespec new_tp; @@ -1035,7 +1065,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; @@ -1055,7 +1085,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct timex __user *, utx) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timex ktx; int err; @@ -1078,7 +1108,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; @@ -1095,13 +1125,98 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, return error; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; + struct timespec 
new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + + return kc->clock_set(which_clock, &new_tp64); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); + + if (!error && compat_put_timespec(&kernel_tp, tp)) + error = -EFAULT; + + return error; +} + +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + err = compat_get_timex(&ktx, utp); + if (err) + return err; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0) + err = compat_put_timex(utp, &ktx); + + return err; +} + +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); + + if (!error && tp && compat_put_timespec(&rtn_tp, tp)) + error = -EFAULT; + + return error; +} +#endif + /* * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsave, struct timespec __user *rmtp) + const struct timespec64 *rqtp) { - return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -1110,7 +1225,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t64; struct timespec t; @@ -1125,21 +1240,141 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, t64 = timespec_to_timespec64(t); if (!timespec64_valid(&t64)) return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64, rmtp); + return kc->nsleep(which_clock, flags, &t64); } -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. 
- */ -long clock_nanosleep_restart(struct restart_block *restart_block) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { - clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; + struct timespec t; - if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; - return kc->nsleep_restart(restart_block); + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; + + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t64); +} +#endif + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = 
common_hrtimer_arm, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + +static const struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + return NULL; + return posix_clocks[id]; } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 000000000000..fb303c3be4d3 --- /dev/null +++ b/kernel/time/posix-timers.h @@ -0,0 +1,40 @@ +#define TIMER_RETRY 1 + +struct k_clock { + int (*clock_getres)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_set)(const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*timer_create)(struct k_itimer *timer); + int (*nsleep)(const clockid_t which_clock, int flags, + const struct timespec64 *); + int (*timer_set)(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del)(struct k_itimer *timr); + void (*timer_get)(struct k_itimer *timr, + struct itimerspec64 *cur_setting); + void (*timer_rearm)(struct k_itimer *timr); + int (*timer_forward)(struct k_itimer *timr, ktime_t now); + ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); + int (*timer_try_to_cancel)(struct k_itimer *timr); + void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none); +}; + +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; + +int posix_timer_event(struct k_itimer *timr, int si_private); + +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); +int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..c7a899c5ce64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. 
+ */ + ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); @@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, @@ -701,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { - tick = 0; - /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): @@ -712,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ - if (!ts->tick_stopped) - goto out; - - /* - * If, OTOH, we did stop it, but there's a pending (expired) - * timer reprogram the timer hardware to fire now. - * - * We will not restart the tick proper, just prod the timer - * hardware into firing an interrupt to process the pending - * timers. Just like tick_irq_exit() will not restart the tick - * for 'normal' interrupts. - * - * Only once we exit the idle loop will we re-enable the tick, - * see tick_nohz_idle_exit(). - */ - if (delta == 0) { - tick_nohz_restart(ts, now); + if (!ts->tick_stopped) { + tick = 0; goto out; } } @@ -771,8 +766,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) - goto out; + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + goto out; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + } /* * nohz_stop_sched_tick can be called several times before @@ -782,8 +785,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + calc_load_nohz_start(); cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -791,6 +793,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. 
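The tick-sched.c hunks above add a ts->next_tick field that caches the deadline last programmed into the clockevent, so tick_nohz_stop_sched_tick() can skip a redundant reprogram when the next expiry is unchanged; the cache is reset to 0 on every path where it may go stale (the tick handler paths, tick_nohz_restart(), a CPU found offline). A minimal, self-contained sketch of that caching idea, using hypothetical simplified types and helpers rather than the real struct tick_sched and tick_program_event():

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for struct tick_sched and the clockevent layer. */
    struct tick_sched_sketch {
            int      tick_stopped;
            uint64_t next_tick;   /* last programmed deadline, 0 = unknown/stale */
    };

    static void program_event(uint64_t expires)
    {
            printf("program clockevent for %llu\n", (unsigned long long)expires);
    }

    /* Skip the reprogram when the deadline did not change since the last stop. */
    static void stop_tick(struct tick_sched_sketch *ts, uint64_t expires)
    {
            if (ts->tick_stopped && expires == ts->next_tick)
                    return;

            program_event(expires);
            ts->tick_stopped = 1;
            ts->next_tick = expires;
    }

    /* Any event that may invalidate the cached deadline simply resets it. */
    static void invalidate_tick_cache(struct tick_sched_sketch *ts)
    {
            ts->next_tick = 0;
    }

Resetting the cache to 0 rather than keeping it accurate on every path is the cheaper design: a stale zero only costs one harmless extra reprogram, while a stale non-zero value could leave the clockevent silently unprogrammed, which is what the sanity check and WARN_ON_ONCE() in the hunk above are there to catch.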
@@ -801,12 +805,17 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, goto out; } + hrtimer_set_expires(&ts->sched_timer, tick); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -823,7 +832,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) */ timer_clear_idle(); - calc_load_exit_idle(); + calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick @@ -864,6 +873,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; return false; } @@ -923,8 +937,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_expires = expires; } - if (!was_stopped && ts->tick_stopped) + if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } } } @@ -1172,6 +1188,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (regs) tick_sched_handle(ts, regs); + else + ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/time.c b/kernel/time/time.c index 49c73c6ed648..7c89e437c4d7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -39,6 +39,7 @@ #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/unistd.h> #include <generated/timeconst.h> @@ -99,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ +#ifdef CONFIG_COMPAT +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. 
*/ +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +{ + struct timeval tv; + compat_time_t i; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif + SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { @@ -215,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + if (tv) { + struct timeval ktv; + + do_gettimeofday(&ktv); + if (compat_put_timeval(&ktv, tv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timeval user_tv; + struct timezone new_tz; + + if (tv) { + if (compat_get_timeval(&user_tv, tv)) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} +#endif + SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ @@ -224,12 +307,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) * structure. But bear in mind that the structures * may change */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +{ + struct timex txc; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + ret = do_adjtimex(&txc); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} +#endif + /* * Convert jiffies to milliseconds and back. 
* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b602c48cb841..cedafa008de5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } } static inline struct timespec64 tk_xtime(struct timekeeper *tk) @@ -285,12 +289,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; - if (shift_change < 0) + if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; - else + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } } - tk->tkr_raw.xtime_nsec = 0; tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; @@ -510,6 +516,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. static inline void update_vsyscall(struct timekeeper *tk) { @@ -619,9 +626,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); - /* Update the monotonic raw base */ - tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); - /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. 
Take @@ -631,6 +635,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + seconds = tk->raw_sec; + nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); + tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); } /* must hold timekeeper_lock */ @@ -672,7 +681,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; - u64 nsec; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -684,10 +692,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); - timespec64_add_ns(&tk->raw_time, nsec); + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); } /** @@ -1373,19 +1384,18 @@ int timekeeping_notify(struct clocksource *clock) void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; unsigned long seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); - ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(&ts64, nsecs); - *ts = ts64; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic64); @@ -1509,8 +1519,7 @@ void __init timekeeping_init(void) tk_setup_internals(tk, clock); tk_set_xtime(tk, &now); - tk->raw_time.tv_sec = 0; - tk->raw_time.tv_nsec = 0; + tk->raw_sec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -2011,15 +2020,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; - tk->raw_time.tv_sec++; + tk->raw_sec++; } - tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; - tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 152a706ef8b8..709a404bd133 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer); /** * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del + * @timer: timer to delete * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. 
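The timekeeping.c hunks above drop the separate tk->raw_time timespec and carry CLOCK_MONOTONIC_RAW in the same form as the monotonic clock: shifted nanoseconds in tk->tkr_raw.xtime_nsec plus a whole-seconds counter tk->raw_sec, normalized in tk_normalize_xtime() and read back in getrawmonotonic64(). A minimal sketch of that fixed-point bookkeeping, with hypothetical simplified field and function names (not the kernel's struct timekeeper API):

    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* Hypothetical, trimmed-down model of the raw-clock fields touched above. */
    struct raw_tk_sketch {
            uint64_t xtime_nsec;  /* nanoseconds, left-shifted by 'shift' (fixed point) */
            uint32_t mult;        /* clocksource cycles -> shifted nanoseconds */
            uint32_t shift;
            uint64_t raw_sec;     /* whole seconds of CLOCK_MONOTONIC_RAW */
    };

    /* Accumulate a clocksource delta; overflowing seconds move into raw_sec. */
    static void raw_forward(struct raw_tk_sketch *tk, uint64_t delta_cycles)
    {
            const uint64_t snsec_per_sec = NSEC_PER_SEC << tk->shift;

            tk->xtime_nsec += delta_cycles * tk->mult;
            while (tk->xtime_nsec >= snsec_per_sec) {
                    tk->xtime_nsec -= snsec_per_sec;
                    tk->raw_sec++;
            }
    }

    /* Read the raw clock back the way getrawmonotonic64() now assembles it. */
    static void raw_read(const struct raw_tk_sketch *tk,
                         uint64_t *sec, uint64_t *nsec)
    {
            *sec  = tk->raw_sec;
            *nsec = tk->xtime_nsec >> tk->shift;
    }

Carrying the sub-second part only in shifted nanoseconds removes the shift-in/shift-out round trip through raw_time.tv_nsec on every accumulation step and lets the raw clock share the normalization path already used for the monotonic clock.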
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct *work; }; -static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b7882d45f48e..ca9460f049b8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1052,6 +1052,7 @@ config DEBUG_LOCK_ALLOC depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select LOCKDEP help This feature will check whether any held lock (spinlock, rwlock, @@ -1067,6 +1068,7 @@ config PROVE_LOCKING select LOCKDEP select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC select TRACE_IRQFLAGS default n @@ -1121,6 +1123,7 @@ config LOCK_STAT select LOCKDEP select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC default n help @@ -1301,189 +1304,7 @@ config DEBUG_CREDENTIALS If unsure, say N. -menu "RCU Debugging" - -config PROVE_RCU - def_bool PROVE_LOCKING - -config PROVE_RCU_REPEATEDLY - bool "RCU debugging: don't disable PROVE_RCU on first splat" - depends on PROVE_RCU - default n - help - By itself, PROVE_RCU will disable checking upon issuing the - first warning (or "splat"). This feature prevents such - disabling, allowing multiple RCU-lockdep warnings to be printed - on a single reboot. - - Say Y to allow multiple RCU-lockdep warnings per boot. - - Say N if you are unsure. - -config SPARSE_RCU_POINTER - bool "RCU debugging: sparse-based checks for pointer usage" - default n - help - This feature enables the __rcu sparse annotation for - RCU-protected pointers. This annotation will cause sparse - to flag any non-RCU used of annotated pointers. This can be - helpful when debugging RCU usage. Please note that this feature - is not intended to enforce code cleanliness; it is instead merely - a debugging aid. - - Say Y to make sparse flag questionable use of RCU-protected pointers - - Say N if you are unsure. - -config TORTURE_TEST - tristate - default n - -config RCU_PERF_TEST - tristate "performance tests for RCU" - depends on DEBUG_KERNEL - select TORTURE_TEST - select SRCU - select TASKS_RCU - default n - help - This option provides a kernel module that runs performance - tests on the RCU infrastructure. The kernel module may be built - after the fact on the running kernel to be tested, if desired. - - Say Y here if you want RCU performance tests to be built into - the kernel. - Say M if you want the RCU performance tests to build as a module. - Say N if you are unsure. - -config RCU_TORTURE_TEST - tristate "torture tests for RCU" - depends on DEBUG_KERNEL - select TORTURE_TEST - select SRCU - select TASKS_RCU - default n - help - This option provides a kernel module that runs torture tests - on the RCU infrastructure. The kernel module may be built - after the fact on the running kernel to be tested, if desired. - - Say Y here if you want RCU torture tests to be built into - the kernel. 
- Say M if you want the RCU torture tests to build as a module. - Say N if you are unsure. - -config RCU_TORTURE_TEST_SLOW_PREINIT - bool "Slow down RCU grace-period pre-initialization to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period pre-initialization (the - propagation of CPU-hotplug changes up the rcu_node combining - tree) for a few jiffies between initializing each pair of - consecutive rcu_node structures. This helps to expose races - involving grace-period pre-initialization, in other words, it - makes your kernel less stable. It can also greatly increase - grace-period latency, especially on systems with large numbers - of CPUs. This is useful when torture-testing RCU, but in - almost no other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY - int "How much to slow down RCU grace-period pre-initialization" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_PREINIT - help - This option specifies the number of jiffies to wait between - each rcu_node structure pre-initialization step. - -config RCU_TORTURE_TEST_SLOW_INIT - bool "Slow down RCU grace-period initialization to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period initialization for a few - jiffies between initializing each pair of consecutive - rcu_node structures. This helps to expose races involving - grace-period initialization, in other words, it makes your - kernel less stable. It can also greatly increase grace-period - latency, especially on systems with large numbers of CPUs. - This is useful when torture-testing RCU, but in almost no - other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_INIT_DELAY - int "How much to slow down RCU grace-period initialization" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_INIT - help - This option specifies the number of jiffies to wait between - each rcu_node structure initialization. - -config RCU_TORTURE_TEST_SLOW_CLEANUP - bool "Slow down RCU grace-period cleanup to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period cleanup for a few jiffies - between cleaning up each pair of consecutive rcu_node - structures. This helps to expose races involving grace-period - cleanup, in other words, it makes your kernel less stable. - It can also greatly increase grace-period latency, especially - on systems with large numbers of CPUs. This is useful when - torture-testing RCU, but in almost no other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY - int "How much to slow down RCU grace-period cleanup" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_CLEANUP - help - This option specifies the number of jiffies to wait between - each rcu_node structure cleanup operation. - -config RCU_CPU_STALL_TIMEOUT - int "RCU CPU stall timeout in seconds" - depends on RCU_STALL_COMMON - range 3 300 - default 21 - help - If a given RCU grace period extends more than the specified - number of seconds, a CPU stall warning is printed. If the - RCU grace period persists, additional CPU stall warnings are - printed at more widely spaced intervals. 
- -config RCU_TRACE - bool "Enable tracing for RCU" - depends on DEBUG_KERNEL - default y if TREE_RCU - select TRACE_CLOCK - help - This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. It also enables - additional tracepoints for ftrace-style event tracing. - - Say Y here if you want to enable RCU tracing - Say N if you are unsure. - -config RCU_EQS_DEBUG - bool "Provide debugging asserts for adding NO_HZ support to an arch" - depends on DEBUG_KERNEL - help - This option provides consistency checks in RCU's handling of - NO_HZ. These checks have proven quite helpful in detecting - bugs in arch-specific NO_HZ code. - - Say N here if you need ultimate kernel/user switch latencies - Say Y if you are unsure - -endmenu # "RCU Debugging" +source "kernel/rcu/Kconfig.debug" config DEBUG_WQ_FORCE_RR_CPU bool "Force round-robin CPU selection for unbound work items" diff --git a/lib/Makefile b/lib/Makefile index 519782d9ca3f..0e96b8f9c60d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,9 +25,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ earlycpio.o seq_buf.o siphash.o \ nmi_backtrace.o nodemask.o win_minmax.o -CFLAGS_radix-tree.o += -DCONFIG_SPARSE_RCU_POINTER -CFLAGS_idr.o += -DCONFIG_SPARSE_RCU_POINTER - lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_DMA_NOOP_OPS) += dma-noop.o diff --git a/lib/cpumask.c b/lib/cpumask.c index 81dedaab36cc..4731a0895760 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) } EXPORT_SYMBOL(cpumask_any_but); +/** + * cpumask_next_wrap - helper to implement for_each_cpu_wrap + * @n: the cpu prior to the place to search + * @mask: the cpumask pointer + * @start: the start point of the iteration + * @wrap: assume @n crossing @start terminates the iteration + * + * Returns >= nr_cpu_ids on completion + * + * Note: the @wrap argument is required for the start condition when + * we cannot assume @start is set in @mask. + */ +int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +{ + int next; + +again: + next = cpumask_next(n, mask); + + if (wrap && n < start && next >= start) { + return nr_cpumask_bits; + + } else if (next >= nr_cpumask_bits) { + wrap = true; + n = -1; + goto again; + } + + return next; +} +EXPORT_SYMBOL(cpumask_next_wrap); + /* These are not inline because of header tangles. 
*/ #ifdef CONFIG_CPUMASK_OFFSTACK /** diff --git a/lib/locking-selftest-rtmutex.h b/lib/locking-selftest-rtmutex.h new file mode 100644 index 000000000000..e3cb83989d16 --- /dev/null +++ b/lib/locking-selftest-rtmutex.h @@ -0,0 +1,11 @@ +#undef LOCK +#define LOCK RTL + +#undef UNLOCK +#define UNLOCK RTU + +#undef RLOCK +#undef WLOCK + +#undef INIT +#define INIT RTI diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index f3a217ea0388..6f2b135dc5e8 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -21,6 +21,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/irqflags.h> +#include <linux/rtmutex.h> /* * Change this to 1 if you want to see the failure printouts: @@ -46,6 +47,7 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose); #define LOCKTYPE_MUTEX 0x4 #define LOCKTYPE_RWSEM 0x8 #define LOCKTYPE_WW 0x10 +#define LOCKTYPE_RTMUTEX 0x20 static struct ww_acquire_ctx t, t2; static struct ww_mutex o, o2, o3; @@ -74,6 +76,15 @@ static DECLARE_RWSEM(rwsem_B); static DECLARE_RWSEM(rwsem_C); static DECLARE_RWSEM(rwsem_D); +#ifdef CONFIG_RT_MUTEXES + +static DEFINE_RT_MUTEX(rtmutex_A); +static DEFINE_RT_MUTEX(rtmutex_B); +static DEFINE_RT_MUTEX(rtmutex_C); +static DEFINE_RT_MUTEX(rtmutex_D); + +#endif + /* * Locks that we initialize dynamically as well so that * e.g. X1 and X2 becomes two instances of the same class, @@ -108,6 +119,17 @@ static DECLARE_RWSEM(rwsem_Y2); static DECLARE_RWSEM(rwsem_Z1); static DECLARE_RWSEM(rwsem_Z2); +#ifdef CONFIG_RT_MUTEXES + +static DEFINE_RT_MUTEX(rtmutex_X1); +static DEFINE_RT_MUTEX(rtmutex_X2); +static DEFINE_RT_MUTEX(rtmutex_Y1); +static DEFINE_RT_MUTEX(rtmutex_Y2); +static DEFINE_RT_MUTEX(rtmutex_Z1); +static DEFINE_RT_MUTEX(rtmutex_Z2); + +#endif + /* * non-inlined runtime initializers, to let separate locks share * the same lock-class: @@ -129,6 +151,17 @@ INIT_CLASS_FUNC(Z) static void init_shared_classes(void) { +#ifdef CONFIG_RT_MUTEXES + static struct lock_class_key rt_X, rt_Y, rt_Z; + + __rt_mutex_init(&rtmutex_X1, __func__, &rt_X); + __rt_mutex_init(&rtmutex_X2, __func__, &rt_X); + __rt_mutex_init(&rtmutex_Y1, __func__, &rt_Y); + __rt_mutex_init(&rtmutex_Y2, __func__, &rt_Y); + __rt_mutex_init(&rtmutex_Z1, __func__, &rt_Z); + __rt_mutex_init(&rtmutex_Z2, __func__, &rt_Z); +#endif + init_class_X(&lock_X1, &rwlock_X1, &mutex_X1, &rwsem_X1); init_class_X(&lock_X2, &rwlock_X2, &mutex_X2, &rwsem_X2); @@ -193,6 +226,10 @@ static void init_shared_classes(void) #define MU(x) mutex_unlock(&mutex_##x) #define MI(x) mutex_init(&mutex_##x) +#define RTL(x) rt_mutex_lock(&rtmutex_##x) +#define RTU(x) rt_mutex_unlock(&rtmutex_##x) +#define RTI(x) rt_mutex_init(&rtmutex_##x) + #define WSL(x) down_write(&rwsem_##x) #define WSU(x) up_write(&rwsem_##x) @@ -264,6 +301,11 @@ GENERATE_TESTCASE(AA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(AA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(AA_rtmutex); +#endif + #undef E /* @@ -345,6 +387,11 @@ GENERATE_TESTCASE(ABBA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(ABBA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABBA_rtmutex); +#endif + #undef E /* @@ -373,6 +420,11 @@ GENERATE_TESTCASE(ABBCCA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(ABBCCA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABBCCA_rtmutex); +#endif + #undef E /* @@ -401,6 +453,11 @@ GENERATE_TESTCASE(ABCABC_wsem) #include 
"locking-selftest-rsem.h" GENERATE_TESTCASE(ABCABC_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABCABC_rtmutex); +#endif + #undef E /* @@ -430,6 +487,11 @@ GENERATE_TESTCASE(ABBCCDDA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(ABBCCDDA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABBCCDDA_rtmutex); +#endif + #undef E /* @@ -458,6 +520,11 @@ GENERATE_TESTCASE(ABCDBDDA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(ABCDBDDA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABCDBDDA_rtmutex); +#endif + #undef E /* @@ -486,6 +553,11 @@ GENERATE_TESTCASE(ABCDBCDA_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(ABCDBCDA_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(ABCDBCDA_rtmutex); +#endif + #undef E /* @@ -513,33 +585,10 @@ GENERATE_TESTCASE(double_unlock_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(double_unlock_rsem) -#undef E - -/* - * Bad unlock ordering: - */ -#define E() \ - \ - LOCK(A); \ - LOCK(B); \ - UNLOCK(A); /* fail */ \ - UNLOCK(B); - -/* - * 6 testcases: - */ -#include "locking-selftest-spin.h" -GENERATE_TESTCASE(bad_unlock_order_spin) -#include "locking-selftest-wlock.h" -GENERATE_TESTCASE(bad_unlock_order_wlock) -#include "locking-selftest-rlock.h" -GENERATE_TESTCASE(bad_unlock_order_rlock) -#include "locking-selftest-mutex.h" -GENERATE_TESTCASE(bad_unlock_order_mutex) -#include "locking-selftest-wsem.h" -GENERATE_TESTCASE(bad_unlock_order_wsem) -#include "locking-selftest-rsem.h" -GENERATE_TESTCASE(bad_unlock_order_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(double_unlock_rtmutex); +#endif #undef E @@ -567,6 +616,11 @@ GENERATE_TESTCASE(init_held_wsem) #include "locking-selftest-rsem.h" GENERATE_TESTCASE(init_held_rsem) +#ifdef CONFIG_RT_MUTEXES +#include "locking-selftest-rtmutex.h" +GENERATE_TESTCASE(init_held_rtmutex); +#endif + #undef E /* @@ -916,6 +970,9 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) # define I_WW(x) lockdep_reset_lock(&x.dep_map) +#ifdef CONFIG_RT_MUTEXES +# define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) +#endif #else # define I_SPINLOCK(x) # define I_RWLOCK(x) @@ -924,12 +981,23 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) # define I_WW(x) #endif +#ifndef I_RTMUTEX +# define I_RTMUTEX(x) +#endif + +#ifdef CONFIG_RT_MUTEXES +#define I2_RTMUTEX(x) rt_mutex_init(&rtmutex_##x) +#else +#define I2_RTMUTEX(x) +#endif + #define I1(x) \ do { \ I_SPINLOCK(x); \ I_RWLOCK(x); \ I_MUTEX(x); \ I_RWSEM(x); \ + I_RTMUTEX(x); \ } while (0) #define I2(x) \ @@ -938,6 +1006,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) rwlock_init(&rwlock_##x); \ mutex_init(&mutex_##x); \ init_rwsem(&rwsem_##x); \ + I2_RTMUTEX(x); \ } while (0) static void reset_locks(void) @@ -1013,6 +1082,12 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) reset_locks(); } +#ifdef CONFIG_RT_MUTEXES +#define dotest_rt(fn, e, m) dotest((fn), (e), (m)) +#else +#define dotest_rt(fn, e, m) +#endif + static inline void print_testname(const char *testname) { printk("%33s:", testname); @@ -1050,6 +1125,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, 
LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ + dotest_rt(name##_rtmutex, FAILURE, LOCKTYPE_RTMUTEX); \ pr_cont("\n"); #define DO_TESTCASE_6_SUCCESS(desc, name) \ @@ -1060,6 +1136,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \ dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \ dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \ + dotest_rt(name##_rtmutex, SUCCESS, LOCKTYPE_RTMUTEX); \ pr_cont("\n"); /* @@ -1073,6 +1150,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ + dotest_rt(name##_rtmutex, FAILURE, LOCKTYPE_RTMUTEX); \ pr_cont("\n"); #define DO_TESTCASE_2I(desc, name, nr) \ @@ -1825,7 +1903,6 @@ void locking_selftest(void) DO_TESTCASE_6R("A-B-C-D-B-C-D-A deadlock", ABCDBCDA); DO_TESTCASE_6("double unlock", double_unlock); DO_TESTCASE_6("initialize held", init_held); - DO_TESTCASE_6_SUCCESS("bad unlock order", bad_unlock_order); printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); diff --git a/lib/refcount.c b/lib/refcount.c index 9f906783987e..5d0582a9480c 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -37,6 +37,8 @@ #include <linux/refcount.h> #include <linux/bug.h> +#ifdef CONFIG_REFCOUNT_FULL + /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount @@ -225,6 +227,7 @@ void refcount_dec(refcount_t *r) WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); } EXPORT_SYMBOL(refcount_dec); +#endif /* CONFIG_REFCOUNT_FULL */ /** * refcount_dec_if_one - decrement a refcount if it is 1 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 690d75b132fa..2fb007be0212 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, /* * It is valid to assume CPU-locality during early bootup: */ - if (system_state != SYSTEM_RUNNING) + if (system_state < SYSTEM_SCHEDULING) goto out; /* diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..398b46064544 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK diff --git a/mm/filemap.c b/mm/filemap.c index acbc0e28ad47..9236b94c446d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -879,10 +879,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -945,7 +945,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -956,9 +956,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, for (;;) { spin_lock_irq(&q->lock); - if (likely(list_empty(&wait->task_list))) { + if (likely(list_empty(&wait->entry))) { if (lock) - 
__add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -1018,7 +1018,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; @@ -1146,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1167,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * @@ -1178,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP #ifndef gup_get_pte /* @@ -1668,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..d75b38b66ef6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + INIT_LIST_HEAD(&owait.wait.entry); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mem_cgroup_mark_under_oom(memcg); @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. 
*/ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = diff --git a/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/rmap.c index d405f0e0ee96..130c238fe384 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; - int cpu; if (!tlb_ubc->flush_required) return; - cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - } - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); - cpumask_clear(&tlb_ubc->cpumask); + arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; - put_cpu(); } /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; - cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; /* diff --git a/mm/shmem.c index 391f2dcca727..9100c4952698 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1903,10 +1903,10 @@ unlock: * entry unconditionally - even if something else had already woken the * target.
*/ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); return ret; } @@ -2841,7 +2841,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); - WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..c3c1c6ac62da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3652,7 +3652,7 @@ int kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); + BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 7bc2208b6cc4..dca3cdd1a014 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,7 @@ enum { struct p9_poll_wait { struct p9_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_addr; }; @@ -522,7 +522,7 @@ error: clear_bit(Wworksched, &m->wsched); } -static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key) +static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct p9_poll_wait *pwait = container_of(wait, struct p9_poll_wait, wait); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 9a40013da915..30a07be6a110 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -481,7 +481,7 @@ static int bnep_session(void *arg) struct net_device *dev = s->dev; struct sock *sk = s->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG(""); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index f4c64ef01c24..19fc02c34048 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -280,7 +280,7 @@ static int cmtp_session(void *arg) struct cmtp_session *session = arg; struct sock *sk = session->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG("session %p", session); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 961f7f53e178..f24fcb9467a5 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session) static int hidp_session_thread(void *arg) { struct hidp_session *session = arg; - wait_queue_t ctrl_wait, intr_wait; + wait_queue_entry_t ctrl_wait, intr_wait; BT_DBG("session %p", session); diff --git a/net/core/datagram.c b/net/core/datagram.c index ef342800828f..accbcb7ef74c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { unsigned long bits = (unsigned long)key; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 
1a0c961f4ffe..c77ced0109b7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -343,7 +343,7 @@ found: * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * - * In order to propagate a wake up, a wait_queue_t of the client + * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or @@ -352,7 +352,7 @@ found: * was relayed. */ -static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, +static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3465a7c5a154..f4ac18970e7e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5534,23 +5534,6 @@ sub process { } } -# Check for expedited grace periods that interrupt non-idle non-nohz -# online CPUs. These expedited can therefore degrade real-time response -# if used carelessly, and should be avoided where not absolutely -# needed. It is always OK to use synchronize_rcu_expedited() and -# synchronize_sched_expedited() at boot time (before real-time applications -# start) and in error situations where real-time response is compromised in -# any case. Note that synchronize_srcu_expedited() does -not- interrupt -# other CPUs, so don't warn on uses of synchronize_srcu_expedited(). -# Of course, nothing comes for free, and srcu_read_lock() and -# srcu_read_unlock() do contain full memory barriers in payment for -# synchronize_srcu_expedited() non-interruption properties. - if ($line =~ /\b(synchronize_rcu_expedited|synchronize_sched_expedited)\(/) { - WARN("EXPEDITED_RCU_GRACE_PERIOD", - "expedited RCU grace periods should be avoided where they can degrade real-time response\n" . 
$herecurr); - - } - # check of hardware specific defines if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) { CHK("ARCH_DEFINES", diff --git a/security/keys/internal.h b/security/keys/internal.h index c0f8682eba69..91bc6214ae57 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -13,6 +13,7 @@ #define _INTERNAL_H #include <linux/sched.h> +#include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/key-type.h> #include <linux/task_work.h> diff --git a/sound/core/control.c b/sound/core/control.c index ecd358213b83..3c6be1452e35 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1557,7 +1557,7 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer, struct snd_ctl_event ev; struct snd_kctl_event *kev; while (list_empty(&ctl->events)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; goto __end_lock; diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c index 9602a7e38d8a..a73baa1242be 100644 --- a/sound/core/hwdep.c +++ b/sound/core/hwdep.c @@ -85,7 +85,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) int major = imajor(inode); struct snd_hwdep *hw; int err; - wait_queue_t wait; + wait_queue_entry_t wait; if (major == snd_major) { hw = snd_lookup_minor_data(iminor(inode), diff --git a/sound/core/init.c b/sound/core/init.c index 00f2cbb76e69..b4365bcf28a7 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -984,7 +984,7 @@ EXPORT_SYMBOL(snd_card_file_remove); */ int snd_power_wait(struct snd_card *card, unsigned int power_state) { - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; /* fastpath */ diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 5e1009d959a8..e49f448ee04f 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1518,7 +1518,7 @@ static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size) ssize_t result = 0; snd_pcm_state_t state; long res; - wait_queue_t wait; + wait_queue_entry_t wait; runtime = substream->runtime; init_waitqueue_entry(&wait, current); @@ -2334,7 +2334,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_oss_setup setup[2]; int nonblock; - wait_queue_t wait; + wait_queue_entry_t wait; err = nonseekable_open(inode, file); if (err < 0) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index e76d55a4d1b2..a93a4235a332 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1826,7 +1826,7 @@ static int wait_for_avail(struct snd_pcm_substream *substream, { struct snd_pcm_runtime *runtime = substream->runtime; int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; - wait_queue_t wait; + wait_queue_entry_t wait; int err = 0; snd_pcm_uframes_t avail = 0; long wait_time, tout; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9ade0c8b54a3..0fbeb2487b7c 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1776,7 +1776,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_pcm_substream *s; - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; int nonblock = 0; @@ -2467,7 +2467,7 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file) static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) { int err; - wait_queue_t wait; + 
wait_queue_entry_t wait; if (pcm == NULL) { err = -ENODEV; diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 153d78bc79c0..b3b353d72527 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -368,7 +368,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) int err; struct snd_rawmidi *rmidi; struct snd_rawmidi_file *rawmidi_file = NULL; - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK)) return -EINVAL; /* invalid combination */ @@ -1002,7 +1002,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready(substream)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1306,7 +1306,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready_append(substream, count)) { - wait_queue_t wait; + wait_queue_entry_t wait; if (file->f_flags & O_NONBLOCK) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1338,7 +1338,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, if (file->f_flags & O_DSYNC) { spin_lock_irq(&runtime->lock); while (runtime->avail != runtime->buffer_size) { - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int last_avail = runtime->avail; init_waitqueue_entry(&wait, current); add_wait_queue(&runtime->sleep, &wait); diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 01c4cfe30c9f..a8c2822e0198 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -179,7 +179,7 @@ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f, { struct snd_seq_event_cell *cell; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; if (snd_BUG_ON(!f)) return -EINVAL; diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index 512f78ea13da..f763682584a8 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -225,7 +225,7 @@ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; - wait_queue_t wait; + wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; diff --git a/sound/core/timer.c b/sound/core/timer.c index 4888203b2dbc..a9b9a277e00c 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1971,7 +1971,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, spin_lock_irq(&tu->qlock); while ((long)count - result >= unit) { while (!tu->qused) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 4dae9ff9ef5a..0b1e4b34b299 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -1782,7 +1782,7 @@ wavefront_should_cause_interrupt (snd_wavefront_t *dev, int val, int port, unsigned long timeout) { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); spin_lock_irq(&dev->irq_lock); diff --git a/sound/pci/mixart/mixart_core.c b/sound/pci/mixart/mixart_core.c index dccf3db48fe0..8bf2ce32d4a8 100644 --- a/sound/pci/mixart/mixart_core.c +++ b/sound/pci/mixart/mixart_core.c @@ -239,7 +239,7 @@ int 
snd_mixart_send_msg(struct mixart_mgr *mgr, struct mixart_msg *request, int struct mixart_msg resp; u32 msg_frame = 0; /* set to 0, so it's no notification to wait for, but the answer */ int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; init_waitqueue_entry(&wait, current); @@ -284,7 +284,7 @@ int snd_mixart_send_msg_wait_notif(struct mixart_mgr *mgr, struct mixart_msg *request, u32 notif_event) { int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; if (snd_BUG_ON(!notif_event)) diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c index fe4ba463b57c..1114166c685c 100644 --- a/sound/pci/ymfpci/ymfpci_main.c +++ b/sound/pci/ymfpci/ymfpci_main.c @@ -781,7 +781,7 @@ static snd_pcm_uframes_t snd_ymfpci_capture_pointer(struct snd_pcm_substream *su static void snd_ymfpci_irq_wait(struct snd_ymfpci *chip) { - wait_queue_t wait; + wait_queue_entry_t wait; int loops = 4; while (loops-- > 0) { diff --git a/tools/Makefile b/tools/Makefile index c8a90d01dd8e..221e1ce78b06 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -19,6 +19,7 @@ help: @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' @echo ' lguest - a minimal 32-bit x86 hypervisor' + @echo ' liblockdep - user-space wrapper for kernel locking-validator' @echo ' net - misc networking tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' @@ -89,7 +90,7 @@ freefall: FORCE kvm_stat: FORCE $(call descend,kvm/$@) -all: acpi cgroup cpupower gpio hv firewire lguest \ +all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \ perf selftests turbostat usb \ virtio vm net x86_energy_perf_policy \ tmon freefall objtool kvm_stat @@ -103,6 +104,9 @@ cpupower_install: cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install: $(call descend,$(@:_install=),install) +liblockdep_install: + $(call descend,lib/lockdep,install) + selftests_install: $(call descend,testing/$(@:_install=),install) @@ -119,7 +123,7 @@ kvm_stat_install: $(call descend,kvm/$(@:_install=),install) install: acpi_install cgroup_install cpupower_install gpio_install \ - hv_install firewire_install lguest_install \ + hv_install firewire_install lguest_install liblockdep_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install net_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install diff --git a/tools/include/asm/sections.h b/tools/include/asm/sections.h new file mode 100644 index 000000000000..a80643d7a7f1 --- /dev/null +++ b/tools/include/asm/sections.h @@ -0,0 +1,4 @@ +#ifndef __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H +#define __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H + +#endif /* __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H */ diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 1aecad369af5..969db1981868 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -61,4 +61,14 @@ static inline unsigned fls_long(unsigned long l) return fls64(l); } +/** + * rol32 - rotate a 32-bit value left + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u32 rol32(__u32 word, unsigned int shift) +{ + return (word << shift) | (word >> ((-shift) & 31)); +} + #endif diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h index 825d44f89a29..bd39b2090ad1 100644 --- 
a/tools/include/linux/compiler-gcc.h +++ b/tools/include/linux/compiler-gcc.h @@ -19,3 +19,13 @@ /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) + +#define noinline __attribute__((noinline)) + +#define __packed __attribute__((packed)) + +#define __noreturn __attribute__((noreturn)) + +#define __aligned(x) __attribute__((aligned(x))) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 23299d7e7160..d7a5604c38d7 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -17,6 +17,10 @@ # define __always_inline inline __attribute__((always_inline)) #endif +#ifndef noinline +#define noinline +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) @@ -45,6 +49,10 @@ # define __maybe_unused __attribute__((unused)) #endif +#ifndef __used +# define __used __attribute__((__unused__)) +#endif + #ifndef __packed # define __packed __attribute__((__packed__)) #endif @@ -65,6 +73,14 @@ # define unlikely(x) __builtin_expect(!!(x), 0) #endif +#ifndef __init +# define __init +#endif + +#ifndef noinline +# define noinline +#endif + #define uninitialized_var(x) x = *(&(x)) #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) diff --git a/tools/lib/lockdep/uinclude/linux/debug_locks.h b/tools/include/linux/debug_locks.h index f38eb64df794..61cc7f501168 100644 --- a/tools/lib/lockdep/uinclude/linux/debug_locks.h +++ b/tools/include/linux/debug_locks.h @@ -3,8 +3,9 @@ #include <stddef.h> #include <linux/compiler.h> +#include <asm/bug.h> -#define DEBUG_LOCKS_WARN_ON(x) (x) +#define DEBUG_LOCKS_WARN_ON(x) WARN_ON(x) extern bool debug_locks; extern bool debug_locks_silent; diff --git a/tools/include/linux/delay.h b/tools/include/linux/delay.h new file mode 100644 index 000000000000..55aa4173af1f --- /dev/null +++ b/tools/include/linux/delay.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_DELAY_H +#define _TOOLS_INCLUDE_LINUX_DELAY_H + +#endif /* _TOOLS_INCLUDE_LINUX_DELAY_H */ diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h index bdc3dd8131d4..abf0478a8fb2 100644 --- a/tools/include/linux/err.h +++ b/tools/include/linux/err.h @@ -46,4 +46,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr) return IS_ERR_VALUE((unsigned long)ptr); } +static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) +{ + return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); +} + #endif /* _LINUX_ERR_H */ diff --git a/tools/include/linux/ftrace.h b/tools/include/linux/ftrace.h new file mode 100644 index 000000000000..949f541ce11e --- /dev/null +++ b/tools/include/linux/ftrace.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_FTRACE_H +#define _TOOLS_INCLUDE_LINUX_FTRACE_H + +#endif /* _TOOLS_INCLUDE_LINUX_FTRACE_H */ diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h new file mode 100644 index 000000000000..22030756fbc0 --- /dev/null +++ b/tools/include/linux/gfp.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_GFP_H +#define _TOOLS_INCLUDE_LINUX_GFP_H + +#endif /* _TOOLS_INCLUDE_LINUX_GFP_H */ diff --git a/tools/lib/lockdep/uinclude/linux/hardirq.h b/tools/include/linux/hardirq.h index c8f3f8f58729..c8f3f8f58729 100644 --- a/tools/lib/lockdep/uinclude/linux/hardirq.h +++ 
b/tools/include/linux/hardirq.h diff --git a/tools/include/linux/interrupt.h b/tools/include/linux/interrupt.h new file mode 100644 index 000000000000..6be25bbdca9e --- /dev/null +++ b/tools/include/linux/interrupt.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_INTERRUPT_H +#define _TOOLS_INCLUDE_LINUX_INTERRUPT_H + +#endif /* _TOOLS_INCLUDE_LINUX_INTERRUPT_H */ diff --git a/tools/lib/lockdep/uinclude/linux/irqflags.h b/tools/include/linux/irqflags.h index 6cc296f0fad0..df77669cfe1c 100644 --- a/tools/lib/lockdep/uinclude/linux/irqflags.h +++ b/tools/include/linux/irqflags.h @@ -17,19 +17,19 @@ #define raw_local_irq_disable() do { } while (0) #define raw_local_irq_enable() do { } while (0) #define raw_local_irq_save(flags) ((flags) = 0) -#define raw_local_irq_restore(flags) do { } while (0) +#define raw_local_irq_restore(flags) ((void)(flags)) #define raw_local_save_flags(flags) ((flags) = 0) -#define raw_irqs_disabled_flags(flags) do { } while (0) +#define raw_irqs_disabled_flags(flags) ((void)(flags)) #define raw_irqs_disabled() 0 #define raw_safe_halt() #define local_irq_enable() do { } while (0) #define local_irq_disable() do { } while (0) #define local_irq_save(flags) ((flags) = 0) -#define local_irq_restore(flags) do { } while (0) +#define local_irq_restore(flags) ((void)(flags)) #define local_save_flags(flags) ((flags) = 0) #define irqs_disabled() (1) -#define irqs_disabled_flags(flags) (0) +#define irqs_disabled_flags(flags) ((void)(flags), 0) #define safe_halt() do { } while (0) #define trace_lock_release(x, y) diff --git a/tools/include/linux/jhash.h b/tools/include/linux/jhash.h new file mode 100644 index 000000000000..348c6f47e4cc --- /dev/null +++ b/tools/include/linux/jhash.h @@ -0,0 +1,175 @@ +#ifndef _LINUX_JHASH_H +#define _LINUX_JHASH_H + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose. It's in + * the public domain. It has no warranty. + * + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) + * + * I've modified Bob's hash to be useful in the Linux kernel, and + * any bugs present are my fault. + * Jozsef + */ +#include <linux/bitops.h> +#include <linux/unaligned/packed_struct.h> + +/* Best hash sizes are of power of two */ +#define jhash_size(n) ((u32)1<<(n)) +/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ +#define jhash_mask(n) (jhash_size(n)-1) + +/* __jhash_mix -- mix 3 32-bit values reversibly. 
*/ +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +/* An arbitrary initial parameter */ +#define JHASH_INITVAL 0xdeadbeef + +/* jhash - hash an arbitrary key + * @k: sequence of bytes as key + * @length: the length of the key + * @initval: the previous hash, or an arbitray value + * + * The generic version, hashes an arbitrary sequence of bytes. + * No alignment or length assumptions are made about the input key. + * + * Returns the hash value of the key. The result depends on endianness. + */ +static inline u32 jhash(const void *key, u32 length, u32 initval) +{ + u32 a, b, c; + const u8 *k = key; + + /* Set up the internal state */ + a = b = c = JHASH_INITVAL + length + initval; + + /* All but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) { + a += __get_unaligned_cpu32(k); + b += __get_unaligned_cpu32(k + 4); + c += __get_unaligned_cpu32(k + 8); + __jhash_mix(a, b, c); + length -= 12; + k += 12; + } + /* Last block: affect all 32 bits of (c) */ + /* All the case statements fall through */ + switch (length) { + case 12: c += (u32)k[11]<<24; + case 11: c += (u32)k[10]<<16; + case 10: c += (u32)k[9]<<8; + case 9: c += k[8]; + case 8: b += (u32)k[7]<<24; + case 7: b += (u32)k[6]<<16; + case 6: b += (u32)k[5]<<8; + case 5: b += k[4]; + case 4: a += (u32)k[3]<<24; + case 3: a += (u32)k[2]<<16; + case 2: a += (u32)k[1]<<8; + case 1: a += k[0]; + __jhash_final(a, b, c); + case 0: /* Nothing left to add */ + break; + } + + return c; +} + +/* jhash2 - hash an array of u32's + * @k: the key which must be an array of u32's + * @length: the number of u32's in the key + * @initval: the previous hash, or an arbitray value + * + * Returns the hash value of the key. 
+ */ +static inline u32 jhash2(const u32 *k, u32 length, u32 initval) +{ + u32 a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INITVAL + (length<<2) + initval; + + /* Handle most of the key */ + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + length -= 3; + k += 3; + } + + /* Handle the last 3 u32's: all the case statements fall through */ + switch (length) { + case 3: c += k[2]; + case 2: b += k[1]; + case 1: a += k[0]; + __jhash_final(a, b, c); + case 0: /* Nothing left to add */ + break; + } + + return c; +} + + +/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ +static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) +{ + a += initval; + b += initval; + c += initval; + + __jhash_final(a, b, c); + + return c; +} + +static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) +{ + return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); +} + +static inline u32 jhash_2words(u32 a, u32 b, u32 initval) +{ + return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); +} + +static inline u32 jhash_1word(u32 a, u32 initval) +{ + return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); +} + +#endif /* _LINUX_JHASH_H */ diff --git a/tools/lib/lockdep/uinclude/linux/kallsyms.h b/tools/include/linux/kallsyms.h index b0f2dbdf1a15..582cc1e5f3a4 100644 --- a/tools/lib/lockdep/uinclude/linux/kallsyms.h +++ b/tools/include/linux/kallsyms.h @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <stdio.h> +#include <unistd.h> #define KSYM_NAME_LEN 128 @@ -24,7 +25,7 @@ static inline void print_ip_sym(unsigned long ip) name = backtrace_symbols((void **)&ip, 1); - printf("%s\n", *name); + dprintf(STDOUT_FILENO, "%s\n", *name); free(name); } diff --git a/tools/lib/lockdep/uinclude/linux/kern_levels.h b/tools/include/linux/kern_levels.h index 3b9bade28698..3b9bade28698 100644 --- a/tools/lib/lockdep/uinclude/linux/kern_levels.h +++ b/tools/include/linux/kern_levels.h diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 73ccc48126bb..801b927499f2 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -32,6 +32,7 @@ (type *)((char *)__mptr - offsetof(type, member)); }) #endif +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #ifndef max @@ -89,4 +90,7 @@ int scnprintf(char * buf, size_t size, const char * fmt, ...); #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) #define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define current_gfp_context(k) 0 +#define synchronize_sched() + #endif diff --git a/tools/lib/lockdep/uinclude/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h index 94d598bc6abe..94d598bc6abe 100644 --- a/tools/lib/lockdep/uinclude/linux/kmemcheck.h +++ b/tools/include/linux/kmemcheck.h diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h new file mode 100644 index 000000000000..bc763d500262 --- /dev/null +++ b/tools/include/linux/linkage.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H +#define _TOOLS_INCLUDE_LINUX_LINKAGE_H + +#endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ diff --git a/tools/lib/lockdep/uinclude/linux/lockdep.h b/tools/include/linux/lockdep.h index c808c7d02d21..8da3e8effafa 100644 --- a/tools/lib/lockdep/uinclude/linux/lockdep.h +++ b/tools/include/linux/lockdep.h @@ -7,8 +7,15 @@ #include <limits.h> #include <linux/utsname.h> #include <linux/compiler.h> +#include <linux/export.h> +#include 
<linux/kern_levels.h> +#include <linux/err.h> +#include <linux/rcu.h> +#include <linux/list.h> +#include <linux/hardirq.h> +#include <unistd.h> -#define MAX_LOCK_DEPTH 2000UL +#define MAX_LOCK_DEPTH 63UL #define asmlinkage #define __visible @@ -29,31 +36,32 @@ extern struct task_struct *__curr(void); #define current (__curr()) -#define debug_locks_off() 1 +static inline int debug_locks_off(void) +{ + return 1; +} + #define task_pid_nr(tsk) ((tsk)->pid) #define KSYM_NAME_LEN 128 -#define printk printf +#define printk(...) dprintf(STDOUT_FILENO, __VA_ARGS__) +#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#define pr_warn pr_err #define list_del_rcu list_del #define atomic_t unsigned long #define atomic_inc(x) ((*(x))++) -static struct new_utsname *init_utsname(void) -{ - static struct new_utsname n = (struct new_utsname) { - .release = "liblockdep", - .version = LIBLOCKDEP_VERSION, - }; - - return &n; -} - #define print_tainted() "" #define static_obj(x) 1 #define debug_show_all_locks() extern void debug_check_no_locks_held(void); +static __used bool __is_kernel_percpu_address(unsigned long addr, void *can_addr) +{ + return false; +} + #endif diff --git a/tools/lib/lockdep/uinclude/linux/module.h b/tools/include/linux/module.h index 09c7a7be8ccc..07055db296f3 100644 --- a/tools/lib/lockdep/uinclude/linux/module.h +++ b/tools/include/linux/module.h @@ -3,4 +3,9 @@ #define module_param(name, type, perm) +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + #endif diff --git a/tools/include/linux/mutex.h b/tools/include/linux/mutex.h new file mode 100644 index 000000000000..a8180d25f2fc --- /dev/null +++ b/tools/include/linux/mutex.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_MUTEX_H +#define _TOOLS_INCLUDE_LINUX_MUTEX_H + +#endif /* _TOOLS_INCLUDE_LINUX_MUTEX_H */ diff --git a/tools/include/linux/proc_fs.h b/tools/include/linux/proc_fs.h new file mode 100644 index 000000000000..8b3b03b64fda --- /dev/null +++ b/tools/include/linux/proc_fs.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_PROC_FS_H +#define _TOOLS_INCLUDE_LINUX_PROC_FS_H + +#endif /* _TOOLS_INCLUDE_LINUX_PROC_FS_H */ diff --git a/tools/lib/lockdep/uinclude/linux/rcu.h b/tools/include/linux/rcu.h index 042ee8e463c9..5080649dad04 100644 --- a/tools/lib/lockdep/uinclude/linux/rcu.h +++ b/tools/include/linux/rcu.h @@ -18,4 +18,7 @@ static inline bool rcu_is_watching(void) return false; } +#define rcu_assign_pointer(p, v) ((p) = (v)) +#define RCU_INIT_POINTER(p, v) p=(v) + #endif diff --git a/tools/include/linux/sched/clock.h b/tools/include/linux/sched/clock.h new file mode 100644 index 000000000000..5837d17c4182 --- /dev/null +++ b/tools/include/linux/sched/clock.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_PERF_LINUX_SCHED_CLOCK_H +#define _TOOLS_PERF_LINUX_SCHED_CLOCK_H + +#endif /* _TOOLS_PERF_LINUX_SCHED_CLOCK_H */ diff --git a/tools/include/linux/sched/mm.h b/tools/include/linux/sched/mm.h new file mode 100644 index 000000000000..c8d9f19c1f35 --- /dev/null +++ b/tools/include/linux/sched/mm.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_PERF_LINUX_SCHED_MM_H +#define _TOOLS_PERF_LINUX_SCHED_MM_H + +#endif /* _TOOLS_PERF_LINUX_SCHED_MM_H */ diff --git a/tools/include/linux/sched/task.h b/tools/include/linux/sched/task.h new file mode 100644 index 000000000000..a97890eca110 --- /dev/null +++ b/tools/include/linux/sched/task.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_PERF_LINUX_SCHED_TASK_H +#define _TOOLS_PERF_LINUX_SCHED_TASK_H + +#endif /* _TOOLS_PERF_LINUX_SCHED_TASK_H */ 
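
Since the full Jenkins hash implementation is now carried in tools/include/linux/jhash.h, user-space code built against tools/include (liblockdep, for instance) can hash keys the same way the kernel does. A minimal sketch of exercising the imported helpers follows; the build line and the explicit <linux/types.h> include for the u32 typedef are assumptions about the tools/include environment, not something the patches above add:

/*
 * Hypothetical standalone demo of the imported jhash helpers (not part
 * of the patch set). Assumed build, from the top of the kernel tree:
 *   gcc -I tools/include -o jhash_demo jhash_demo.c
 */
#include <stdio.h>
#include <string.h>
#include <linux/types.h>	/* assumed to supply u8/u32 for jhash.h */
#include <linux/jhash.h>

int main(void)
{
	const char key[] = "liblockdep";
	u32 words[] = { 0x1, 0x2, 0x3 };

	/* Hash an arbitrary byte string, then an array of u32s. */
	printf("jhash:  0x%08x\n", jhash(key, (u32)strlen(key), 0));
	printf("jhash2: 0x%08x\n", jhash2(words, 3, 0));
	printf("3words: 0x%08x\n", jhash_3words(1, 2, 3, 0));
	return 0;
}
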
diff --git a/tools/include/linux/seq_file.h b/tools/include/linux/seq_file.h new file mode 100644 index 000000000000..102fd9217f1f --- /dev/null +++ b/tools/include/linux/seq_file.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_LINUX_SEQ_FILE_H +#define _TOOLS_INCLUDE_LINUX_SEQ_FILE_H + +#endif /* _TOOLS_INCLUDE_LINUX_SEQ_FILE_H */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 58397dcb19d6..417cda4f793f 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -1,5 +1,31 @@ +#ifndef __LINUX_SPINLOCK_H_ +#define __LINUX_SPINLOCK_H_ + +#include <pthread.h> +#include <stdbool.h> + #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) + +#define arch_spinlock_t pthread_mutex_t +#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER + +static inline void arch_spin_lock(arch_spinlock_t *mutex) +{ + pthread_mutex_lock(mutex); +} + +static inline void arch_spin_unlock(arch_spinlock_t *mutex) +{ + pthread_mutex_unlock(mutex); +} + +static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) +{ + return true; +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/stacktrace.h b/tools/include/linux/stacktrace.h index 39aecc6b19d1..39aecc6b19d1 100644 --- a/tools/lib/lockdep/uinclude/linux/stacktrace.h +++ b/tools/include/linux/stacktrace.h diff --git a/tools/include/linux/unaligned/packed_struct.h b/tools/include/linux/unaligned/packed_struct.h new file mode 100644 index 000000000000..c0d817de4df2 --- /dev/null +++ b/tools/include/linux/unaligned/packed_struct.h @@ -0,0 +1,46 @@ +#ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H +#define _LINUX_UNALIGNED_PACKED_STRUCT_H + +#include <linux/kernel.h> + +struct __una_u16 { u16 x; } __packed; +struct __una_u32 { u32 x; } __packed; +struct __una_u64 { u64 x; } __packed; + +static inline u16 __get_unaligned_cpu16(const void *p) +{ + const struct __una_u16 *ptr = (const struct __una_u16 *)p; + return ptr->x; +} + +static inline u32 __get_unaligned_cpu32(const void *p) +{ + const struct __una_u32 *ptr = (const struct __una_u32 *)p; + return ptr->x; +} + +static inline u64 __get_unaligned_cpu64(const void *p) +{ + const struct __una_u64 *ptr = (const struct __una_u64 *)p; + return ptr->x; +} + +static inline void __put_unaligned_cpu16(u16 val, void *p) +{ + struct __una_u16 *ptr = (struct __una_u16 *)p; + ptr->x = val; +} + +static inline void __put_unaligned_cpu32(u32 val, void *p) +{ + struct __una_u32 *ptr = (struct __una_u32 *)p; + ptr->x = val; +} + +static inline void __put_unaligned_cpu64(u64 val, void *p) +{ + struct __una_u64 *ptr = (struct __una_u64 *)p; + ptr->x = val; +} + +#endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */ diff --git a/tools/include/trace/events/lock.h b/tools/include/trace/events/lock.h new file mode 100644 index 000000000000..5b15fd5ee1af --- /dev/null +++ b/tools/include/trace/events/lock.h @@ -0,0 +1,4 @@ +#ifndef _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H +#define _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H + +#endif /* _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H */ diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 809c7721cd24..a7ecf8f469f4 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -387,6 +387,22 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep) return err; } +int filename__write_int(const char *filename, int value) +{ + int fd = open(filename, 
O_WRONLY), err = -1; + char buf[64]; + + if (fd < 0) + return err; + + sprintf(buf, "%d", value); + if (write(fd, buf, sizeof(buf)) == sizeof(buf)) + err = 0; + + close(fd); + return err; +} + int procfs__read_str(const char *entry, char **buf, size_t *sizep) { char path[PATH_MAX]; @@ -480,3 +496,17 @@ int sysctl__read_int(const char *sysctl, int *value) return filename__read_int(path, value); } + +int sysfs__write_int(const char *entry, int value) +{ + char path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + + if (!sysfs) + return -1; + + if (snprintf(path, sizeof(path), "%s/%s", sysfs, entry) >= PATH_MAX) + return -1; + + return filename__write_int(path, value); +} diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 956c21127d1e..45605348461e 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -31,6 +31,8 @@ int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); int filename__read_str(const char *filename, char **buf, size_t *sizep); +int filename__write_int(const char *filename, int value); + int procfs__read_str(const char *entry, char **buf, size_t *sizep); int sysctl__read_int(const char *sysctl, int *value); @@ -38,4 +40,6 @@ int sysfs__read_int(const char *entry, int *value); int sysfs__read_ull(const char *entry, unsigned long long *value); int sysfs__read_str(const char *entry, char **buf, size_t *sizep); int sysfs__read_bool(const char *entry, bool *value); + +int sysfs__write_int(const char *entry, int value); #endif /* __API_FS__ */ diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile index 3bc0ef9f8923..ed9ace59d112 100644 --- a/tools/lib/lockdep/Makefile +++ b/tools/lib/lockdep/Makefile @@ -79,6 +79,7 @@ INCLUDES = -I. 
-I./uinclude -I./include -I../../include $(CONFIG_INCLUDES) # Set compile option CFLAGS if not set elsewhere CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g CFLAGS += -fPIC +CFLAGS += -Wall override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ) @@ -100,7 +101,7 @@ include $(srctree)/tools/build/Makefile.include do_compile_shared_library = \ ($(print_shared_lib_compile) \ - $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so)) + $(CC) $(LDFLAGS) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='$(@F)';$(shell ln -sf $(@F) $(@D)/liblockdep.so)) do_build_static_lib = \ ($(print_static_lib_build) \ @@ -118,10 +119,10 @@ all_cmd: $(CMD_TARGETS) $(LIB_IN): force $(Q)$(MAKE) $(build)=liblockdep -liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN) +$(OUTPUT)liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN) $(Q)$(do_compile_shared_library) -liblockdep.a: $(LIB_IN) +$(OUTPUT)liblockdep.a: $(LIB_IN) $(Q)$(do_build_static_lib) tags: force @@ -149,7 +150,7 @@ install_lib: all_cmd install: install_lib clean: - $(RM) *.o *~ $(TARGETS) *.a *liblockdep*.so* $(VERSION_FILES) .*.d .*.cmd + $(RM) $(OUTPUT)*.o *~ $(TARGETS) $(OUTPUT)*.a $(OUTPUT)*liblockdep*.so* $(VERSION_FILES) $(OUTPUT).*.d $(OUTPUT).*.cmd $(RM) tags TAGS PHONY += force diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c index a0a2e3a266af..ced6d7443cea 100644 --- a/tools/lib/lockdep/lockdep.c +++ b/tools/lib/lockdep/lockdep.c @@ -1,8 +1,27 @@ #include <linux/lockdep.h> +#include <stdlib.h> /* Trivial API wrappers, we don't (yet) have RCU in user-space: */ #define hlist_for_each_entry_rcu hlist_for_each_entry #define hlist_add_head_rcu hlist_add_head #define hlist_del_rcu hlist_del +#define list_for_each_entry_rcu list_for_each_entry +#define list_add_tail_rcu list_add_tail + +u32 prandom_u32(void) +{ + /* Used only by lock_pin_lock() which is dead code */ + abort(); +} + +static struct new_utsname *init_utsname(void) +{ + static struct new_utsname n = (struct new_utsname) { + .release = "liblockdep", + .version = LIBLOCKDEP_VERSION, + }; + + return &n; +} #include "../../../kernel/locking/lockdep.c" diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c index 52844847569c..6a2d3c5d4e92 100644 --- a/tools/lib/lockdep/preload.c +++ b/tools/lib/lockdep/preload.c @@ -4,6 +4,7 @@ #include <dlfcn.h> #include <stdlib.h> #include <sysexits.h> +#include <unistd.h> #include "include/liblockdep/mutex.h" #include "../../include/linux/rbtree.h" @@ -122,8 +123,6 @@ static struct rb_node **__get_lock_node(void *lock, struct rb_node **parent) #define LIBLOCKDEP_STATIC_ENTRIES 1024 #endif -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) - static struct lock_lookup __locks[LIBLOCKDEP_STATIC_ENTRIES]; static int __locks_nr; @@ -149,7 +148,7 @@ static struct lock_lookup *alloc_lock(void) int idx = __locks_nr++; if (idx >= ARRAY_SIZE(__locks)) { - fprintf(stderr, + dprintf(STDERR_FILENO, "LOCKDEP error: insufficient LIBLOCKDEP_STATIC_ENTRIES\n"); exit(EX_UNAVAILABLE); } diff --git a/tools/lib/lockdep/rbtree.c b/tools/lib/lockdep/rbtree.c index f7f43033c8b7..297c304571f8 100644 --- a/tools/lib/lockdep/rbtree.c +++ b/tools/lib/lockdep/rbtree.c @@ -1 +1 @@ -#include "../../../lib/rbtree.c" +#include "../../lib/rbtree.c" diff --git a/tools/lib/lockdep/run_tests.sh b/tools/lib/lockdep/run_tests.sh index 1069d96248c1..f9b94098fc98 100755 --- a/tools/lib/lockdep/run_tests.sh 
+++ b/tools/lib/lockdep/run_tests.sh @@ -4,9 +4,9 @@ make &> /dev/null for i in `ls tests/*.c`; do testname=$(basename "$i" .c) - gcc -o tests/$testname -pthread -lpthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null + gcc -o tests/$testname -pthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null echo -ne "$testname... " - if [ $(timeout 1 ./tests/$testname | wc -l) -gt 0 ]; then + if [ $(timeout 1 ./tests/$testname 2>&1 | wc -l) -gt 0 ]; then echo "PASSED!" else echo "FAILED!" @@ -18,9 +18,9 @@ done for i in `ls tests/*.c`; do testname=$(basename "$i" .c) - gcc -o tests/$testname -pthread -lpthread -Iinclude $i &> /dev/null + gcc -o tests/$testname -pthread -Iinclude $i &> /dev/null echo -ne "(PRELOAD) $testname... " - if [ $(timeout 1 ./lockdep ./tests/$testname | wc -l) -gt 0 ]; then + if [ $(timeout 1 ./lockdep ./tests/$testname 2>&1 | wc -l) -gt 0 ]; then echo "PASSED!" else echo "FAILED!" diff --git a/tools/lib/lockdep/uinclude/asm/hash.h b/tools/lib/lockdep/uinclude/asm/hash.h deleted file mode 100644 index d82b170bb216..000000000000 --- a/tools/lib/lockdep/uinclude/asm/hash.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_GENERIC_HASH_H -#define __ASM_GENERIC_HASH_H - -/* Stub */ - -#endif /* __ASM_GENERIC_HASH_H */ diff --git a/tools/lib/lockdep/uinclude/asm/hweight.h b/tools/lib/lockdep/uinclude/asm/hweight.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/asm/hweight.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/asm/sections.h b/tools/lib/lockdep/uinclude/asm/sections.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/asm/sections.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/bitops.h b/tools/lib/lockdep/uinclude/linux/bitops.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/bitops.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h deleted file mode 100644 index fd3e56a83fc2..000000000000 --- a/tools/lib/lockdep/uinclude/linux/compiler.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _LIBLOCKDEP_LINUX_COMPILER_H_ -#define _LIBLOCKDEP_LINUX_COMPILER_H_ - -#define __used __attribute__((__unused__)) -#define unlikely -#define READ_ONCE(x) (x) -#define WRITE_ONCE(x, val) x=(val) -#define RCU_INIT_POINTER(p, v) p=(v) - -#endif diff --git a/tools/lib/lockdep/uinclude/linux/delay.h b/tools/lib/lockdep/uinclude/linux/delay.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/delay.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/ftrace.h b/tools/lib/lockdep/uinclude/linux/ftrace.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/ftrace.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/gfp.h b/tools/lib/lockdep/uinclude/linux/gfp.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/gfp.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/hash.h b/tools/lib/lockdep/uinclude/linux/hash.h deleted file mode 100644 index 0f8479858dc0..000000000000 --- a/tools/lib/lockdep/uinclude/linux/hash.h +++ /dev/null @@ -1 +0,0 @@ -#include 
"../../../include/linux/hash.h" diff --git a/tools/lib/lockdep/uinclude/linux/interrupt.h b/tools/lib/lockdep/uinclude/linux/interrupt.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/interrupt.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/kernel.h b/tools/lib/lockdep/uinclude/linux/kernel.h deleted file mode 100644 index 276c7a8b2ed1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/kernel.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef _LIBLOCKDEP_LINUX_KERNEL_H_ -#define _LIBLOCKDEP_LINUX_KERNEL_H_ - -#include <linux/export.h> -#include <linux/types.h> -#include <linux/rcu.h> -#include <linux/hardirq.h> -#include <linux/kern_levels.h> - -#ifndef container_of -#define container_of(ptr, type, member) ({ \ - const typeof(((type *)0)->member) * __mptr = (ptr); \ - (type *)((char *)__mptr - offsetof(type, member)); }) -#endif - -#define max(x, y) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - (void) (&_max1 == &_max2); \ - _max1 > _max2 ? _max1 : _max2; }) - -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#define WARN_ON(x) (x) -#define WARN_ON_ONCE(x) (x) -#define likely(x) (x) -#define WARN(x, y...) (x) -#define uninitialized_var(x) x -#define __init -#define noinline -#define list_add_tail_rcu list_add_tail -#define list_for_each_entry_rcu list_for_each_entry -#define barrier() -#define synchronize_sched() - -#ifndef CALLER_ADDR0 -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#endif - -#ifndef _RET_IP_ -#define _RET_IP_ CALLER_ADDR0 -#endif - -#ifndef _THIS_IP_ -#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -#endif - -#endif diff --git a/tools/lib/lockdep/uinclude/linux/linkage.h b/tools/lib/lockdep/uinclude/linux/linkage.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/linkage.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/list.h b/tools/lib/lockdep/uinclude/linux/list.h deleted file mode 100644 index 6e9ef31ed82e..000000000000 --- a/tools/lib/lockdep/uinclude/linux/list.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../include/linux/list.h" diff --git a/tools/lib/lockdep/uinclude/linux/mutex.h b/tools/lib/lockdep/uinclude/linux/mutex.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/mutex.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/poison.h b/tools/lib/lockdep/uinclude/linux/poison.h deleted file mode 100644 index 0c27bdf14233..000000000000 --- a/tools/lib/lockdep/uinclude/linux/poison.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../include/linux/poison.h" diff --git a/tools/lib/lockdep/uinclude/linux/prefetch.h b/tools/lib/lockdep/uinclude/linux/prefetch.h deleted file mode 100644 index d73fe6f850ac..000000000000 --- a/tools/lib/lockdep/uinclude/linux/prefetch.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _LIBLOCKDEP_LINUX_PREFETCH_H_ -#define _LIBLOCKDEP_LINUX_PREFETCH_H - -static inline void prefetch(void *a __attribute__((unused))) { } - -#endif diff --git a/tools/lib/lockdep/uinclude/linux/proc_fs.h b/tools/lib/lockdep/uinclude/linux/proc_fs.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/proc_fs.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h 
b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h deleted file mode 100644 index c3759477379c..000000000000 --- a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h +++ /dev/null @@ -1,2 +0,0 @@ -#define __always_inline -#include "../../../include/linux/rbtree_augmented.h" diff --git a/tools/lib/lockdep/uinclude/linux/seq_file.h b/tools/lib/lockdep/uinclude/linux/seq_file.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/linux/seq_file.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/lib/lockdep/uinclude/linux/spinlock.h b/tools/lib/lockdep/uinclude/linux/spinlock.h deleted file mode 100644 index 68c1aa2bcba5..000000000000 --- a/tools/lib/lockdep/uinclude/linux/spinlock.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _LIBLOCKDEP_SPINLOCK_H_ -#define _LIBLOCKDEP_SPINLOCK_H_ - -#include <pthread.h> -#include <stdbool.h> - -#define arch_spinlock_t pthread_mutex_t -#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER - -static inline void arch_spin_lock(arch_spinlock_t *mutex) -{ - pthread_mutex_lock(mutex); -} - -static inline void arch_spin_unlock(arch_spinlock_t *mutex) -{ - pthread_mutex_unlock(mutex); -} - -static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) -{ - return true; -} - -#endif diff --git a/tools/lib/lockdep/uinclude/linux/stringify.h b/tools/lib/lockdep/uinclude/linux/stringify.h deleted file mode 100644 index 05dfcd1ac118..000000000000 --- a/tools/lib/lockdep/uinclude/linux/stringify.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _LIBLOCKDEP_LINUX_STRINGIFY_H_ -#define _LIBLOCKDEP_LINUX_STRINGIFY_H_ - -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#endif diff --git a/tools/lib/lockdep/uinclude/trace/events/lock.h b/tools/lib/lockdep/uinclude/trace/events/lock.h deleted file mode 100644 index fab00ff936d1..000000000000 --- a/tools/lib/lockdep/uinclude/trace/events/lock.h +++ /dev/null @@ -1,3 +0,0 @@ - -/* empty file */ - diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index b0b3007d3c9c..d157dee7a4ec 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -364,6 +364,42 @@ cyc_thresh Specifies how frequently CYC packets are produced - see cyc CYC packets are not requested by default. +pt Specifies pass-through which enables the 'branch' config term. + + The default config selects 'pt' if it is available, so a user will + never need to specify this term. + +branch Enable branch tracing. Branch tracing is enabled by default so to + disable branch tracing use 'branch=0'. + + The default config selects 'branch' if it is available. + +ptw Enable PTWRITE packets which are produced when a ptwrite instruction + is executed. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/ptwrite + + which contains "1" if the feature is supported and + "0" otherwise. + +fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet + provides the address of the ptwrite instruction. In the absence of + fup_on_ptw, the decoder will use the address of the previous branch + if branch tracing is enabled, otherwise the address will be zero. + Note that fup_on_ptw will work even when branch tracing is disabled. + +pwr_evt Enable power events. The power events provide information about + changes to the CPU C-state. 
+ + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/power_event_trace + + which contains "1" if the feature is supported and + "0" otherwise. + new snapshot option ------------------- diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 6e6a8b22c859..721a447f046e 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -48,6 +48,39 @@ OPTIONS Ranges of CPUs are specified with -: 0-2. Default is to trace on all online CPUs. +-T:: +--trace-funcs=:: + Only trace functions given by the argument. Multiple functions + can be given by using this option more than once. The function + argument also can be a glob pattern. It will be passed to + 'set_ftrace_filter' in tracefs. + +-N:: +--notrace-funcs=:: + Do not trace functions given by the argument. Like -T option, + this can be used more than once to specify multiple functions + (or glob patterns). It will be passed to 'set_ftrace_notrace' + in tracefs. + +-G:: +--graph-funcs=:: + Set graph filter on the given function (or a glob pattern). + This is useful for the function_graph tracer only and enables + tracing for functions executed from the given function. + This can be used more than once to specify multiple functions. + It will be passed to 'set_graph_function' in tracefs. + +-g:: +--nograph-funcs=:: + Set graph notrace filter on the given function (or a glob pattern). + Like -G option, this is useful for the function_graph tracer only + and disables tracing for function executed from the given function. + This can be used more than once to specify multiple functions. + It will be passed to 'set_graph_notrace' in tracefs. + +-D:: +--graph-depth=:: + Set max depth for function graph tracer to follow SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 3517e204a2b3..e2468ed6a307 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -116,7 +116,7 @@ OPTIONS --fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, - srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, + srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff, callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace @@ -130,6 +130,14 @@ OPTIONS i.e., the specified fields apply to all event types if the type string is not given. + In addition to overriding fields, it is also possible to add or remove + fields from the defaults. For example + + -F -cpu,+insn + + removes the cpu field and adds the insn field. Adding/removing fields + cannot be mixed with normal overriding. + The arguments are processed in the order received. A later usage can reset a prior request. e.g.: @@ -203,6 +211,8 @@ OPTIONS is printed. This is the full execution path leading to the sample. This is only supported when the sample was recorded with perf record -b or -j any. + The brstackoff field will print an offset into a specific dso/binary. 
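As a rough illustration of the '-F' add/remove semantics documented above (a sketch under assumed names, not the perf source): each output field is internally one bit in a per-event mask, and a '+' or '-' prefix sets or clears that bit on top of the defaults instead of replacing them.

#include <stdbool.h>

#define FIELD_CPU	(1U << 0)
#define FIELD_INSN	(1U << 1)
/* ... one bit per output field, as builtin-script.c keeps in output[].fields ... */

/* Hypothetical helper: fold one "+field" or "-field" token into the mask. */
static bool apply_field_token(const char *tok, unsigned int bit,
			      unsigned int *fields)
{
	if (*tok == '+') {		/* add to the current defaults */
		*fields |= bit;
		return true;
	}
	if (*tok == '-') {		/* remove from the current defaults */
		*fields &= ~bit;
		return true;
	}
	return false;			/* plain token: caller overrides the defaults */
}

With the defaults in place, 'perf script -F -cpu,+insn' clears the cpu bit and sets the insn bit, and mixing plain field names with '+'/'-' tokens is rejected.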
+ -k:: --vmlinux=<file>:: vmlinux pathname diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index bd0e4417f2be..698076313606 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -239,6 +239,20 @@ taskset. --no-merge:: Do not merge results from same PMUs. +--smi-cost:: +Measure SMI cost if msr/aperf/ and msr/smi/ events are supported. + +During the measurement, the /sys/device/cpu/freeze_on_smi will be set to +freeze core counters on SMI. +The aperf counter will not be effected by the setting. +The cost of SMI can be measured by (aperf - unhalted core cycles). + +In practice, the percentages of SMI cycles is very useful for performance +oriented analysis. --metric_only will be applied by default. +The output is SMI cycles%, equals to (aperf - unhalted core cycles) / aperf + +Users who wants to get the actual value can apply --no-metric-only. + EXAMPLES -------- diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 1f4fbc9a3292..bdf0e87f9b29 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -61,7 +61,7 @@ endif # Disable it on all other architectures in case libdw unwind # support is detected in system. Add supported architectures # to the check. -ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm)) +ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm powerpc)) NO_LIBDW_DWARF_UNWIND := 1 endif diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 29361d9b635a..7ce3d1a25133 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -17,6 +17,7 @@ #include <api/fs/fs.h> #include <linux/bitops.h> +#include <linux/compiler.h> #include <linux/coresight-pmu.h> #include <linux/kernel.h> #include <linux/log2.h> @@ -202,19 +203,18 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, pr_debug2("%s snapshot size: %zu\n", CORESIGHT_ETM_PMU_NAME, opts->auxtrace_snapshot_size); - if (cs_etm_evsel) { - /* - * To obtain the auxtrace buffer file descriptor, the auxtrace - * event must come first. - */ - perf_evlist__to_front(evlist, cs_etm_evsel); - /* - * In the case of per-cpu mmaps, we need the CPU on the - * AUX event. - */ - if (!cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(cs_etm_evsel, CPU); - } + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace + * event must come first. + */ + perf_evlist__to_front(evlist, cs_etm_evsel); + + /* + * In the case of per-cpu mmaps, we need the CPU on the + * AUX event. + */ + if (!cpu_map__empty(cpus)) + perf_evsel__set_sample_bit(cs_etm_evsel, CPU); /* Add dummy event to keep tracking */ if (opts->full_auxtrace) { @@ -583,8 +583,7 @@ static FILE *cs_device__open_file(const char *name) } -static __attribute__((format(printf, 2, 3))) -int cs_device__print_file(const char *name, const char *fmt, ...) +static int __printf(2, 3) cs_device__print_file(const char *name, const char *fmt, ...) 
{ va_list args; FILE *file; diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index 90ad64b231cd..2e6595310420 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -5,4 +5,6 @@ libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_DWARF) += skip-callchain-idx.o + libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o +libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c new file mode 100644 index 000000000000..3a24b3c43273 --- /dev/null +++ b/tools/perf/arch/powerpc/util/unwind-libdw.c @@ -0,0 +1,73 @@ +#include <elfutils/libdwfl.h> +#include "../../util/unwind-libdw.h" +#include "../../util/perf_regs.h" +#include "../../util/event.h" + +/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils. */ +static const int special_regs[3][2] = { + { 65, PERF_REG_POWERPC_LINK }, + { 101, PERF_REG_POWERPC_XER }, + { 109, PERF_REG_POWERPC_CTR }, +}; + +bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) +{ + struct unwind_info *ui = arg; + struct regs_dump *user_regs = &ui->sample->user_regs; + Dwarf_Word dwarf_regs[32], dwarf_nip; + size_t i; + +#define REG(r) ({ \ + Dwarf_Word val = 0; \ + perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r); \ + val; \ +}) + + dwarf_regs[0] = REG(R0); + dwarf_regs[1] = REG(R1); + dwarf_regs[2] = REG(R2); + dwarf_regs[3] = REG(R3); + dwarf_regs[4] = REG(R4); + dwarf_regs[5] = REG(R5); + dwarf_regs[6] = REG(R6); + dwarf_regs[7] = REG(R7); + dwarf_regs[8] = REG(R8); + dwarf_regs[9] = REG(R9); + dwarf_regs[10] = REG(R10); + dwarf_regs[11] = REG(R11); + dwarf_regs[12] = REG(R12); + dwarf_regs[13] = REG(R13); + dwarf_regs[14] = REG(R14); + dwarf_regs[15] = REG(R15); + dwarf_regs[16] = REG(R16); + dwarf_regs[17] = REG(R17); + dwarf_regs[18] = REG(R18); + dwarf_regs[19] = REG(R19); + dwarf_regs[20] = REG(R20); + dwarf_regs[21] = REG(R21); + dwarf_regs[22] = REG(R22); + dwarf_regs[23] = REG(R23); + dwarf_regs[24] = REG(R24); + dwarf_regs[25] = REG(R25); + dwarf_regs[26] = REG(R26); + dwarf_regs[27] = REG(R27); + dwarf_regs[28] = REG(R28); + dwarf_regs[29] = REG(R29); + dwarf_regs[30] = REG(R30); + dwarf_regs[31] = REG(R31); + if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs)) + return false; + + dwarf_nip = REG(NIP); + dwfl_thread_state_register_pc(thread, dwarf_nip); + for (i = 0; i < ARRAY_SIZE(special_regs); i++) { + Dwarf_Word val = 0; + perf_reg_value(&val, user_regs, special_regs[i][1]); + if (!dwfl_thread_state_registers(thread, + special_regs[i][0], 1, + &val)) + return false; + } + + return true; +} diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index af2bce7a2cd6..781df40b2966 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -35,10 +35,6 @@ #define KiB_MASK(x) (KiB(x) - 1) #define MiB_MASK(x) (MiB(x) - 1) -#define INTEL_BTS_DFLT_SAMPLE_SIZE KiB(4) - -#define INTEL_BTS_MAX_SAMPLE_SIZE KiB(60) - struct intel_bts_snapshot_ref { void *ref_buf; size_t ref_offset; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index f630de0206a1..9535be57033f 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -40,10 +40,6 @@ #define KiB_MASK(x) (KiB(x) - 1) #define MiB_MASK(x) (MiB(x) - 1) -#define INTEL_PT_DEFAULT_SAMPLE_SIZE KiB(4) - -#define INTEL_PT_MAX_SAMPLE_SIZE KiB(60) - 
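/*
 * Illustrative usage note (a hypothetical invocation, not taken from this
 * patch): with the 'pt', 'branch' and 'ptw' config terms documented in
 * intel-pt.txt above, PTWRITE-only tracing can be requested roughly like
 *
 *	perf record -e intel_pt/branch=0,ptw/u -- ./workload
 *
 * while the probing added to intel_pt_default_config() below appends
 * "pt,branch" automatically whenever the PMU exposes the corresponding
 * format files, so a plain intel_pt// event keeps its previous behaviour.
 */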
#define INTEL_PT_PSB_PERIOD_NEAR 256 struct intel_pt_snapshot_ref { @@ -196,6 +192,7 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu) int psb_cyc, psb_periods, psb_period; int pos = 0; u64 config; + char c; pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc"); @@ -229,6 +226,10 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu) } } + if (perf_pmu__scan_file(intel_pt_pmu, "format/pt", "%c", &c) == 1 && + perf_pmu__scan_file(intel_pt_pmu, "format/branch", "%c", &c) == 1) + pos += scnprintf(buf + pos, sizeof(buf) - pos, ",pt,branch"); + pr_debug2("%s default config: %s\n", intel_pt_pmu->name, buf); intel_pt_parse_terms(&intel_pt_pmu->format, buf, &config); diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 27de0c8c5c19..469d65b21122 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -700,7 +700,7 @@ static inline uint32_t lfsr_32(uint32_t lfsr) * kernel (KSM, zero page, etc.) cannot optimize away RAM * accesses: */ -static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) +static inline u64 access_data(u64 *data, u64 val) { if (g->p.data_reads) val += *data; diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index 80668fa7556e..ece45582a48d 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -156,7 +156,7 @@ static int parse_config_arg(char *arg, char **var, char **value) int cmd_config(int argc, const char **argv) { - int i, ret = 0; + int i, ret = -1; struct perf_config_set *set; char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); const char *config_filename; @@ -186,10 +186,8 @@ int cmd_config(int argc, const char **argv) * because of reinitializing with options config file location. */ set = perf_config_set__new(); - if (!set) { - ret = -1; + if (!set) goto out_err; - } switch (actions) { case ACTION_LIST: @@ -197,41 +195,54 @@ int cmd_config(int argc, const char **argv) pr_err("Error: takes no arguments\n"); parse_options_usage(config_usage, config_options, "l", 1); } else { - ret = show_config(set); - if (ret < 0) + if (show_config(set) < 0) { pr_err("Nothing configured, " "please check your %s \n", config_filename); + goto out_err; + } } break; default: - if (argc) { - for (i = 0; argv[i]; i++) { - char *var, *value; - char *arg = strdup(argv[i]); - - if (!arg) { - pr_err("%s: strdup failed\n", __func__); - ret = -1; - break; - } + if (!argc) { + usage_with_options(config_usage, config_options); + break; + } - if (parse_config_arg(arg, &var, &value) < 0) { - free(arg); - ret = -1; - break; - } + for (i = 0; argv[i]; i++) { + char *var, *value; + char *arg = strdup(argv[i]); + + if (!arg) { + pr_err("%s: strdup failed\n", __func__); + goto out_err; + } - if (value == NULL) - ret = show_spec_config(set, var); - else - ret = set_config(set, config_filename, var, value); + if (parse_config_arg(arg, &var, &value) < 0) { free(arg); + goto out_err; } - } else - usage_with_options(config_usage, config_options); + + if (value == NULL) { + if (show_spec_config(set, var) < 0) { + pr_err("%s is not configured: %s\n", + var, config_filename); + free(arg); + goto out_err; + } + } else { + if (set_config(set, config_filename, var, value) < 0) { + pr_err("Failed to set '%s=%s' on %s\n", + var, value, config_filename); + free(arg); + goto out_err; + } + } + free(arg); + } } - perf_config_set__delete(set); + ret = 0; out_err: + perf_config_set__delete(set); return ret; } diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 
9e0b35cd0eea..dd26c62c9893 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -28,9 +28,19 @@ #define DEFAULT_TRACER "function_graph" struct perf_ftrace { - struct perf_evlist *evlist; - struct target target; - const char *tracer; + struct perf_evlist *evlist; + struct target target; + const char *tracer; + struct list_head filters; + struct list_head notrace; + struct list_head graph_funcs; + struct list_head nograph_funcs; + int graph_depth; +}; + +struct filter_entry { + struct list_head list; + char name[]; }; static bool done; @@ -61,6 +71,7 @@ static int __write_tracing_file(const char *name, const char *val, bool append) int fd, ret = -1; ssize_t size = strlen(val); int flags = O_WRONLY; + char errbuf[512]; file = get_tracing_file(name); if (!file) { @@ -75,14 +86,16 @@ static int __write_tracing_file(const char *name, const char *val, bool append) fd = open(file, flags); if (fd < 0) { - pr_debug("cannot open tracing file: %s\n", name); + pr_debug("cannot open tracing file: %s: %s\n", + name, str_error_r(errno, errbuf, sizeof(errbuf))); goto out; } if (write(fd, val, size) == size) ret = 0; else - pr_debug("write '%s' to tracing/%s failed\n", val, name); + pr_debug("write '%s' to tracing/%s failed: %s\n", + val, name, str_error_r(errno, errbuf, sizeof(errbuf))); close(fd); out: @@ -101,6 +114,7 @@ static int append_tracing_file(const char *name, const char *val) } static int reset_tracing_cpu(void); +static void reset_tracing_filters(void); static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) { @@ -116,6 +130,10 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) if (reset_tracing_cpu() < 0) return -1; + if (write_tracing_file("max_graph_depth", "0") < 0) + return -1; + + reset_tracing_filters(); return 0; } @@ -181,6 +199,68 @@ static int reset_tracing_cpu(void) return ret; } +static int __set_tracing_filter(const char *filter_file, struct list_head *funcs) +{ + struct filter_entry *pos; + + list_for_each_entry(pos, funcs, list) { + if (append_tracing_file(filter_file, pos->name) < 0) + return -1; + } + + return 0; +} + +static int set_tracing_filters(struct perf_ftrace *ftrace) +{ + int ret; + + ret = __set_tracing_filter("set_ftrace_filter", &ftrace->filters); + if (ret < 0) + return ret; + + ret = __set_tracing_filter("set_ftrace_notrace", &ftrace->notrace); + if (ret < 0) + return ret; + + ret = __set_tracing_filter("set_graph_function", &ftrace->graph_funcs); + if (ret < 0) + return ret; + + /* old kernels do not have this filter */ + __set_tracing_filter("set_graph_notrace", &ftrace->nograph_funcs); + + return ret; +} + +static void reset_tracing_filters(void) +{ + write_tracing_file("set_ftrace_filter", " "); + write_tracing_file("set_ftrace_notrace", " "); + write_tracing_file("set_graph_function", " "); + write_tracing_file("set_graph_notrace", " "); +} + +static int set_tracing_depth(struct perf_ftrace *ftrace) +{ + char buf[16]; + + if (ftrace->graph_depth == 0) + return 0; + + if (ftrace->graph_depth < 0) { + pr_err("invalid graph depth: %d\n", ftrace->graph_depth); + return -1; + } + + snprintf(buf, sizeof(buf), "%d", ftrace->graph_depth); + + if (write_tracing_file("max_graph_depth", buf) < 0) + return -1; + + return 0; +} + static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) { char *trace_file; @@ -223,11 +303,23 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) goto out_reset; } + if (set_tracing_filters(ftrace) < 0) { + 
pr_err("failed to set tracing filters\n"); + goto out_reset; + } + + if (set_tracing_depth(ftrace) < 0) { + pr_err("failed to set graph depth\n"); + goto out_reset; + } + if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { pr_err("failed to set current_tracer to %s\n", ftrace->tracer); goto out_reset; } + setup_pager(); + trace_file = get_tracing_file("trace_pipe"); if (!trace_file) { pr_err("failed to open trace_pipe\n"); @@ -251,8 +343,6 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) goto out_close_fd; } - setup_pager(); - perf_evlist__start_workload(ftrace->evlist); while (!done) { @@ -307,6 +397,32 @@ static int perf_ftrace_config(const char *var, const char *value, void *cb) return -1; } +static int parse_filter_func(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + struct list_head *head = opt->value; + struct filter_entry *entry; + + entry = malloc(sizeof(*entry) + strlen(str) + 1); + if (entry == NULL) + return -ENOMEM; + + strcpy(entry->name, str); + list_add_tail(&entry->list, head); + + return 0; +} + +static void delete_filter_func(struct list_head *head) +{ + struct filter_entry *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + free(pos); + } +} + int cmd_ftrace(int argc, const char **argv) { int ret; @@ -330,9 +446,24 @@ int cmd_ftrace(int argc, const char **argv) "system-wide collection from all CPUs"), OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu", "list of cpus to monitor"), + OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func", + "trace given functions only", parse_filter_func), + OPT_CALLBACK('N', "notrace-funcs", &ftrace.notrace, "func", + "do not trace given functions", parse_filter_func), + OPT_CALLBACK('G', "graph-funcs", &ftrace.graph_funcs, "func", + "Set graph filter on given functions", parse_filter_func), + OPT_CALLBACK('g', "nograph-funcs", &ftrace.nograph_funcs, "func", + "Set nograph filter on given functions", parse_filter_func), + OPT_INTEGER('D', "graph-depth", &ftrace.graph_depth, + "Max depth for function graph tracer"), OPT_END() }; + INIT_LIST_HEAD(&ftrace.filters); + INIT_LIST_HEAD(&ftrace.notrace); + INIT_LIST_HEAD(&ftrace.graph_funcs); + INIT_LIST_HEAD(&ftrace.nograph_funcs); + ret = perf_config(perf_ftrace_config, &ftrace); if (ret < 0) return -1; @@ -348,12 +479,14 @@ int cmd_ftrace(int argc, const char **argv) target__strerror(&ftrace.target, ret, errbuf, 512); pr_err("%s\n", errbuf); - return -EINVAL; + goto out_delete_filters; } ftrace.evlist = perf_evlist__new(); - if (ftrace.evlist == NULL) - return -ENOMEM; + if (ftrace.evlist == NULL) { + ret = -ENOMEM; + goto out_delete_filters; + } ret = perf_evlist__create_maps(ftrace.evlist, &ftrace.target); if (ret < 0) @@ -364,5 +497,11 @@ int cmd_ftrace(int argc, const char **argv) out_delete_evlist: perf_evlist__delete(ftrace.evlist); +out_delete_filters: + delete_filter_func(&ftrace.filters); + delete_filter_func(&ftrace.notrace); + delete_filter_func(&ftrace.graph_funcs); + delete_filter_func(&ftrace.nograph_funcs); + return ret; } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 4761b0d7fcb5..4bce7d8679cb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -85,6 +85,7 @@ enum perf_output_field { PERF_OUTPUT_INSN = 1U << 21, PERF_OUTPUT_INSNLEN = 1U << 22, PERF_OUTPUT_BRSTACKINSN = 1U << 23, + PERF_OUTPUT_BRSTACKOFF = 1U << 24, }; struct output_option { @@ -115,6 +116,7 @@ struct output_option { {.str = "insn", .field 
= PERF_OUTPUT_INSN}, {.str = "insnlen", .field = PERF_OUTPUT_INSNLEN}, {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN}, + {.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF}, }; /* default set to maintain compatibility with current format */ @@ -298,10 +300,10 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, "selected.\n"); return -EINVAL; } - if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { - pr_err("Display of DSO requested but neither sample IP nor " - "sample address\nis selected. Hence, no addresses to convert " - "to DSO.\n"); + if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR) && + !PRINT_FIELD(BRSTACK) && !PRINT_FIELD(BRSTACKSYM) && !PRINT_FIELD(BRSTACKOFF)) { + pr_err("Display of DSO requested but no address to convert. Select\n" + "sample IP, sample address, brstack, brstacksym, or brstackoff.\n"); return -EINVAL; } if (PRINT_FIELD(SRCLINE) && !PRINT_FIELD(IP)) { @@ -383,7 +385,7 @@ static int perf_session__check_output_opt(struct perf_session *session) */ if (!evsel && output[j].user_set && !output[j].wildcard_set) { pr_err("%s events do not exist. " - "Remove corresponding -f option to proceed.\n", + "Remove corresponding -F option to proceed.\n", event_type(j)); return -1; } @@ -514,18 +516,43 @@ mispred_str(struct branch_entry *br) return br->flags.predicted ? 'P' : 'M'; } -static void print_sample_brstack(struct perf_sample *sample) +static void print_sample_brstack(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr) { struct branch_stack *br = sample->branch_stack; - u64 i; + struct addr_location alf, alt; + u64 i, from, to; if (!(br && br->nr)) return; for (i = 0; i < br->nr; i++) { - printf(" 0x%"PRIx64"/0x%"PRIx64"/%c/%c/%c/%d ", - br->entries[i].from, - br->entries[i].to, + from = br->entries[i].from; + to = br->entries[i].to; + + if (PRINT_FIELD(DSO)) { + memset(&alf, 0, sizeof(alf)); + memset(&alt, 0, sizeof(alt)); + thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf); + thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt); + } + + printf("0x%"PRIx64, from); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alf.map, stdout); + printf(")"); + } + + printf("/0x%"PRIx64, to); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alt.map, stdout); + printf(")"); + } + + printf("/%c/%c/%c/%d ", mispred_str( br->entries + i), br->entries[i].flags.in_tx? 'X' : '-', br->entries[i].flags.abort? 'A' : '-', @@ -534,7 +561,8 @@ static void print_sample_brstack(struct perf_sample *sample) } static void print_sample_brstacksym(struct perf_sample *sample, - struct thread *thread) + struct thread *thread, + struct perf_event_attr *attr) { struct branch_stack *br = sample->branch_stack; struct addr_location alf, alt; @@ -559,8 +587,18 @@ static void print_sample_brstacksym(struct perf_sample *sample, alt.sym = map__find_symbol(alt.map, alt.addr); symbol__fprintf_symname_offs(alf.sym, &alf, stdout); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alf.map, stdout); + printf(")"); + } putchar('/'); symbol__fprintf_symname_offs(alt.sym, &alt, stdout); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alt.map, stdout); + printf(")"); + } printf("/%c/%c/%c/%d ", mispred_str( br->entries + i), br->entries[i].flags.in_tx? 
'X' : '-', @@ -569,6 +607,51 @@ static void print_sample_brstacksym(struct perf_sample *sample, } } +static void print_sample_brstackoff(struct perf_sample *sample, + struct thread *thread, + struct perf_event_attr *attr) +{ + struct branch_stack *br = sample->branch_stack; + struct addr_location alf, alt; + u64 i, from, to; + + if (!(br && br->nr)) + return; + + for (i = 0; i < br->nr; i++) { + + memset(&alf, 0, sizeof(alf)); + memset(&alt, 0, sizeof(alt)); + from = br->entries[i].from; + to = br->entries[i].to; + + thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf); + if (alf.map && !alf.map->dso->adjust_symbols) + from = map__map_ip(alf.map, from); + + thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt); + if (alt.map && !alt.map->dso->adjust_symbols) + to = map__map_ip(alt.map, to); + + printf("0x%"PRIx64, from); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alf.map, stdout); + printf(")"); + } + printf("/0x%"PRIx64, to); + if (PRINT_FIELD(DSO)) { + printf("("); + map__fprintf_dsoname(alt.map, stdout); + printf(")"); + } + printf("/%c/%c/%c/%d ", + mispred_str(br->entries + i), + br->entries[i].flags.in_tx ? 'X' : '-', + br->entries[i].flags.abort ? 'A' : '-', + br->entries[i].flags.cycles); + } +} #define MAXBB 16384UL static int grab_bb(u8 *buffer, u64 start, u64 end, @@ -1187,9 +1270,11 @@ static void process_event(struct perf_script *script, print_sample_iregs(sample, attr); if (PRINT_FIELD(BRSTACK)) - print_sample_brstack(sample); + print_sample_brstack(sample, thread, attr); else if (PRINT_FIELD(BRSTACKSYM)) - print_sample_brstacksym(sample, thread); + print_sample_brstacksym(sample, thread, attr); + else if (PRINT_FIELD(BRSTACKOFF)) + print_sample_brstackoff(sample, thread, attr); if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) print_sample_bpf_output(sample); @@ -1727,6 +1812,7 @@ static int parse_output_fields(const struct option *opt __maybe_unused, int rc = 0; char *str = strdup(arg); int type = -1; + enum { DEFAULT, SET, ADD, REMOVE } change = DEFAULT; if (!str) return -ENOMEM; @@ -1772,6 +1858,10 @@ static int parse_output_fields(const struct option *opt __maybe_unused, goto out; } + /* Don't override defaults for +- */ + if (strchr(str, '+') || strchr(str, '-')) + goto parse; + if (output_set_by_user()) pr_warning("Overriding previous field request for all events.\n"); @@ -1782,13 +1872,30 @@ static int parse_output_fields(const struct option *opt __maybe_unused, } } +parse: for (tok = strtok_r(tok, ",", &strtok_saveptr); tok; tok = strtok_r(NULL, ",", &strtok_saveptr)) { + if (*tok == '+') { + if (change == SET) + goto out_badmix; + change = ADD; + tok++; + } else if (*tok == '-') { + if (change == SET) + goto out_badmix; + change = REMOVE; + tok++; + } else { + if (change != SET && change != DEFAULT) + goto out_badmix; + change = SET; + } + for (i = 0; i < imax; ++i) { if (strcmp(tok, all_output_options[i].str) == 0) break; } if (i == imax && strcmp(tok, "flags") == 0) { - print_flags = true; + print_flags = change == REMOVE ? false : true; continue; } if (i == imax) { @@ -1805,8 +1912,12 @@ static int parse_output_fields(const struct option *opt __maybe_unused, if (output[j].invalid_fields & all_output_options[i].field) { pr_warning("\'%s\' not valid for %s events. 
Ignoring.\n", all_output_options[i].str, event_type(j)); - } else - output[j].fields |= all_output_options[i].field; + } else { + if (change == REMOVE) + output[j].fields &= ~all_output_options[i].field; + else + output[j].fields |= all_output_options[i].field; + } } } else { if (output[type].invalid_fields & all_output_options[i].field) { @@ -1826,7 +1937,11 @@ static int parse_output_fields(const struct option *opt __maybe_unused, "Events will not be displayed.\n", event_type(type)); } } + goto out; +out_badmix: + fprintf(stderr, "Cannot mix +-field with overridden fields\n"); + rc = -EINVAL; out: free(str); return rc; @@ -2444,6 +2559,7 @@ int cmd_script(int argc, const char **argv) symbol__config_symfs), OPT_CALLBACK('F', "fields", NULL, "str", "comma separated output fields prepend with 'type:'. " + "+field to add and -field to remove." "Valid types: hw,sw,trace,raw. " "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," "addr,symoff,period,iregs,brstack,brstacksym,flags," diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ad9324d1daf9..324363054c3f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -86,6 +86,7 @@ #define DEFAULT_SEPARATOR " " #define CNTR_NOT_SUPPORTED "<not supported>" #define CNTR_NOT_COUNTED "<not counted>" +#define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi" static void print_counters(struct timespec *ts, int argc, const char **argv); @@ -122,6 +123,14 @@ static const char * topdown_attrs[] = { NULL, }; +static const char *smi_cost_attrs = { + "{" + "msr/aperf/," + "msr/smi/," + "cycles" + "}" +}; + static struct perf_evlist *evsel_list; static struct target target = { @@ -137,6 +146,8 @@ static bool null_run = false; static int detailed_run = 0; static bool transaction_run; static bool topdown_run = false; +static bool smi_cost = false; +static bool smi_reset = false; static bool big_num = true; static int big_num_opt = -1; static const char *csv_sep = NULL; @@ -1782,6 +1793,8 @@ static const struct option stat_options[] = { "Only print computed metrics. 
No raw values", enable_metric_only), OPT_BOOLEAN(0, "topdown", &topdown_run, "measure topdown level 1 statistics"), + OPT_BOOLEAN(0, "smi-cost", &smi_cost, + "measure SMI cost"), OPT_END() }; @@ -2160,6 +2173,39 @@ static int add_default_attributes(void) return 0; } + if (smi_cost) { + int smi; + + if (sysfs__read_int(FREEZE_ON_SMI_PATH, &smi) < 0) { + fprintf(stderr, "freeze_on_smi is not supported.\n"); + return -1; + } + + if (!smi) { + if (sysfs__write_int(FREEZE_ON_SMI_PATH, 1) < 0) { + fprintf(stderr, "Failed to set freeze_on_smi.\n"); + return -1; + } + smi_reset = true; + } + + if (pmu_have_event("msr", "aperf") && + pmu_have_event("msr", "smi")) { + if (!force_metric_only) + metric_only = true; + err = parse_events(evsel_list, smi_cost_attrs, NULL); + } else { + fprintf(stderr, "To measure SMI cost, it needs " + "msr/aperf/, msr/smi/ and cpu/cycles/ support\n"); + return -1; + } + if (err) { + fprintf(stderr, "Cannot set up SMI cost events\n"); + return -1; + } + return 0; + } + if (topdown_run) { char *str = NULL; bool warn = false; @@ -2742,6 +2788,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (smi_cost && smi_reset) + sysfs__write_int(FREEZE_ON_SMI_PATH, 0); + perf_evlist__delete(evsel_list); return status; } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 10b6362ca0bf..2bcfa46913c8 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -134,7 +134,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) return err; } - err = symbol__disassemble(sym, map, NULL, 0); + err = symbol__disassemble(sym, map, NULL, 0, NULL); if (err == 0) { out_assign: top->sym_filter_entry = he; diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h index bedf5d0ba9ff..c53a41f48b63 100644 --- a/tools/perf/jvmti/jvmti_agent.h +++ b/tools/perf/jvmti/jvmti_agent.h @@ -5,8 +5,6 @@ #include <stdint.h> #include <jvmti.h> -#define __unused __attribute__((unused)) - #if defined(__cplusplus) extern "C" { #endif diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c index 5612641c69b4..6d710904c837 100644 --- a/tools/perf/jvmti/libjvmti.c +++ b/tools/perf/jvmti/libjvmti.c @@ -1,3 +1,4 @@ +#include <linux/compiler.h> #include <sys/types.h> #include <stdio.h> #include <string.h> @@ -238,7 +239,7 @@ code_generated_cb(jvmtiEnv *jvmti, } JNIEXPORT jint JNICALL -Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused) +Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __maybe_unused) { jvmtiEventCallbacks cb; jvmtiCapabilities caps1; @@ -313,7 +314,7 @@ Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused) } JNIEXPORT void JNICALL -Agent_OnUnload(JavaVM *jvm __unused) +Agent_OnUnload(JavaVM *jvm __maybe_unused) { int ret; diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index baa073f38334..bd0aabb2bd0f 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -48,10 +48,6 @@ #include "json.h" #include "jevents.h" -#ifndef __maybe_unused -#define __maybe_unused __attribute__((unused)) -#endif - int verbose; char *prog; diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index 8ba2c4618fe9..39bbb97cd30a 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -62,8 +62,7 @@ static void __test_function(volatile long *ptr) } #endif -__attribute__ ((noinline)) -static int test_function(void) +static noinline int 
test_function(void) { __test_function(&the_var); the_var++; diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c index 89f92fa67cc4..3b1ac6f31b15 100644 --- a/tools/perf/tests/bp_signal_overflow.c +++ b/tools/perf/tests/bp_signal_overflow.c @@ -28,8 +28,7 @@ static int overflows; -__attribute__ ((noinline)) -static int test_function(void) +static noinline int test_function(void) { return time(NULL); } diff --git a/tools/perf/tests/bpf-script-test-prologue.c b/tools/perf/tests/bpf-script-test-prologue.c index 7230e62c70fc..b4ebc75e25ae 100644 --- a/tools/perf/tests/bpf-script-test-prologue.c +++ b/tools/perf/tests/bpf-script-test-prologue.c @@ -10,6 +10,15 @@ #include <uapi/linux/fs.h> +/* + * If CONFIG_PROFILE_ALL_BRANCHES is selected, + * 'if' is redefined after include kernel header. + * Recover 'if' for BPF object code. + */ +#ifdef if +# undef if +#endif + #define FMODE_READ 0x1 #define FMODE_WRITE 0x2 diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index dfe5c89e2049..3e56d08f7995 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -76,8 +76,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) return strcmp((const char *) symbol, funcs[idx]); } -__attribute__ ((noinline)) -static int unwind_thread(struct thread *thread) +static noinline int unwind_thread(struct thread *thread) { struct perf_sample sample; unsigned long cnt = 0; @@ -108,8 +107,7 @@ static int unwind_thread(struct thread *thread) static int global_unwind_retval = -INT_MAX; -__attribute__ ((noinline)) -static int compare(void *p1, void *p2) +static noinline int compare(void *p1, void *p2) { /* Any possible value should be 'thread' */ struct thread *thread = *(struct thread **)p1; @@ -128,8 +126,7 @@ static int compare(void *p1, void *p2) return p1 - p2; } -__attribute__ ((noinline)) -static int krava_3(struct thread *thread) +static noinline int krava_3(struct thread *thread) { struct thread *array[2] = {thread, thread}; void *fp = &bsearch; @@ -147,14 +144,12 @@ static int krava_3(struct thread *thread) return global_unwind_retval; } -__attribute__ ((noinline)) -static int krava_2(struct thread *thread) +static noinline int krava_2(struct thread *thread) { return krava_3(thread); } -__attribute__ ((noinline)) -static int krava_1(struct thread *thread) +static noinline int krava_1(struct thread *thread) { return krava_2(thread); } diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index d990ad08a3c6..27f41f28dcb4 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -46,12 +46,15 @@ static struct annotate_browser_opt { .jump_arrows = true, }; +struct arch; + struct annotate_browser { struct ui_browser b; struct rb_root entries; struct rb_node *curr_hot; struct disasm_line *selection; struct disasm_line **offsets; + struct arch *arch; int nr_events; u64 start; int nr_asm_entries; @@ -125,43 +128,57 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int int i, pcnt_width = annotate_browser__pcnt_width(ab); double percent_max = 0.0; char bf[256]; + bool show_title = false; for (i = 0; i < ab->nr_events; i++) { if (bdl->samples[i].percent > percent_max) percent_max = bdl->samples[i].percent; } + if ((row == 0) && (dl->offset == -1 || percent_max == 0.0)) { + if (ab->have_cycles) { + if (dl->ipc == 0.0 && dl->cycles == 0) + show_title = true; + } else + show_title = true; + } + if (dl->offset != -1 && percent_max 
!= 0.0) { - if (percent_max != 0.0) { - for (i = 0; i < ab->nr_events; i++) { - ui_browser__set_percent_color(browser, - bdl->samples[i].percent, - current_entry); - if (annotate_browser__opts.show_total_period) { - ui_browser__printf(browser, "%6" PRIu64 " ", - bdl->samples[i].nr); - } else { - ui_browser__printf(browser, "%6.2f ", - bdl->samples[i].percent); - } + for (i = 0; i < ab->nr_events; i++) { + ui_browser__set_percent_color(browser, + bdl->samples[i].percent, + current_entry); + if (annotate_browser__opts.show_total_period) { + ui_browser__printf(browser, "%6" PRIu64 " ", + bdl->samples[i].nr); + } else { + ui_browser__printf(browser, "%6.2f ", + bdl->samples[i].percent); } - } else { - ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); } } else { ui_browser__set_percent_color(browser, 0, current_entry); - ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); + + if (!show_title) + ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); + else + ui_browser__printf(browser, "%*s", 7, "Percent"); } if (ab->have_cycles) { if (dl->ipc) ui_browser__printf(browser, "%*.2f ", IPC_WIDTH - 1, dl->ipc); - else + else if (!show_title) ui_browser__write_nstring(browser, " ", IPC_WIDTH); + else + ui_browser__printf(browser, "%*s ", IPC_WIDTH - 1, "IPC"); + if (dl->cycles) ui_browser__printf(browser, "%*" PRIu64 " ", CYCLES_WIDTH - 1, dl->cycles); - else + else if (!show_title) ui_browser__write_nstring(browser, " ", CYCLES_WIDTH); + else + ui_browser__printf(browser, "%*s ", CYCLES_WIDTH - 1, "Cycle"); } SLsmg_write_char(' '); @@ -1056,7 +1073,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, (nr_pcnt - 1); } - err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), sizeof_bdl); + err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), + sizeof_bdl, &browser.arch); if (err) { char msg[BUFSIZ]; symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg)); diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index e99ba86158d2..d903fd493416 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -168,7 +168,8 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map, if (map->dso->annotate_warned) return -1; - err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), 0); + err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), + 0, NULL); if (err) { char msg[BUFSIZ]; symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg)); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ddbd56df9187..be1caabb9290 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1379,7 +1379,9 @@ static const char *annotate__norm_arch(const char *arch_name) return normalize_arch((char *)arch_name); } -int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_name, size_t privsize) +int symbol__disassemble(struct symbol *sym, struct map *map, + const char *arch_name, size_t privsize, + struct arch **parch) { struct dso *dso = map->dso; char command[PATH_MAX * 2]; @@ -1405,6 +1407,9 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na if (arch == NULL) return -ENOTSUP; + if (parch) + *parch = arch; + if (arch->init) { err = arch->init(arch); if (err) { @@ -1901,7 +1906,8 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, struct rb_root source_line = RB_ROOT; u64 len; - if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), 0) < 0) + if (symbol__disassemble(sym, map, 
perf_evsel__env_arch(evsel), + 0, NULL) < 0) return -1; len = symbol__size(sym); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 948aa8e6fd39..21055034aedd 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -158,7 +158,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 addr); int symbol__alloc_hist(struct symbol *sym); void symbol__annotate_zero_histograms(struct symbol *sym); -int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_name, size_t privsize); +int symbol__disassemble(struct symbol *sym, struct map *map, + const char *arch_name, size_t privsize, + struct arch **parch); enum symbol_disassemble_errno { SYMBOL_ANNOTATE_ERRNO__SUCCESS = 0, diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 0328f297a748..0175765c05b9 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -5,6 +5,7 @@ #include <subcmd/pager.h> #include "../ui/ui.h" +#include <linux/compiler.h> #include <linux/string.h> #define CMD_EXEC_PATH "--exec-path" @@ -24,6 +25,6 @@ static inline int is_absolute_path(const char *path) return path[0] == '/'; } -char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2))); +char *mkpath(const char *fmt, ...) __printf(1, 2); #endif /* __PERF_CACHE_H */ diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 8a23ea1a71c7..c818bdb1c1ab 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -4,6 +4,7 @@ #include <stdbool.h> #include <string.h> +#include <linux/compiler.h> #include "event.h" #include "../ui/helpline.h" #include "../ui/progress.h" @@ -40,16 +41,16 @@ extern int debug_data_convert; #define STRERR_BUFSIZE 128 /* For the buffer size of str_error_r */ -int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +int dump_printf(const char *fmt, ...) __printf(1, 2); void trace_event(union perf_event *event); -int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2))); -int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2))); +int ui__error(const char *format, ...) __printf(1, 2); +int ui__warning(const char *format, ...) __printf(1, 2); void pr_stat(const char *fmt, ...); -int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4))); -int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5))); +int eprintf(int level, int var, const char *fmt, ...) __printf(3, 4); +int eprintf_time(int level, int var, u64 t, const char *fmt, ...) 
__printf(4, 5); int veprintf(int level, int var, const char *fmt, va_list args); int perf_debug_option(const char *str); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 94cea4398a13..8d601fbdd8d6 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -1,6 +1,7 @@ #ifndef __PERF_EVLIST_H #define __PERF_EVLIST_H 1 +#include <linux/compiler.h> #include <linux/kernel.h> #include <linux/refcount.h> #include <linux/list.h> @@ -34,7 +35,7 @@ struct perf_mmap { refcount_t refcnt; u64 prev; struct auxtrace_mmap auxtrace_mmap; - char event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8))); + char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; static inline size_t diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cda44b0e821c..6f4882f8d61f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -11,13 +11,17 @@ #include <errno.h> #include <inttypes.h> #include <linux/bitops.h> +#include <api/fs/fs.h> #include <api/fs/tracing_path.h> #include <traceevent/event-parse.h> #include <linux/hw_breakpoint.h> #include <linux/perf_event.h> +#include <linux/compiler.h> #include <linux/err.h> #include <sys/ioctl.h> #include <sys/resource.h> +#include <sys/types.h> +#include <dirent.h> #include "asm/bug.h" #include "callchain.h" #include "cgroup.h" @@ -1441,7 +1445,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, } static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, - void *priv __attribute__((unused))) + void *priv __maybe_unused) { return fprintf(fp, " %-32s %s\n", name, val); } @@ -2471,6 +2475,42 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err, return false; } +static bool find_process(const char *name) +{ + size_t len = strlen(name); + DIR *dir; + struct dirent *d; + int ret = -1; + + dir = opendir(procfs__mountpoint()); + if (!dir) + return false; + + /* Walk through the directory. */ + while (ret && (d = readdir(dir)) != NULL) { + char path[PATH_MAX]; + char *data; + size_t size; + + if ((d->d_type != DT_DIR) || + !strcmp(".", d->d_name) || + !strcmp("..", d->d_name)) + continue; + + scnprintf(path, sizeof(path), "%s/%s/comm", + procfs__mountpoint(), d->d_name); + + if (filename__read_str(path, &data, &size)) + continue; + + ret = strncmp(name, data, len); + free(data); + } + + closedir(dir); + return ret ? false : true; +} + int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, int err, char *msg, size_t size) { diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c index 5980f7d256b1..40789d8603d0 100644 --- a/tools/perf/util/genelf_debug.c +++ b/tools/perf/util/genelf_debug.c @@ -11,6 +11,7 @@ * @remark Copyright 2007 OProfile authors * @author Philippe Elie */ +#include <linux/compiler.h> #include <sys/types.h> #include <stdio.h> #include <getopt.h> @@ -125,7 +126,7 @@ struct debug_line_header { * and filesize, last entry is followed by en empty string. 
*/ /* follow the first program statement */ -} __attribute__((packed)); +} __packed; /* DWARF 2 spec talk only about one possible compilation unit header while * binutils can handle two flavours of dwarf 2, 32 and 64 bits, this is not @@ -138,7 +139,7 @@ struct compilation_unit_header { uhalf version; uword debug_abbrev_offset; ubyte pointer_size; -} __attribute__((packed)); +} __packed; #define DW_LNS_num_opcode (DW_LNS_set_isa + 1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index b5baff3007bb..76ed7d03e500 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -8,6 +8,7 @@ #include <unistd.h> #include <stdio.h> #include <stdlib.h> +#include <linux/compiler.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/bitops.h> @@ -1274,7 +1275,7 @@ error: } static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val, - void *priv __attribute__((unused))) + void *priv __maybe_unused) { return fprintf(fp, ", %s = %s", name, val); } diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index b2834ac7b1f5..218ee2bac9a5 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -866,8 +866,6 @@ static void intel_bts_print_info(u64 *arr, int start, int finish) fprintf(stdout, intel_bts_info_fmts[i], arr[i]); } -u64 intel_bts_auxtrace_info_priv[INTEL_BTS_AUXTRACE_PRIV_SIZE]; - int intel_bts_process_auxtrace_info(union perf_event *event, struct perf_session *session) { diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 7cf7f7aca4d2..5dea06289db5 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -64,6 +64,25 @@ enum intel_pt_pkt_state { INTEL_PT_STATE_FUP_NO_TIP, }; +static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state) +{ + switch (pkt_state) { + case INTEL_PT_STATE_NO_PSB: + case INTEL_PT_STATE_NO_IP: + case INTEL_PT_STATE_ERR_RESYNC: + case INTEL_PT_STATE_IN_SYNC: + case INTEL_PT_STATE_TNT: + return true; + case INTEL_PT_STATE_TIP: + case INTEL_PT_STATE_TIP_PGD: + case INTEL_PT_STATE_FUP: + case INTEL_PT_STATE_FUP_NO_TIP: + return false; + default: + return true; + }; +} + #ifdef INTEL_PT_STRICT #define INTEL_PT_STATE_ERR1 INTEL_PT_STATE_NO_PSB #define INTEL_PT_STATE_ERR2 INTEL_PT_STATE_NO_PSB @@ -87,11 +106,13 @@ struct intel_pt_decoder { const unsigned char *buf; size_t len; bool return_compression; + bool branch_enable; bool mtc_insn; bool pge; bool have_tma; bool have_cyc; bool fixup_last_mtc; + bool have_last_ip; uint64_t pos; uint64_t last_ip; uint64_t ip; @@ -99,6 +120,7 @@ struct intel_pt_decoder { uint64_t timestamp; uint64_t tsc_timestamp; uint64_t ref_timestamp; + uint64_t sample_timestamp; uint64_t ret_addr; uint64_t ctc_timestamp; uint64_t ctc_delta; @@ -119,6 +141,7 @@ struct intel_pt_decoder { int pkt_len; int last_packet_type; unsigned int cbr; + unsigned int cbr_seen; unsigned int max_non_turbo_ratio; double max_non_turbo_ratio_fp; double cbr_cyc_to_tsc; @@ -136,9 +159,18 @@ struct intel_pt_decoder { bool continuous_period; bool overflow; bool set_fup_tx_flags; + bool set_fup_ptw; + bool set_fup_mwait; + bool set_fup_pwre; + bool set_fup_exstop; unsigned int fup_tx_flags; unsigned int tx_flags; + uint64_t fup_ptw_payload; + uint64_t fup_mwait_payload; + uint64_t fup_pwre_payload; + uint64_t cbr_payload; uint64_t timestamp_insn_cnt; + uint64_t sample_insn_cnt; uint64_t stuck_ip; int no_progress; int stuck_ip_prd; 
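/*
 * Minimal consumer sketch (hypothetical helper, not the perf tool code):
 * with the new sample types a caller of intel_pt_decode() can pick the
 * PTWRITE, core-to-bus-ratio and power-event samples out of the decoded
 * stream.  Resync and error handling are elided; the types and bit values
 * come from intel-pt-decoder.h.
 */
#include <stdio.h>
#include <inttypes.h>
#include "intel-pt-decoder.h"

static void dump_new_samples(struct intel_pt_decoder *decoder)
{
	const struct intel_pt_state *state;

	for (;;) {
		state = intel_pt_decode(decoder);
		if (state->err)
			break;	/* a real caller would resync and continue */
		if (state->type & INTEL_PT_PTW)
			printf("PTWRITE payload 0x%" PRIx64 " IP 0x%" PRIx64 "\n",
			       state->ptw_payload, state->from_ip);
		if (state->type & INTEL_PT_CBR_CHG)
			printf("CBR changed, payload 0x%" PRIx64 "\n",
			       state->cbr_payload);
		if (state->type & INTEL_PT_MWAIT_OP)
			printf("MWAIT, payload 0x%" PRIx64 "\n",
			       state->mwait_payload);
		if (state->type & INTEL_PT_PWR_ENTRY)
			printf("PWRE, payload 0x%" PRIx64 "\n",
			       state->pwre_payload);
		if (state->type & INTEL_PT_PWR_EXIT)
			printf("PWRX, payload 0x%" PRIx64 "\n",
			       state->pwrx_payload);
	}
}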
@@ -192,6 +224,7 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) decoder->pgd_ip = params->pgd_ip; decoder->data = params->data; decoder->return_compression = params->return_compression; + decoder->branch_enable = params->branch_enable; decoder->period = params->period; decoder->period_type = params->period_type; @@ -398,6 +431,7 @@ static uint64_t intel_pt_calc_ip(const struct intel_pt_pkt *packet, static inline void intel_pt_set_last_ip(struct intel_pt_decoder *decoder) { decoder->last_ip = intel_pt_calc_ip(&decoder->packet, decoder->last_ip); + decoder->have_last_ip = true; } static inline void intel_pt_set_ip(struct intel_pt_decoder *decoder) @@ -635,6 +669,8 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info) case INTEL_PT_PAD: case INTEL_PT_VMCS: case INTEL_PT_MNT: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: return 0; case INTEL_PT_MTC: @@ -733,6 +769,11 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info) case INTEL_PT_TIP_PGD: case INTEL_PT_TRACESTOP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: case INTEL_PT_OVF: case INTEL_PT_BAD: /* Does not happen */ default: @@ -898,6 +939,7 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder, decoder->tot_insn_cnt += insn_cnt; decoder->timestamp_insn_cnt += insn_cnt; + decoder->sample_insn_cnt += insn_cnt; decoder->period_insn_cnt += insn_cnt; if (err) { @@ -990,6 +1032,57 @@ out_no_progress: return err; } +static bool intel_pt_fup_event(struct intel_pt_decoder *decoder) +{ + bool ret = false; + + if (decoder->set_fup_tx_flags) { + decoder->set_fup_tx_flags = false; + decoder->tx_flags = decoder->fup_tx_flags; + decoder->state.type = INTEL_PT_TRANSACTION; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.flags = decoder->fup_tx_flags; + return true; + } + if (decoder->set_fup_ptw) { + decoder->set_fup_ptw = false; + decoder->state.type = INTEL_PT_PTW; + decoder->state.flags |= INTEL_PT_FUP_IP; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.ptw_payload = decoder->fup_ptw_payload; + return true; + } + if (decoder->set_fup_mwait) { + decoder->set_fup_mwait = false; + decoder->state.type = INTEL_PT_MWAIT_OP; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.mwait_payload = decoder->fup_mwait_payload; + ret = true; + } + if (decoder->set_fup_pwre) { + decoder->set_fup_pwre = false; + decoder->state.type |= INTEL_PT_PWR_ENTRY; + decoder->state.type &= ~INTEL_PT_BRANCH; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.pwre_payload = decoder->fup_pwre_payload; + ret = true; + } + if (decoder->set_fup_exstop) { + decoder->set_fup_exstop = false; + decoder->state.type |= INTEL_PT_EX_STOP; + decoder->state.type &= ~INTEL_PT_BRANCH; + decoder->state.flags |= INTEL_PT_FUP_IP; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + ret = true; + } + return ret; +} + static int intel_pt_walk_fup(struct intel_pt_decoder *decoder) { struct intel_pt_insn intel_pt_insn; @@ -1003,15 +1096,8 @@ static int intel_pt_walk_fup(struct intel_pt_decoder *decoder) if (err == INTEL_PT_RETURN) return 0; if (err == -EAGAIN) { - if (decoder->set_fup_tx_flags) { - decoder->set_fup_tx_flags = false; - decoder->tx_flags = decoder->fup_tx_flags; - decoder->state.type = INTEL_PT_TRANSACTION; - decoder->state.from_ip = decoder->ip; - decoder->state.to_ip = 0; - 
decoder->state.flags = decoder->fup_tx_flags; + if (intel_pt_fup_event(decoder)) return 0; - } return err; } decoder->set_fup_tx_flags = false; @@ -1360,7 +1446,9 @@ static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder) static void intel_pt_calc_cbr(struct intel_pt_decoder *decoder) { - unsigned int cbr = decoder->packet.payload; + unsigned int cbr = decoder->packet.payload & 0xff; + + decoder->cbr_payload = decoder->packet.payload; if (decoder->cbr == cbr) return; @@ -1417,6 +1505,13 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) case INTEL_PT_TRACESTOP: case INTEL_PT_BAD: case INTEL_PT_PSB: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: decoder->have_tma = false; intel_pt_log("ERROR: Unexpected packet\n"); return -EAGAIN; @@ -1446,7 +1541,8 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) case INTEL_PT_FUP: decoder->pge = true; - intel_pt_set_last_ip(decoder); + if (decoder->packet.count) + intel_pt_set_last_ip(decoder); break; case INTEL_PT_MODE_TSX: @@ -1497,6 +1593,13 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) case INTEL_PT_MODE_TSX: case INTEL_PT_BAD: case INTEL_PT_PSBEND: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: intel_pt_log("ERROR: Missing TIP after FUP\n"); decoder->pkt_state = INTEL_PT_STATE_ERR3; return -ENOENT; @@ -1625,6 +1728,15 @@ next: break; } intel_pt_set_last_ip(decoder); + if (!decoder->branch_enable) { + decoder->ip = decoder->last_ip; + if (intel_pt_fup_event(decoder)) + return 0; + no_tip = false; + break; + } + if (decoder->set_fup_mwait) + no_tip = true; err = intel_pt_walk_fup(decoder); if (err != -EAGAIN) { if (err) @@ -1650,6 +1762,8 @@ next: break; case INTEL_PT_PSB: + decoder->last_ip = 0; + decoder->have_last_ip = true; intel_pt_clear_stack(&decoder->stack); err = intel_pt_walk_psbend(decoder); if (err == -EAGAIN) @@ -1696,6 +1810,16 @@ next: case INTEL_PT_CBR: intel_pt_calc_cbr(decoder); + if (!decoder->branch_enable && + decoder->cbr != decoder->cbr_seen) { + decoder->cbr_seen = decoder->cbr; + decoder->state.type = INTEL_PT_CBR_CHG; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.cbr_payload = + decoder->packet.payload; + return 0; + } break; case INTEL_PT_MODE_EXEC: @@ -1722,6 +1846,71 @@ next: case INTEL_PT_PAD: break; + case INTEL_PT_PTWRITE_IP: + decoder->fup_ptw_payload = decoder->packet.payload; + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + if (decoder->packet.type == INTEL_PT_FUP) { + decoder->set_fup_ptw = true; + no_tip = true; + } else { + intel_pt_log_at("ERROR: Missing FUP after PTWRITE", + decoder->pos); + } + goto next; + + case INTEL_PT_PTWRITE: + decoder->state.type = INTEL_PT_PTW; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.ptw_payload = decoder->packet.payload; + return 0; + + case INTEL_PT_MWAIT: + decoder->fup_mwait_payload = decoder->packet.payload; + decoder->set_fup_mwait = true; + break; + + case INTEL_PT_PWRE: + if (decoder->set_fup_mwait) { + decoder->fup_pwre_payload = + decoder->packet.payload; + decoder->set_fup_pwre = true; + break; + } + decoder->state.type = INTEL_PT_PWR_ENTRY; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.pwrx_payload = 
decoder->packet.payload; + return 0; + + case INTEL_PT_EXSTOP_IP: + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + if (decoder->packet.type == INTEL_PT_FUP) { + decoder->set_fup_exstop = true; + no_tip = true; + } else { + intel_pt_log_at("ERROR: Missing FUP after EXSTOP", + decoder->pos); + } + goto next; + + case INTEL_PT_EXSTOP: + decoder->state.type = INTEL_PT_EX_STOP; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + return 0; + + case INTEL_PT_PWRX: + decoder->state.type = INTEL_PT_PWR_EXIT; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.pwrx_payload = decoder->packet.payload; + return 0; + default: return intel_pt_bug(decoder); } @@ -1730,8 +1919,9 @@ next: static inline bool intel_pt_have_ip(struct intel_pt_decoder *decoder) { - return decoder->last_ip || decoder->packet.count == 0 || - decoder->packet.count == 3 || decoder->packet.count == 6; + return decoder->packet.count && + (decoder->have_last_ip || decoder->packet.count == 3 || + decoder->packet.count == 6); } /* Walk PSB+ packets to get in sync. */ @@ -1750,6 +1940,13 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) __fallthrough; case INTEL_PT_TIP_PGE: case INTEL_PT_TIP: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: intel_pt_log("ERROR: Unexpected packet\n"); return -ENOENT; @@ -1854,14 +2051,10 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) break; case INTEL_PT_FUP: - if (decoder->overflow) { - if (intel_pt_have_ip(decoder)) - intel_pt_set_ip(decoder); - if (decoder->ip) - return 0; - } - if (decoder->packet.count) - intel_pt_set_last_ip(decoder); + if (intel_pt_have_ip(decoder)) + intel_pt_set_ip(decoder); + if (decoder->ip) + return 0; break; case INTEL_PT_MTC: @@ -1910,6 +2103,9 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) break; case INTEL_PT_PSB: + decoder->last_ip = 0; + decoder->have_last_ip = true; + intel_pt_clear_stack(&decoder->stack); err = intel_pt_walk_psb(decoder); if (err) return err; @@ -1925,6 +2121,13 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) case INTEL_PT_VMCS: case INTEL_PT_MNT: case INTEL_PT_PAD: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: default: break; } @@ -1935,6 +2138,19 @@ static int intel_pt_sync_ip(struct intel_pt_decoder *decoder) { int err; + decoder->set_fup_tx_flags = false; + decoder->set_fup_ptw = false; + decoder->set_fup_mwait = false; + decoder->set_fup_pwre = false; + decoder->set_fup_exstop = false; + + if (!decoder->branch_enable) { + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + decoder->overflow = false; + decoder->state.type = 0; /* Do not have a sample */ + return 0; + } + intel_pt_log("Scanning for full IP\n"); err = intel_pt_walk_to_ip(decoder); if (err) @@ -2043,6 +2259,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder) decoder->pge = false; decoder->continuous_period = false; + decoder->have_last_ip = false; decoder->last_ip = 0; decoder->ip = 0; intel_pt_clear_stack(&decoder->stack); @@ -2051,6 +2268,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder) if (err) return err; + decoder->have_last_ip = true; decoder->pkt_state = INTEL_PT_STATE_NO_IP; err = intel_pt_walk_psb(decoder); @@ -2069,7 +2287,7 @@ static int intel_pt_sync(struct 
intel_pt_decoder *decoder) static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder) { - uint64_t est = decoder->timestamp_insn_cnt << 1; + uint64_t est = decoder->sample_insn_cnt << 1; if (!decoder->cbr || !decoder->max_non_turbo_ratio) goto out; @@ -2077,7 +2295,7 @@ static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder) est *= decoder->max_non_turbo_ratio; est /= decoder->cbr; out: - return decoder->timestamp + est; + return decoder->sample_timestamp + est; } const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) @@ -2093,8 +2311,10 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) err = intel_pt_sync(decoder); break; case INTEL_PT_STATE_NO_IP: + decoder->have_last_ip = false; decoder->last_ip = 0; - /* Fall through */ + decoder->ip = 0; + __fallthrough; case INTEL_PT_STATE_ERR_RESYNC: err = intel_pt_sync_ip(decoder); break; @@ -2130,15 +2350,29 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) } } while (err == -ENOLINK); - decoder->state.err = err ? intel_pt_ext_err(err) : 0; - decoder->state.timestamp = decoder->timestamp; + if (err) { + decoder->state.err = intel_pt_ext_err(err); + decoder->state.from_ip = decoder->ip; + decoder->sample_timestamp = decoder->timestamp; + decoder->sample_insn_cnt = decoder->timestamp_insn_cnt; + } else { + decoder->state.err = 0; + if (decoder->cbr != decoder->cbr_seen && decoder->state.type) { + decoder->cbr_seen = decoder->cbr; + decoder->state.type |= INTEL_PT_CBR_CHG; + decoder->state.cbr_payload = decoder->cbr_payload; + } + if (intel_pt_sample_time(decoder->pkt_state)) { + decoder->sample_timestamp = decoder->timestamp; + decoder->sample_insn_cnt = decoder->timestamp_insn_cnt; + } + } + + decoder->state.timestamp = decoder->sample_timestamp; decoder->state.est_timestamp = intel_pt_est_timestamp(decoder); decoder->state.cr3 = decoder->cr3; decoder->state.tot_insn_cnt = decoder->tot_insn_cnt; - if (err) - decoder->state.from_ip = decoder->ip; - return &decoder->state; } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index e90619a43c0c..921b22e8ca0e 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -25,11 +25,18 @@ #define INTEL_PT_IN_TX (1 << 0) #define INTEL_PT_ABORT_TX (1 << 1) #define INTEL_PT_ASYNC (1 << 2) +#define INTEL_PT_FUP_IP (1 << 3) enum intel_pt_sample_type { INTEL_PT_BRANCH = 1 << 0, INTEL_PT_INSTRUCTION = 1 << 1, INTEL_PT_TRANSACTION = 1 << 2, + INTEL_PT_PTW = 1 << 3, + INTEL_PT_MWAIT_OP = 1 << 4, + INTEL_PT_PWR_ENTRY = 1 << 5, + INTEL_PT_EX_STOP = 1 << 6, + INTEL_PT_PWR_EXIT = 1 << 7, + INTEL_PT_CBR_CHG = 1 << 8, }; enum intel_pt_period_type { @@ -63,6 +70,11 @@ struct intel_pt_state { uint64_t timestamp; uint64_t est_timestamp; uint64_t trace_nr; + uint64_t ptw_payload; + uint64_t mwait_payload; + uint64_t pwre_payload; + uint64_t pwrx_payload; + uint64_t cbr_payload; uint32_t flags; enum intel_pt_insn_op insn_op; int insn_len; @@ -87,6 +99,7 @@ struct intel_pt_params { bool (*pgd_ip)(uint64_t ip, void *data); void *data; bool return_compression; + bool branch_enable; uint64_t period; enum intel_pt_period_type period_type; unsigned max_non_turbo_ratio; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h index debe751dc3d6..45b64f93f358 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.h +++ 
b/tools/perf/util/intel-pt-decoder/intel-pt-log.h @@ -16,6 +16,7 @@ #ifndef INCLUDE__INTEL_PT_LOG_H__ #define INCLUDE__INTEL_PT_LOG_H__ +#include <linux/compiler.h> #include <stdint.h> #include <inttypes.h> @@ -34,8 +35,7 @@ void __intel_pt_log_insn(struct intel_pt_insn *intel_pt_insn, uint64_t ip); void __intel_pt_log_insn_no_data(struct intel_pt_insn *intel_pt_insn, uint64_t ip); -__attribute__((format(printf, 1, 2))) -void __intel_pt_log(const char *fmt, ...); +void __intel_pt_log(const char *fmt, ...) __printf(1, 2); #define intel_pt_log(fmt, ...) \ do { \ diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c index 7528ae4f7e28..ba4c9dd18643 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c @@ -64,6 +64,13 @@ static const char * const packet_name[] = { [INTEL_PT_PIP] = "PIP", [INTEL_PT_OVF] = "OVF", [INTEL_PT_MNT] = "MNT", + [INTEL_PT_PTWRITE] = "PTWRITE", + [INTEL_PT_PTWRITE_IP] = "PTWRITE", + [INTEL_PT_EXSTOP] = "EXSTOP", + [INTEL_PT_EXSTOP_IP] = "EXSTOP", + [INTEL_PT_MWAIT] = "MWAIT", + [INTEL_PT_PWRE] = "PWRE", + [INTEL_PT_PWRX] = "PWRX", }; const char *intel_pt_pkt_name(enum intel_pt_pkt_type type) @@ -123,7 +130,7 @@ static int intel_pt_get_cbr(const unsigned char *buf, size_t len, if (len < 4) return INTEL_PT_NEED_MORE_BYTES; packet->type = INTEL_PT_CBR; - packet->payload = buf[2]; + packet->payload = le16_to_cpu(*(uint16_t *)(buf + 2)); return 4; } @@ -217,12 +224,80 @@ static int intel_pt_get_3byte(const unsigned char *buf, size_t len, } } +static int intel_pt_get_ptwrite(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + packet->count = (buf[1] >> 5) & 0x3; + packet->type = buf[1] & BIT(7) ? 
INTEL_PT_PTWRITE_IP : + INTEL_PT_PTWRITE; + + switch (packet->count) { + case 0: + if (len < 6) + return INTEL_PT_NEED_MORE_BYTES; + packet->payload = le32_to_cpu(*(uint32_t *)(buf + 2)); + return 6; + case 1: + if (len < 10) + return INTEL_PT_NEED_MORE_BYTES; + packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2)); + return 10; + default: + return INTEL_PT_BAD_PACKET; + } +} + +static int intel_pt_get_exstop(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_EXSTOP; + return 2; +} + +static int intel_pt_get_exstop_ip(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_EXSTOP_IP; + return 2; +} + +static int intel_pt_get_mwait(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 10) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_MWAIT; + packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2)); + return 10; +} + +static int intel_pt_get_pwre(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 4) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_PWRE; + memcpy_le64(&packet->payload, buf + 2, 2); + return 4; +} + +static int intel_pt_get_pwrx(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 7) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_PWRX; + memcpy_le64(&packet->payload, buf + 2, 5); + return 7; +} + static int intel_pt_get_ext(const unsigned char *buf, size_t len, struct intel_pt_pkt *packet) { if (len < 2) return INTEL_PT_NEED_MORE_BYTES; + if ((buf[1] & 0x1f) == 0x12) + return intel_pt_get_ptwrite(buf, len, packet); + switch (buf[1]) { case 0xa3: /* Long TNT */ return intel_pt_get_long_tnt(buf, len, packet); @@ -244,6 +319,16 @@ static int intel_pt_get_ext(const unsigned char *buf, size_t len, return intel_pt_get_tma(buf, len, packet); case 0xC3: /* 3-byte header */ return intel_pt_get_3byte(buf, len, packet); + case 0x62: /* EXSTOP no IP */ + return intel_pt_get_exstop(packet); + case 0xE2: /* EXSTOP with IP */ + return intel_pt_get_exstop_ip(packet); + case 0xC2: /* MWAIT */ + return intel_pt_get_mwait(buf, len, packet); + case 0x22: /* PWRE */ + return intel_pt_get_pwre(buf, len, packet); + case 0xA2: /* PWRX */ + return intel_pt_get_pwrx(buf, len, packet); default: return INTEL_PT_BAD_PACKET; } @@ -522,6 +607,29 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, ret = snprintf(buf, buf_len, "%s 0x%llx (NR=%d)", name, payload, nr); return ret; + case INTEL_PT_PTWRITE: + return snprintf(buf, buf_len, "%s 0x%llx IP:0", name, payload); + case INTEL_PT_PTWRITE_IP: + return snprintf(buf, buf_len, "%s 0x%llx IP:1", name, payload); + case INTEL_PT_EXSTOP: + return snprintf(buf, buf_len, "%s IP:0", name); + case INTEL_PT_EXSTOP_IP: + return snprintf(buf, buf_len, "%s IP:1", name); + case INTEL_PT_MWAIT: + return snprintf(buf, buf_len, "%s 0x%llx Hints 0x%x Extensions 0x%x", + name, payload, (unsigned int)(payload & 0xff), + (unsigned int)((payload >> 32) & 0x3)); + case INTEL_PT_PWRE: + return snprintf(buf, buf_len, "%s 0x%llx HW:%u CState:%u Sub-CState:%u", + name, payload, !!(payload & 0x80), + (unsigned int)((payload >> 12) & 0xf), + (unsigned int)((payload >> 8) & 0xf)); + case INTEL_PT_PWRX: + return snprintf(buf, buf_len, "%s 0x%llx Last CState:%u Deepest CState:%u Wake Reason 0x%x", + name, payload, + (unsigned int)((payload >> 4) & 0xf), + (unsigned int)(payload & 0xf), + (unsigned int)((payload >> 8) & 0xf)); default: break; } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h 
b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h index 781bb79883bd..73ddc3a88d07 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h @@ -52,6 +52,13 @@ enum intel_pt_pkt_type { INTEL_PT_PIP, INTEL_PT_OVF, INTEL_PT_MNT, + INTEL_PT_PTWRITE, + INTEL_PT_PTWRITE_IP, + INTEL_PT_EXSTOP, + INTEL_PT_EXSTOP_IP, + INTEL_PT_MWAIT, + INTEL_PT_PWRE, + INTEL_PT_PWRX, }; struct intel_pt_pkt { diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 4c7718f87a08..6df836469f2b 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -668,6 +668,19 @@ static bool intel_pt_return_compression(struct intel_pt *pt) return true; } +static bool intel_pt_branch_enable(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + u64 config; + + evlist__for_each_entry(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, &config) && + (config & 1) && !(config & 0x2000)) + return false; + } + return true; +} + static unsigned int intel_pt_mtc_period(struct intel_pt *pt) { struct perf_evsel *evsel; @@ -799,6 +812,7 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, params.walk_insn = intel_pt_walk_next_insn; params.data = ptq; params.return_compression = intel_pt_return_compression(pt); + params.branch_enable = intel_pt_branch_enable(pt); params.max_non_turbo_ratio = pt->max_non_turbo_ratio; params.mtc_period = intel_pt_mtc_period(pt); params.tsc_ctc_ratio_n = pt->tsc_ctc_ratio_n; @@ -1308,18 +1322,14 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) ptq->have_sample = false; if (pt->sample_instructions && - (state->type & INTEL_PT_INSTRUCTION) && - (!pt->synth_opts.initial_skip || - pt->num_events++ >= pt->synth_opts.initial_skip)) { + (state->type & INTEL_PT_INSTRUCTION)) { err = intel_pt_synth_instruction_sample(ptq); if (err) return err; } if (pt->sample_transactions && - (state->type & INTEL_PT_TRANSACTION) && - (!pt->synth_opts.initial_skip || - pt->num_events++ >= pt->synth_opts.initial_skip)) { + (state->type & INTEL_PT_TRANSACTION)) { err = intel_pt_synth_transaction_sample(ptq); if (err) return err; @@ -2025,6 +2035,7 @@ static int intel_pt_synth_events(struct intel_pt *pt, return err; } pt->sample_transactions = true; + pt->transactions_sample_type = attr.sample_type; pt->transactions_id = id; id += 1; evlist__for_each_entry(evlist, evsel) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index ea7f450dc609..389e9729331f 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -2,6 +2,7 @@ #define __PMU_H #include <linux/bitmap.h> +#include <linux/compiler.h> #include <linux/perf_event.h> #include <stdbool.h> #include "evsel.h" @@ -83,8 +84,7 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet, bool long_desc, bool details_flag); bool pmu_have_event(const char *pname, const char *name); -int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, - ...) __attribute__((format(scanf, 3, 4))); +int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, ...) 
__scanf(3, 4); int perf_pmu__test(void); diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 373842656fb6..5812947418dd 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -1,6 +1,7 @@ #ifndef _PROBE_EVENT_H #define _PROBE_EVENT_H +#include <linux/compiler.h> #include <stdbool.h> #include "intlist.h" @@ -171,8 +172,7 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev, struct symbol *sym); /* If there is no space to write, returns -E2BIG. */ -int e_snprintf(char *str, size_t size, const char *format, ...) - __attribute__((format(printf, 3, 4))); +int e_snprintf(char *str, size_t size, const char *format, ...) __printf(3, 4); /* Maximum index number of event-name postfix */ #define MAX_EVENT_INDEX 1024 diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 40de3cb40d21..57b7a00e6f16 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -28,6 +28,7 @@ #include <stdbool.h> #include <errno.h> #include <linux/bitmap.h> +#include <linux/compiler.h> #include <linux/time64.h> #include "../../perf.h" @@ -84,7 +85,7 @@ struct tables { static struct tables tables_global; -static void handler_call_die(const char *handler_name) NORETURN; +static void handler_call_die(const char *handler_name) __noreturn; static void handler_call_die(const char *handler_name) { PyErr_Print(); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 7dc1096264c5..d19c40a81040 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2035,7 +2035,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, if (!(evsel->attr.sample_type & PERF_SAMPLE_CPU)) { pr_err("File does not contain CPU events. 
" - "Remove -c option to proceed.\n"); + "Remove -C option to proceed.\n"); return -1; } } diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index ac10cc675d39..719d6cb86952 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -44,6 +44,8 @@ static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS]; +static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS]; +static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS]; static struct rblist runtime_saved_values; static bool have_frontend_stalled; @@ -157,6 +159,8 @@ void perf_stat__reset_shadow_stats(void) memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued)); memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles)); memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles)); + memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats)); + memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats)); next = rb_first(&runtime_saved_values.entries); while (next) { @@ -217,6 +221,10 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]); + else if (perf_stat_evsel__is(counter, SMI_NUM)) + update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]); + else if (perf_stat_evsel__is(counter, APERF)) + update_stats(&runtime_aperf_stats[ctx][cpu], count[0]); if (counter->collect_stat) { struct saved_value *v = saved_value_lookup(counter, cpu, ctx, @@ -592,6 +600,29 @@ static double td_be_bound(int ctx, int cpu) return sanitize_val(1.0 - sum); } +static void print_smi_cost(int cpu, struct perf_evsel *evsel, + struct perf_stat_output_ctx *out) +{ + double smi_num, aperf, cycles, cost = 0.0; + int ctx = evsel_context(evsel); + const char *color = NULL; + + smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]); + aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]); + cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]); + + if ((cycles == 0) || (aperf == 0)) + return; + + if (smi_num) + cost = (aperf - cycles) / aperf * 100.00; + + if (cost > 10) + color = PERF_COLOR_RED; + out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost); + out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num); +} + void perf_stat__print_shadow_stats(struct perf_evsel *evsel, double avg, int cpu, struct perf_stat_output_ctx *out) @@ -825,6 +856,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, } snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio); + } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { + print_smi_cost(cpu, evsel, out); } else { print_metric(ctxp, NULL, NULL, NULL, 0); } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index c58174443dc1..53b9a994a3dc 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -86,6 +86,8 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = { ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired), ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles), ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles), + ID(SMI_NUM, msr/smi/), + ID(APERF, msr/aperf/), }; #undef ID diff 
--git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 0a65ae23f495..7522bf10b03e 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -22,6 +22,8 @@ enum perf_stat_evsel_id { PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED, PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES, PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES, + PERF_STAT_EVSEL_ID__SMI_NUM, + PERF_STAT_EVSEL_ID__APERF, PERF_STAT_EVSEL_ID__MAX, }; diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h index 318424ea561d..802d743378af 100644 --- a/tools/perf/util/strbuf.h +++ b/tools/perf/util/strbuf.h @@ -42,6 +42,7 @@ #include <stdarg.h> #include <stddef.h> #include <string.h> +#include <linux/compiler.h> #include <sys/types.h> extern char strbuf_slopbuf[]; @@ -85,8 +86,7 @@ static inline int strbuf_addstr(struct strbuf *sb, const char *s) { return strbuf_add(sb, s, strlen(s)); } -__attribute__((format(printf,2,3))) -int strbuf_addf(struct strbuf *sb, const char *fmt, ...); +int strbuf_addf(struct strbuf *sb, const char *fmt, ...) __printf(2, 3); /* XXX: if read fails, any partial read is undone */ ssize_t strbuf_read(struct strbuf *, int fd, ssize_t hint); diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c index 996046a66fe5..aacb65e079aa 100644 --- a/tools/perf/util/usage.c +++ b/tools/perf/util/usage.c @@ -16,13 +16,13 @@ static void report(const char *prefix, const char *err, va_list params) fprintf(stderr, " %s%s\n", prefix, msg); } -static NORETURN void usage_builtin(const char *err) +static __noreturn void usage_builtin(const char *err) { fprintf(stderr, "\n Usage: %s\n", err); exit(129); } -static NORETURN void die_builtin(const char *err, va_list params) +static __noreturn void die_builtin(const char *err, va_list params) { report(" Fatal: ", err, params); exit(128); @@ -40,7 +40,7 @@ static void warn_builtin(const char *warn, va_list params) /* If we are in a dlopen()ed .so write to a global variable would segfault * (ugh), so keep things static. */ -static void (*usage_routine)(const char *err) NORETURN = usage_builtin; +static void (*usage_routine)(const char *err) __noreturn = usage_builtin; static void (*error_routine)(const char *err, va_list params) = error_builtin; static void (*warn_routine)(const char *err, va_list params) = warn_builtin; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 28c9f335006c..988111e0bab5 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -343,43 +343,6 @@ int perf_event_paranoid(void) return value; } - -bool find_process(const char *name) -{ - size_t len = strlen(name); - DIR *dir; - struct dirent *d; - int ret = -1; - - dir = opendir(procfs__mountpoint()); - if (!dir) - return false; - - /* Walk through the directory. */ - while (ret && (d = readdir(dir)) != NULL) { - char path[PATH_MAX]; - char *data; - size_t size; - - if ((d->d_type != DT_DIR) || - !strcmp(".", d->d_name) || - !strcmp("..", d->d_name)) - continue; - - scnprintf(path, sizeof(path), "%s/%s/comm", - procfs__mountpoint(), d->d_name); - - if (filename__read_str(path, &data, &size)) - continue; - - ret = strncmp(name, data, len); - free(data); - } - - closedir(dir); - return ret ? 
false : true; -} - static int fetch_ubuntu_kernel_version(unsigned int *puint) { @@ -387,8 +350,12 @@ fetch_ubuntu_kernel_version(unsigned int *puint) size_t line_len = 0; char *ptr, *line = NULL; int version, patchlevel, sublevel, err; - FILE *vsig = fopen("/proc/version_signature", "r"); + FILE *vsig; + + if (!puint) + return 0; + vsig = fopen("/proc/version_signature", "r"); if (!vsig) { pr_debug("Open /proc/version_signature failed: %s\n", strerror(errno)); @@ -418,8 +385,7 @@ fetch_ubuntu_kernel_version(unsigned int *puint) goto errout; } - if (puint) - *puint = (version << 16) + (patchlevel << 8) + sublevel; + *puint = (version << 16) + (patchlevel << 8) + sublevel; err = 0; errout: free(line); @@ -446,6 +412,9 @@ fetch_kernel_version(unsigned int *puint, char *str, str[str_size - 1] = '\0'; } + if (!puint || int_ver_ready) + return 0; + err = sscanf(utsname.release, "%d.%d.%d", &version, &patchlevel, &sublevel); @@ -455,8 +424,7 @@ fetch_kernel_version(unsigned int *puint, char *str, return -1; } - if (puint && !int_ver_ready) - *puint = (version << 16) + (patchlevel << 8) + sublevel; + *puint = (version << 16) + (patchlevel << 8) + sublevel; return 0; } diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 5dfb9bb6482d..978572dfeb14 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -1,7 +1,6 @@ #ifndef GIT_COMPAT_UTIL_H #define GIT_COMPAT_UTIL_H -#define _ALL_SOURCE 1 #define _BSD_SOURCE 1 /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */ #define _DEFAULT_SOURCE 1 @@ -11,22 +10,14 @@ #include <stddef.h> #include <stdlib.h> #include <stdarg.h> +#include <linux/compiler.h> #include <linux/types.h> -#ifdef __GNUC__ -#define NORETURN __attribute__((__noreturn__)) -#else -#define NORETURN -#ifndef __attribute__ -#define __attribute__(x) -#endif -#endif - /* General helper functions */ -void usage(const char *err) NORETURN; -void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2))); -int error(const char *err, ...) __attribute__((format (printf, 1, 2))); -void warning(const char *err, ...) __attribute__((format (printf, 1, 2))); +void usage(const char *err) __noreturn; +void die(const char *err, ...) __noreturn __printf(1, 2); +int error(const char *err, ...) __printf(1, 2); +void warning(const char *err, ...) 
__printf(1, 2); void set_warning_routine(void (*routine)(const char *err, va_list params)); @@ -57,8 +48,6 @@ int hex2u64(const char *ptr, u64 *val); extern unsigned int page_size; extern int cacheline_size; -bool find_process(const char *name); - int fetch_kernel_version(unsigned int *puint, char *str, size_t str_sz); #define KVER_VERSION(x) (((x) >> 16) & 0xff) diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh index eee31e261bf7..70fca318a82b 100755 --- a/tools/testing/selftests/rcutorture/bin/configcheck.sh +++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh @@ -27,7 +27,7 @@ cat $1 > $T/.config cat $2 | sed -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' | awk ' -BEGIN { +{ print "if grep -q \"" $0 "\" < '"$T/.config"'"; print "then"; print "\t:"; diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 00cb0db2643d..c29f2ec0bf9f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -45,7 +45,7 @@ T=/tmp/test-linux.sh.$$ trap 'rm -rf $T' 0 mkdir $T -grep -v 'CONFIG_[A-Z]*_TORTURE_TEST' < ${config_template} > $T/config +grep -v 'CONFIG_[A-Z]*_TORTURE_TEST=' < ${config_template} > $T/config cat << ___EOF___ >> $T/config CONFIG_INITRAMFS_SOURCE="$TORTURE_INITRD" CONFIG_VIRTIO_PCI=y diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 3b3c1b693ee1..50091de3a911 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -296,10 +296,7 @@ if test -d .git then git status >> $resdir/$ds/testid.txt git rev-parse HEAD >> $resdir/$ds/testid.txt - if ! 
git diff HEAD > $T/git-diff 2>&1 - then - cp $T/git-diff $resdir/$ds - fi + git diff HEAD >> $resdir/$ds/testid.txt fi ___EOF___ awk < $T/cfgcpu.pack \ diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index a3a1a05a2b5c..6a0b9f69faad 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -9,6 +9,8 @@ TREE08 TREE09 SRCU-N SRCU-P +SRCU-t +SRCU-u TINY01 TINY02 TASKS01 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot new file mode 100644 index 000000000000..84a7d51b7481 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot @@ -0,0 +1 @@ +rcutorture.torture_type=srcud diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N index 1a087c3c8bb8..2da8b49589a0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N @@ -5,4 +5,4 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n -CONFIG_RCU_EXPERT=y +#CHECK#CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P index 4837430a71c0..ab7ccd38232b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P @@ -2,7 +2,11 @@ CONFIG_RCU_TRACE=n CONFIG_SMP=y CONFIG_NR_CPUS=8 CONFIG_HOTPLUG_CPU=y +CONFIG_RCU_EXPERT=y +CONFIG_RCU_FANOUT=2 +CONFIG_RCU_FANOUT_LEAF=2 CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y -#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t new file mode 100644 index 000000000000..6c78022c8cd8 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t @@ -0,0 +1,10 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_SRCU=y +CONFIG_RCU_TRACE=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_DEBUG_ATOMIC_SLEEP=y +#CHECK#CONFIG_PREEMPT_COUNT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot new file mode 100644 index 000000000000..238bfe3bd0cc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot @@ -0,0 +1 @@ +rcutorture.torture_type=srcu diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u new file mode 100644 index 000000000000..6bc24e99862f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u @@ -0,0 +1,9 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_SRCU=y +CONFIG_RCU_TRACE=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot new file mode 100644 index 000000000000..84a7d51b7481 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot @@ -0,0 +1 @@ +rcutorture.torture_type=srcud diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 
b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 index a59f7686e219..d8674264318d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 @@ -6,10 +6,9 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=y CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=n -CONFIG_RCU_TRACE=y CONFIG_PROVE_LOCKING=y -CONFIG_PROVE_RCU_REPEATEDLY=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y -CONFIG_PREEMPT_COUNT=y +CONFIG_DEBUG_ATOMIC_SLEEP=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 359cb258f639..b5b53973c01e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -10,12 +10,9 @@ CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y CONFIG_MAXSMP=y +CONFIG_CPUMASK_OFFSTACK=y CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ZERO=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot index adc3abc82fb8..1d14e1383016 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot @@ -1 +1,5 @@ rcutorture.torture_type=rcu_bh maxcpus=8 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 +rcu_nocbs=0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 index c1ab5926568b..35e639e39366 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 @@ -18,9 +18,6 @@ CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y +CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 index 3b93ee544e70..2dc31b16e506 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 @@ -14,9 +14,5 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=y -CONFIG_RCU_KTHREAD_PRIO=2 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 120c0c88d100..5d2cc0bd50a0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1 +1,5 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 +rcutree.kthread_prio=2 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 5af758e783c7..27d22695d64c 100644 --- 
a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -15,11 +15,7 @@ CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 CONFIG_RCU_FANOUT_LEAF=3 -CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y CONFIG_RCU_EQS_DEBUG=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index d4cdc0d74e16..2dde0d9964e3 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -13,12 +13,8 @@ CONFIG_HOTPLUG_CPU=y CONFIG_RCU_FANOUT=6 CONFIG_RCU_FANOUT_LEAF=6 CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_NONE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot index 15b3e1a86f74..c7fd050dfcd9 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot @@ -1,2 +1,5 @@ rcutorture.torture_type=sched rcupdate.rcu_self_test_sched=1 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 index 4cb02bd28f08..05a4eec3f27b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 @@ -18,8 +18,6 @@ CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot index dd90f28ed700..ad18b52a2cad 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot @@ -2,3 +2,6 @@ rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1 rcutree.rcu_fanout_exact=1 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index b12a3ea1867e..0f4759f4232e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -1,6 +1,5 @@ CONFIG_SMP=y CONFIG_NR_CPUS=16 -CONFIG_CPUMASK_OFFSTACK=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n @@ -9,16 +8,11 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ_FULL_ALL=n -CONFIG_NO_HZ_FULL_SYSIDLE=y CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y CONFIG_RCU_FANOUT=2 CONFIG_RCU_FANOUT_LEAF=2 -CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y 
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 index 099cc63c6a3b..fb1c763c10c5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 @@ -15,7 +15,6 @@ CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T deleted file mode 100644 index 2ad13f0d29cc..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T +++ /dev/null @@ -1,21 +0,0 @@ -CONFIG_SMP=y -CONFIG_NR_CPUS=16 -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=y -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_FANOUT=3 -CONFIG_RCU_FANOUT_LEAF=2 -CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ALL=y -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot index fb066dc82769..1bd8efc4141e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=sched rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_sched=1 rcutree.rcu_fanout_exact=1 +rcu_nocbs=0-7 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcuperf/TINY index 917d2517b5b5..fb05ef5279b4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TINY @@ -1,21 +1,16 @@ -CONFIG_SMP=y -CONFIG_NR_CPUS=8 -CONFIG_PREEMPT_NONE=n +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=y -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_FANOUT=3 -CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE index a312f671a29a..721cfda76ab2 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -7,7 +7,6 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=n CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 index 985fb170d13c..7629f5dd73b2 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=n 
CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 24396ae8355b..a75b16991a92 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt @@ -18,7 +18,6 @@ CONFIG_PROVE_RCU In common code tested by TREE_RCU test cases. -CONFIG_NO_HZ_FULL_SYSIDLE CONFIG_RCU_NOCB_CPU Meaningless for TINY_RCU. diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 364801b1a230..9ad3f89c8dc7 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -9,28 +9,20 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one. CONFIG_HOTPLUG_CPU -- Do half. (Every second.) CONFIG_HZ_PERIODIC -- Do one. CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.) -CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE. -CONFIG_NO_HZ_FULL_SYSIDLE -- Do one. +CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement. CONFIG_PREEMPT -- Do half. (First three and #8.) CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. -CONFIG_PROVE_RCU_REPEATEDLY -- Do one. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. -CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. -CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL. -CONFIG_RCU_NOCB_CPU -- Do three, see below. -CONFIG_RCU_NOCB_CPU_ALL -- Do one. -CONFIG_RCU_NOCB_CPU_NONE -- Do one. -CONFIG_RCU_NOCB_CPU_ZERO -- Do one. +CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs. +CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with + rcu_nocbs=0, and one with all rcu_nocbs CPUs. CONFIG_RCU_TRACE -- Do half. CONFIG_SMP -- Need one !SMP for PREEMPT_RCU. CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations. CONFIG_RCU_EQS_DEBUG -- Do at least one for CONFIG_NO_HZ_FULL and not. -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -- Do for all but a couple TREE scenarios. -CONFIG_RCU_TORTURE_TEST_SLOW_INIT -- Do for all but a couple TREE scenarios. -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -- Do for all but a couple TREE scenarios. RCU-bh: Do one with PREEMPT and one with !PREEMPT. RCU-sched: Do one with PREEMPT but not BOOST. @@ -52,10 +44,6 @@ CONFIG_64BIT Used only to check CONFIG_RCU_FANOUT value, inspection suffices. -CONFIG_NO_HZ_FULL_SYSIDLE_SMALL - - Defer until Frederic uses this. - CONFIG_PREEMPT_COUNT CONFIG_PREEMPT_RCU @@ -78,30 +66,16 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE Always used in KVM testing. -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY -CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY - - Inspection suffices, ignore. - CONFIG_PREEMPT_RCU CONFIG_TREE_RCU CONFIG_TINY_RCU These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP. -CONFIG_SPARSE_RCU_POINTER - - Makes sense only for sparse runs, not for kernel builds. - CONFIG_SRCU CONFIG_TASKS_RCU Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable. -CONFIG_RCU_TRACE - - Implied by CONFIG_RCU_TRACE for Tree RCU. 
- boot parameters ignored: TBD diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk index 8ff89043d0a9..c9e8bc5082a7 100755 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk @@ -1,4 +1,4 @@ -#!/bin/awk -f +#!/usr/bin/awk -f # Modify SRCU for formal verification. The first argument should be srcu.h and # the second should be srcu.c. Outputs modified srcu.h and srcu.c into the diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile index 5fa1d7e9a915..5801bbefbe89 100644 --- a/tools/testing/selftests/timers/Makefile +++ b/tools/testing/selftests/timers/Makefile @@ -1,6 +1,6 @@ BUILD_FLAGS = -DKTEST CFLAGS += -O3 -Wl,-no-as-needed -Wall $(BUILD_FLAGS) -LDFLAGS += -lrt -lpthread +LDFLAGS += -lrt -lpthread -lm # these are all "safe" tests that don't modify # system time or require escalated privileges @@ -8,7 +8,7 @@ TEST_GEN_PROGS = posix_timers nanosleep nsleep-lat set-timer-lat mqueue-lat \ inconsistency-check raw_skew threadtest rtctest TEST_GEN_PROGS_EXTENDED = alarmtimer-suspend valid-adjtimex adjtick change_skew \ - skew_consistency clocksource-switch leap-a-day \ + skew_consistency clocksource-switch freq-step leap-a-day \ leapcrash set-tai set-2038 set-tz @@ -24,6 +24,7 @@ run_destructive_tests: run_tests ./change_skew ./skew_consistency ./clocksource-switch + ./freq-step ./leap-a-day -s -i 10 ./leapcrash ./set-tz diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c new file mode 100644 index 000000000000..e8c61830825a --- /dev/null +++ b/tools/testing/selftests/timers/freq-step.c @@ -0,0 +1,268 @@ +/* + * This test checks the response of the system clock to frequency + * steps made with adjtimex(). The frequency error and stability of + * the CLOCK_MONOTONIC clock relative to the CLOCK_MONOTONIC_RAW clock + * is measured in two intervals following the step. The test fails if + * values from the second interval exceed specified limits. + * + * Copyright (C) Miroslav Lichvar <mlichvar@redhat.com> 2017 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include <math.h> +#include <stdio.h> +#include <sys/timex.h> +#include <time.h> +#include <unistd.h> + +#include "../kselftest.h" + +#define SAMPLES 100 +#define SAMPLE_READINGS 10 +#define MEAN_SAMPLE_INTERVAL 0.1 +#define STEP_INTERVAL 1.0 +#define MAX_PRECISION 100e-9 +#define MAX_FREQ_ERROR 10e-6 +#define MAX_STDDEV 1000e-9 + +struct sample { + double offset; + double time; +}; + +static time_t mono_raw_base; +static time_t mono_base; +static long user_hz; +static double precision; +static double mono_freq_offset; + +static double diff_timespec(struct timespec *ts1, struct timespec *ts2) +{ + return ts1->tv_sec - ts2->tv_sec + (ts1->tv_nsec - ts2->tv_nsec) / 1e9; +} + +static double get_sample(struct sample *sample) +{ + double delay, mindelay = 0.0; + struct timespec ts1, ts2, ts3; + int i; + + for (i = 0; i < SAMPLE_READINGS; i++) { + clock_gettime(CLOCK_MONOTONIC_RAW, &ts1); + clock_gettime(CLOCK_MONOTONIC, &ts2); + clock_gettime(CLOCK_MONOTONIC_RAW, &ts3); + + ts1.tv_sec -= mono_raw_base; + ts2.tv_sec -= mono_base; + ts3.tv_sec -= mono_raw_base; + + delay = diff_timespec(&ts3, &ts1); + if (delay <= 1e-9) { + i--; + continue; + } + + if (!i || delay < mindelay) { + sample->offset = diff_timespec(&ts2, &ts1); + sample->offset -= delay / 2.0; + sample->time = ts1.tv_sec + ts1.tv_nsec / 1e9; + mindelay = delay; + } + } + + return mindelay; +} + +static void reset_ntp_error(void) +{ + struct timex txc; + + txc.modes = ADJ_SETOFFSET; + txc.time.tv_sec = 0; + txc.time.tv_usec = 0; + + if (adjtimex(&txc) < 0) { + perror("[FAIL] adjtimex"); + ksft_exit_fail(); + } +} + +static void set_frequency(double freq) +{ + struct timex txc; + int tick_offset; + + tick_offset = 1e6 * freq / user_hz; + + txc.modes = ADJ_TICK | ADJ_FREQUENCY; + txc.tick = 1000000 / user_hz + tick_offset; + txc.freq = (1e6 * freq - user_hz * tick_offset) * (1 << 16); + + if (adjtimex(&txc) < 0) { + perror("[FAIL] adjtimex"); + ksft_exit_fail(); + } +} + +static void regress(struct sample *samples, int n, double *intercept, + double *slope, double *r_stddev, double *r_max) +{ + double x, y, r, x_sum, y_sum, xy_sum, x2_sum, r2_sum; + int i; + + x_sum = 0.0, y_sum = 0.0, xy_sum = 0.0, x2_sum = 0.0; + + for (i = 0; i < n; i++) { + x = samples[i].time; + y = samples[i].offset; + + x_sum += x; + y_sum += y; + xy_sum += x * y; + x2_sum += x * x; + } + + *slope = (xy_sum - x_sum * y_sum / n) / (x2_sum - x_sum * x_sum / n); + *intercept = (y_sum - *slope * x_sum) / n; + + *r_max = 0.0, r2_sum = 0.0; + + for (i = 0; i < n; i++) { + x = samples[i].time; + y = samples[i].offset; + r = fabs(x * *slope + *intercept - y); + if (*r_max < r) + *r_max = r; + r2_sum += r * r; + } + + *r_stddev = sqrt(r2_sum / n); +} + +static int run_test(int calibration, double freq_base, double freq_step) +{ + struct sample samples[SAMPLES]; + double intercept, slope, stddev1, max1, stddev2, max2; + double freq_error1, freq_error2; + int i; + + set_frequency(freq_base); + + for (i = 0; i < 10; i++) + usleep(1e6 * MEAN_SAMPLE_INTERVAL / 10); + + reset_ntp_error(); + + set_frequency(freq_base + freq_step); + + for (i = 0; i < 10; i++) + usleep(rand() % 2000000 * STEP_INTERVAL / 10); + + set_frequency(freq_base); + + for (i = 0; i < SAMPLES; i++) { + usleep(rand() % 2000000 * MEAN_SAMPLE_INTERVAL); + get_sample(&samples[i]); + } + + if (calibration) { + regress(samples, SAMPLES, &intercept, &slope, &stddev1, &max1); + mono_freq_offset = slope; + printf("CLOCK_MONOTONIC_RAW frequency offset: %11.3f ppm\n", + 1e6 * mono_freq_offset); + return 0; + } 
+ + regress(samples, SAMPLES / 2, &intercept, &slope, &stddev1, &max1); + freq_error1 = slope * (1.0 - mono_freq_offset) - mono_freq_offset - + freq_base; + + regress(samples + SAMPLES / 2, SAMPLES / 2, &intercept, &slope, + &stddev2, &max2); + freq_error2 = slope * (1.0 - mono_freq_offset) - mono_freq_offset - + freq_base; + + printf("%6.0f %+10.3f %6.0f %7.0f %+10.3f %6.0f %7.0f\t", + 1e6 * freq_step, + 1e6 * freq_error1, 1e9 * stddev1, 1e9 * max1, + 1e6 * freq_error2, 1e9 * stddev2, 1e9 * max2); + + if (fabs(freq_error2) > MAX_FREQ_ERROR || stddev2 > MAX_STDDEV) { + printf("[FAIL]\n"); + return 1; + } + + printf("[OK]\n"); + return 0; +} + +static void init_test(void) +{ + struct timespec ts; + struct sample sample; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts)) { + perror("[FAIL] clock_gettime(CLOCK_MONOTONIC_RAW)"); + ksft_exit_fail(); + } + + mono_raw_base = ts.tv_sec; + + if (clock_gettime(CLOCK_MONOTONIC, &ts)) { + perror("[FAIL] clock_gettime(CLOCK_MONOTONIC)"); + ksft_exit_fail(); + } + + mono_base = ts.tv_sec; + + user_hz = sysconf(_SC_CLK_TCK); + + precision = get_sample(&sample) / 2.0; + printf("CLOCK_MONOTONIC_RAW+CLOCK_MONOTONIC precision: %.0f ns\t\t", + 1e9 * precision); + + if (precision > MAX_PRECISION) { + printf("[SKIP]\n"); + ksft_exit_skip(); + } + + printf("[OK]\n"); + srand(ts.tv_sec ^ ts.tv_nsec); + + run_test(1, 0.0, 0.0); +} + +int main(int argc, char **argv) +{ + double freq_base, freq_step; + int i, j, fails = 0; + + init_test(); + + printf("Checking response to frequency step:\n"); + printf(" Step 1st interval 2nd interval\n"); + printf(" Freq Dev Max Freq Dev Max\n"); + + for (i = 2; i >= 0; i--) { + for (j = 0; j < 5; j++) { + freq_base = (rand() % (1 << 24) - (1 << 23)) / 65536e6; + freq_step = 10e-6 * (1 << (6 * i)); + fails += run_test(0, freq_base, freq_step); + } + } + + set_frequency(0.0); + + if (fails) + ksft_exit_fail(); + + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c index caf1bc9257c4..74c60e8759a0 100644 --- a/tools/testing/selftests/timers/inconsistency-check.c +++ b/tools/testing/selftests/timers/inconsistency-check.c @@ -118,7 +118,7 @@ int consistency_test(int clock_type, unsigned long seconds) start_str = ctime(&t); while (seconds == -1 || now - then < seconds) { - inconsistent = 0; + inconsistent = -1; /* Fill list */ for (i = 0; i < CALLS_PER_LOOP; i++) @@ -130,7 +130,7 @@ int consistency_test(int clock_type, unsigned long seconds) inconsistent = i; /* display inconsistency */ - if (inconsistent) { + if (inconsistent >= 0) { unsigned long long delta; printf("\%s\n", start_str); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a8d540398bbd..9120edf3c94b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -184,7 +184,7 @@ int __attribute__((weak)) kvm_arch_set_irq_inatomic( * Called with wqh->lock held and interrupts disabled */ static int -irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct kvm_kernel_irqfd *irqfd = container_of(wait, struct kvm_kernel_irqfd, wait); |
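A note on the new tools/testing/selftests/timers/freq-step.c test above: its set_frequency() helper splits the requested frequency offset between adjtimex()'s coarse ADJ_TICK field (whole microseconds per tick) and the fine-grained ADJ_FREQUENCY field (16.16 fixed-point ppm). The stand-alone sketch below only restates that arithmetic with example numbers (user_hz = 100, a +123.4 ppm request); it is an illustration, not part of the patch.

/* Stand-alone illustration of the tick/freq split done by set_frequency()
 * in freq-step.c.  The input values are examples, not measurements. */
#include <stdio.h>

int main(void)
{
	long user_hz = 100;		/* e.g. sysconf(_SC_CLK_TCK) */
	double freq = 123.4e-6;		/* requested offset: +123.4 ppm */

	/* Whole microseconds per tick covered by the offset. */
	int tick_offset = 1e6 * freq / user_hz;		/* -> 1 us/tick */
	long tick = 1000000 / user_hz + tick_offset;	/* -> 10001 us  */

	/* The remainder goes into the 16.16 fixed-point scaled-ppm field. */
	long scaled_ppm = (1e6 * freq - user_hz * tick_offset) * (1 << 16);

	printf("tick=%ld freq=%ld (%.1f ppm via tick, %.1f ppm via freq)\n",
	       tick, scaled_ppm,
	       (double)(user_hz * tick_offset),
	       (double)scaled_ppm / (1 << 16));
	return 0;
}

Run on its own, this prints tick=10001 freq=1533542 (100.0 ppm via tick, 23.4 ppm via freq), i.e. the two adjtimex fields together reproduce the requested 123.4 ppm offset.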
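Further up, the SMI-cost metric added to tools/perf/util/stat-shadow.c reports the gap between the msr/aperf/ count and the cycles count as "SMI cycles%" whenever any SMIs were seen, and colors the result red above 10%. A minimal sketch of that arithmetic, with invented counter values:

/* Illustration of the percentage computed by print_smi_cost() in
 * stat-shadow.c.  The counter values below are invented examples. */
#include <stdio.h>

int main(void)
{
	double aperf   = 1000000.0;	/* msr/aperf/ count over the interval  */
	double cycles  =  900000.0;	/* cycles count over the same interval */
	double smi_num =       3.0;	/* msr/smi/ count                      */
	double cost    =       0.0;

	if (smi_num != 0.0 && aperf != 0.0 && cycles != 0.0)
		cost = (aperf - cycles) / aperf * 100.0;	/* -> 10.0 */

	printf("SMI cycles%%: %8.1f%%   SMI#: %4.0f\n", cost, smi_num);
	return 0;
}

With these numbers, 10% of the aperf cycles are unaccounted for by the cycles event and show up as SMI cost in perf stat's shadow output.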