summaryrefslogtreecommitdiff
path: root/rtl
diff options
context:
space:
mode:
authorflorian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2>2012-03-10 11:33:20 +0000
committerflorian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2>2012-03-10 11:33:20 +0000
commite0fc2c1eb188745a1d5584ac55340c2637a32ccb (patch)
tree5de9fa6b523d5fbc69d0248a11239c7979d6a5a8 /rtl
parentc343028cdb7f3b1cc4efc285c0497ecdffe2386f (diff)
downloadfpc-e0fc2c1eb188745a1d5584ac55340c2637a32ccb.tar.gz
o patch by Nico Erfurth: Better Locked* implementation for arm on linux
The following functions were changed to make use of the kernel helper kuser_cmpxchg: InterLockedDecrement InterLockedIncrement InterLockedExchangeAdd InterLockedCompareExchange The previous implementation using a spinlock had a couple of drawbacks: 1.) The functions could not be used safely on values not completely managed by the process itself, because the spinlock did not protect data but the functions. For example, think about two processes using shared memory. They would not be able to share fpc_system_lock, making it unsafe to use these functions. 2.) With many active threads, there was a high chance that the scheduler would interrupt a thread while fpc_system_lock was taken, which would result in the following threads using one of these functions to spinlock till the end of its timeslice. This could result in unwanted and unnecessary latencies. 3.) Every function contained a pointer to fpc_system_lock, resulting in two polluted DCache-Lines per call and possible latencies through dcache misses. The new implementation only works on Linux Kernel >= 2.6.16 The functions are implemented in a way which tries to minimize cache pollution and load latencies. Even without multithreading the new functions are a lot faster. I did comparisons on my Kirkwood 1.2GHz with the following template code: var X: longint; begin X := 0; while X < longint(100*1000000) do FUNCTION(X); Writeln(X); end. Function New Old InterLockedIncrement: 0m3.696s 0m23.220s InterLockedExchangeAdd: 0m4.034s 0m23.242s InterLockedCompareExchange: 0m4.703s 0m24.006s This speedup is most probably because of the reduced memory access, which resulted in lots of cache misses. git-svn-id: http://svn.freepascal.org/svn/fpc/trunk@20491 3ad0048d-3df7-0310-abae-a5850022a9f2
Diffstat (limited to 'rtl')
-rw-r--r--rtl/arm/arm.inc100
1 file changed, 100 insertions, 0 deletions
diff --git a/rtl/arm/arm.inc b/rtl/arm/arm.inc
index af82f19b99..7aeff925e3 100644
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@@ -561,6 +561,32 @@ asm
mov r0, r1
bx lr
{$else}
+{$if defined(linux)}
+
+ stmfd r13!, {lr}
+ mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
+.Latomic_dec_loop:
+ ldr r0, [r2] // Load the current value
+
+ // We expect this to work without looping most of the time
+ // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+ // loop here again, we have to reload the value. Normaly this just fills the
+ // load stall-cycles from the above ldr so in reality we'll not get any additional
+ // delays because of this
+ // Don't use ldr to load r3 to avoid cacheline trashing
+ // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
+ // the kuser_cmpxchg entry point
+ mvn r3, #0x0000f000
+ sub r3, r3, #0x3F
+
+ sub r1, r0, #1 // Decrement value
+ blx r3 // Call kuser_cmpxchg, sets C-Flag on success
+
+ movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
+ ldmcsfd r13!, {pc}
+ b .Latomic_dec_loop // kuser_cmpxchg sets C flag on error
+
+{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -580,6 +606,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
+{$endif}
end;
@@ -595,6 +622,32 @@ asm
mov r0, r1
bx lr
{$else}
+{$if defined(linux)}
+
+ stmfd r13!, {lr}
+ mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
+.Latomic_inc_loop:
+ ldr r0, [r2] // Load the current value
+
+ // We expect this to work without looping most of the time
+ // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+ // loop here again, we have to reload the value. Normaly this just fills the
+ // load stall-cycles from the above ldr so in reality we'll not get any additional
+ // delays because of this
+ // Don't use ldr to load r3 to avoid cacheline trashing
+ // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
+ // the kuser_cmpxchg entry point
+ mvn r3, #0x0000f000
+ sub r3, r3, #0x3F
+
+ add r1, r0, #1 // Decrement value
+ blx r3 // Call kuser_cmpxchg, sets C-Flag on success
+
+ movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
+ ldmcsfd r13!, {pc}
+ b .Latomic_inc_loop // kuser_cmpxchg sets C flag on error
+
+{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -614,6 +667,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
+{$endif}
end;
@@ -646,6 +700,33 @@ asm
mov r0, r2
bx lr
{$else}
+{$if defined(linux)}
+
+ stmfd r13!, {r4, lr}
+ mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
+ mov r4, r1 // Save addend
+.Latomic_add_loop:
+ ldr r0, [r2] // Load the current value
+
+ // We expect this to work without looping most of the time
+ // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+ // loop here again, we have to reload the value. Normaly this just fills the
+ // load stall-cycles from the above ldr so in reality we'll not get any additional
+ // delays because of this
+ // Don't use ldr to load r3 to avoid cacheline trashing
+ // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
+ // the kuser_cmpxchg entry point
+ mvn r3, #0x0000f000
+ sub r3, r3, #0x3F
+
+ add r1, r0, r4 // Add to value
+ blx r3 // Call kuser_cmpxchg, sets C-Flag on success
+
+ movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
+ ldmcsfd r13!, {r4, pc}
+ b .Latomic_add_loop // kuser_cmpxchg sets C flag on error
+
+{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r2, #1
@@ -666,6 +747,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
+{$endif}
end;
@@ -682,6 +764,23 @@ asm
mov r0, r3
bx lr
{$else}
+{$if defined(linux)}
+
+ stmfd r13!, {lr}
+
+ mvn r3, #0x0000f000
+ sub r3, r3, #0x3F
+
+ mov ip, r2 // Swap parameters around
+ mov r2, r0
+ mov r0, ip
+
+ blx r3 // Call kuser_cmpxchg sets C-Flag on success
+ ldrcc r0, [r2] // Load the currently set value on failure
+ // We could use "mov r0, r3" here, but thats undocumented
+ ldmfd r13!, {lr}
+
+{$else}
// lock
ldr r12, .Lfpc_system_lock
mov r3, #1
@@ -702,6 +801,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
+{$endif}
end;
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}