summary | refs | log | tree | commit | diff
path: root/libbb
diff options
context:
space:
mode:
author: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-04 01:45:13 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-04 01:45:52 +0100
commit: c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch)
tree: fc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c /libbb
parent: 1fc520ed286f815cae1da1e9f8014cb18a256744 (diff)
download: busybox-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz
libbb/sha1: x86_64 version: reorder prologue/epilogue insns
Not clear exactly why, but this increases hashing speed on Skylake from 454 MB/s to 464 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 60
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 67
2 files changed, 67 insertions, 60 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 95b85d80a..ff78fc049 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -6,14 +6,14 @@
.hidden sha1_process_block64
.type sha1_process_block64, @function
- .balign 8 # allow decoders to fetch at least 4 first insns
+ .balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
- pushq %r15 #
- pushq %r14 #
- pushq %r13 #
- pushq %r12 #
- pushq %rbp #
- pushq %rbx #
+ pushq %rbp # 1 byte insn
+ pushq %rbx # 1 byte insn
+ pushq %r15 # 2 byte insn
+ pushq %r14 # 2 byte insn
+ pushq %r13 # 2 byte insn
+ pushq %r12 # 2 byte insn
pushq %rdi # we need ctx at the end
#Register and stack use:
@@ -22,24 +22,6 @@ sha1_process_block64:
# esi,edi: temps
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-
- movq 4*8(%rdi), %r8
- bswapq %r8
- movl %r8d, %r9d
- shrq $32, %r8
- movq 4*10(%rdi), %r10
- bswapq %r10
- movl %r10d, %r11d
- shrq $32, %r10
- movq 4*12(%rdi), %r12
- bswapq %r12
- movl %r12d, %r13d
- shrq $32, %r12
- movq 4*14(%rdi), %r14
- bswapq %r14
- movl %r14d, %r15d
- shrq $32, %r14
-
movl $3, %eax
1:
movq (%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
movq %rsi, -32(%rsp,%rax,8)
decl %eax
jns 1b
+
movl 80(%rdi), %eax # a = ctx->hash[0]
movl 84(%rdi), %ebx # b = ctx->hash[1]
movl 88(%rdi), %ecx # c = ctx->hash[2]
movl 92(%rdi), %edx # d = ctx->hash[3]
movl 96(%rdi), %ebp # e = ctx->hash[4]
+ movq 4*8(%rdi), %r8
+ movq 4*10(%rdi), %r10
+ bswapq %r8
+ bswapq %r10
+ movq 4*12(%rdi), %r12
+ movq 4*14(%rdi), %r14
+ bswapq %r12
+ bswapq %r14
+ movl %r8d, %r9d
+ shrq $32, %r8
+ movl %r10d, %r11d
+ shrq $32, %r10
+ movl %r12d, %r13d
+ shrq $32, %r12
+ movl %r14d, %r15d
+ shrq $32, %r14
+
# 0
# W[0], already in %esi
movl %ecx, %edi # c
@@ -1272,17 +1272,17 @@ sha1_process_block64:
rorl $2, %ecx # b = rotl32(b,30)
popq %rdi #
+ popq %r12 #
addl %eax, 80(%rdi) # ctx->hash[0] += a
+ popq %r13 #
addl %ebx, 84(%rdi) # ctx->hash[1] += b
+ popq %r14 #
addl %ecx, 88(%rdi) # ctx->hash[2] += c
+ popq %r15 #
addl %edx, 92(%rdi) # ctx->hash[3] += d
- addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbx #
+ addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbp #
- popq %r12 #
- popq %r13 #
- popq %r14 #
- popq %r15 #
ret
.size sha1_process_block64, .-sha1_process_block64
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index c5f0ef504..7e50b64fb 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -15,14 +15,14 @@ echo \
.hidden sha1_process_block64
.type sha1_process_block64, @function
- .balign 8 # allow decoders to fetch at least 4 first insns
+ .balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
- pushq %r15 #
- pushq %r14 #
- pushq %r13 #
- pushq %r12 #
- pushq %rbp #
- pushq %rbx #
+ pushq %rbp # 1 byte insn
+ pushq %rbx # 1 byte insn
+ pushq %r15 # 2 byte insn
+ pushq %r14 # 2 byte insn
+ pushq %r13 # 2 byte insn
+ pushq %r12 # 2 byte insn
pushq %rdi # we need ctx at the end
#Register and stack use:
@@ -31,24 +31,6 @@ sha1_process_block64:
# esi,edi: temps
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-
- movq 4*8(%rdi), %r8
- bswapq %r8
- movl %r8d, %r9d
- shrq $32, %r8
- movq 4*10(%rdi), %r10
- bswapq %r10
- movl %r10d, %r11d
- shrq $32, %r10
- movq 4*12(%rdi), %r12
- bswapq %r12
- movl %r12d, %r13d
- shrq $32, %r12
- movq 4*14(%rdi), %r14
- bswapq %r14
- movl %r14d, %r15d
- shrq $32, %r14
-
movl $3, %eax
1:
movq (%rdi,%rax,8), %rsi
@@ -57,11 +39,29 @@ sha1_process_block64:
movq %rsi, -32(%rsp,%rax,8)
decl %eax
jns 1b
+
movl 80(%rdi), %eax # a = ctx->hash[0]
movl 84(%rdi), %ebx # b = ctx->hash[1]
movl 88(%rdi), %ecx # c = ctx->hash[2]
movl 92(%rdi), %edx # d = ctx->hash[3]
movl 96(%rdi), %ebp # e = ctx->hash[4]
+
+ movq 4*8(%rdi), %r8
+ movq 4*10(%rdi), %r10
+ bswapq %r8
+ bswapq %r10
+ movq 4*12(%rdi), %r12
+ movq 4*14(%rdi), %r14
+ bswapq %r12
+ bswapq %r14
+ movl %r8d, %r9d
+ shrq $32, %r8
+ movl %r10d, %r11d
+ shrq $32, %r10
+ movl %r12d, %r13d
+ shrq $32, %r12
+ movl %r14d, %r15d
+ shrq $32, %r14
'
W32() {
test "$1" || exit 1
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
test "$1" -ge 8 && echo "%r${1}d"
}
+# It's possible to interleave insns in rounds to mostly eliminate
+# dependency chains, but this is likely to only help old Pentium-based
+# CPUs (ones without OOO, which can only simultaneously execute a pair
+# of _adjacent_ insns).
+# Testing on old-ish Silvermont CPU (which has OOO window of only
+# about ~8 insns) shows very small (~1%) speedup.
+
RD1A() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
echo "
popq %rdi #
+ popq %r12 #
addl %eax, 80(%rdi) # ctx->hash[0] += a
+ popq %r13 #
addl %ebx, 84(%rdi) # ctx->hash[1] += b
+ popq %r14 #
addl %ecx, 88(%rdi) # ctx->hash[2] += c
+ popq %r15 #
addl %edx, 92(%rdi) # ctx->hash[3] += d
- addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbx #
+ addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbp #
- popq %r12 #
- popq %r13 #
- popq %r14 #
- popq %r15 #
ret
.size sha1_process_block64, .-sha1_process_block64