aarch64: Optimize SHA1 Compress

This patch optimizes the SHA1 compress function for the arm64 architecture by taking advantage of the SHA-1 instructions of the Armv8 crypto extension.

The SHA-1 instructions:
SHA1C: SHA1 hash update (choose)
SHA1H: SHA1 fixed rotate
SHA1M: SHA1 hash update (majority)
SHA1P: SHA1 hash update (parity)
SHA1SU0: SHA1 schedule update 0
SHA1SU1: SHA1 schedule update 1

Benchmark on gcc117 instance of CFarm before applying the patch:
Algorithm         mode           Mbyte/s
sha1              update         214.16
openssl sha1      update         849.44
hmac-sha1         64 bytes       61.69
hmac-sha1         256 bytes      131.50
hmac-sha1         1024 bytes     185.20
hmac-sha1         4096 bytes     204.55
hmac-sha1         single msg     210.97

Benchmark on gcc117 instance of CFarm after applying the patch:
Algorithm         mode           Mbyte/s
sha1              update         800.80
openssl sha1      update         849.17
hmac-sha1         64 bytes       166.10
hmac-sha1         256 bytes      409.24
hmac-sha1         1024 bytes     636.98
hmac-sha1         4096 bytes     739.20
hmac-sha1         single msg     775.67
author: Maamoun TK <maamoun.tk@googlemail.com> 2021-05-14 08:45:33 +0300
committer: Niels Möller <nisse@lysator.liu.se> 2021-06-01 18:20:52 +0200
commit: 47cafcf29951b7e9c5c1d1c4f34d29c6b2bf84c6 (patch)
tree: b64fa76f44dd016326d235b032d56a2aa4606dd7 /arm64
parent: a46a17e9f57c64984d5246aa3475e45f8c562ec7 (diff)
download: nettle-47cafcf29951b7e9c5c1d1c4f34d29c6b2bf84c6.tar.gz
3 files changed, 260 insertions, 0 deletions
diff --git a/arm64/README b/arm64/README
index d2745d57..206bb773 100644
--- a/arm64/README
+++ b/arm64/README
@@ -83,5 +83,12 @@ particular care must be taken if the loaded data is then to be regarded as
 elements of e.g. a doubleword vector. Indicies may appear reversed on
 big-endian systems (because they are).
 
+Hardware-accelerated SHA Instructions
+
+The SHA-optimized cores are implemented using the SHA hashing instructions
+that the crypto extensions add to AArch64. The repository [3] illustrates
+how to use those instructions to optimize SHA hashing functions.
+
 [1] https://github.com/ARM-software/abi-aa/releases/download/2020Q4/aapcs64.pdf
 [2] https://llvm.org/docs/BigEndianNEON.html
+[3] https://github.com/noloader/SHA-Intrinsics
diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
new file mode 100644
index 00000000..de3d7b7e
--- /dev/null
+++ b/arm64/crypto/sha1-compress.asm
@@ -0,0 +1,246 @@
+C arm64/crypto/sha1-compress.asm
+
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C This implementation uses the SHA-1 instructions of Armv8 crypto
+C extension.
+C SHA1C: SHA1 hash update (choose)
+C SHA1H: SHA1 fixed rotate
+C SHA1M: SHA1 hash update (majority)
+C SHA1P: SHA1 hash update (parity)
+C SHA1SU0: SHA1 schedule update 0
+C SHA1SU1: SHA1 schedule update 1
+
+.file "sha1-compress.asm"
+.arch armv8-a+crypto
+
+.text
+
+C Register usage:
+
+C Arguments of nettle_sha1_compress: pointer to the state words and
+C pointer to the message block
+define(`STATE', `x0')
+define(`INPUT', `x1')
+
+C The four round constants, each replicated in all four 32-bit lanes
+define(`CONST0', `v0')
+define(`CONST1', `v1')
+define(`CONST2', `v2')
+define(`CONST3', `v3')
+C The 64-byte message block; updated in place by the message schedule
+define(`MSG0', `v4')
+define(`MSG1', `v5')
+define(`MSG2', `v6')
+define(`MSG3', `v7')
+C State words 0-3, and a copy of their value at entry
+define(`ABCD', `v16')
+define(`ABCD_SAVED', `v17')
+C State word 4 (held in lane 0), and a copy of its value at entry
+define(`E0', `v18')
+define(`E0_SAVED', `v19')
+C Second E register; sha1h writes E0 and E1 alternately
+define(`E1', `v20')
+C Message words plus round constant for the current four rounds
+define(`TMP', `v21')
+
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+C
+C Runs the 80 SHA-1 rounds over one message block and adds the result
+C to the state.
+C   STATE (x0): five 32-bit state words, read at entry, rewritten at exit
+C   INPUT (x1): message block, 64 bytes are read
+C Scratch: w2/x2 and the vector registers named under "Register usage"
+C (v0-v7, v16-v21).  No stack is used and no callee-saved register is
+C touched.
+
+PROLOGUE(nettle_sha1_compress)
+    C Initialize constants: each 32-bit round constant is built in w2 and
+    C broadcast to all four lanes.  CONST0 is used by rounds 0-19,
+    C CONST1 by 20-39, CONST2 by 40-59 and CONST3 by 60-79.
+    mov            w2,#0x7999
+    movk           w2,#0x5A82,lsl #16
+    dup            CONST0.4s,w2
+    mov            w2,#0xEBA1
+    movk           w2,#0x6ED9,lsl #16
+    dup            CONST1.4s,w2
+    mov            w2,#0xBCDC
+    movk           w2,#0x8F1B,lsl #16
+    dup            CONST2.4s,w2
+    mov            w2,#0xC1D6
+    movk           w2,#0xCA62,lsl #16
+    dup            CONST3.4s,w2
+
+    C Load state: words 0-3 into ABCD, word 4 into lane 0 of E0 (the
+    C other lanes of E0 are cleared first).  x2 keeps the address of
+    C word 4 for the store at the end.
+    add            x2,STATE,#16
+    movi           E0.4s,#0
+    ld1            {ABCD.4s},[STATE]
+    ld1            {E0.s}[0],[x2]
+
+    C Save state, to be added back after the last round
+    mov            ABCD_SAVED.16b,ABCD.16b
+    mov            E0_SAVED.16b,E0.16b
+
+    C Load message (64 bytes, as plain bytes)
+    ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
+
+    C Reverse the bytes within each 32-bit word: the block was loaded
+    C bytewise and SHA-1 message words are stored big-endian
+    rev32          MSG0.16b,MSG0.16b
+    rev32          MSG1.16b,MSG1.16b
+    rev32          MSG2.16b,MSG2.16b
+    rev32          MSG3.16b,MSG3.16b
+
+    C Each group below performs four rounds:
+    C   add              TMP = four message words + round constant
+    C   sha1h            fixed rotate of ABCD lane 0, giving the E value
+    C                    for the next group
+    C   sha1c/sha1p/sha1m  hash update of ABCD with the current E and TMP
+    C   sha1su1, sha1su0 message schedule updates for later groups
+    C E0 and E1 swap roles from one group to the next.
+
+    C Rounds 0-3
+    add            TMP.4s,MSG0.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 4-7
+    add            TMP.4s,MSG1.4s,CONST0.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 8-11
+    add            TMP.4s,MSG2.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 12-15
+    add            TMP.4s,MSG3.4s,CONST0.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 16-19
+    add            TMP.4s,MSG0.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 20-23
+    add            TMP.4s,MSG1.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 24-27
+    add            TMP.4s,MSG2.4s,CONST1.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 28-31
+    add            TMP.4s,MSG3.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 32-35
+    add            TMP.4s,MSG0.4s,CONST1.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 36-39
+    add            TMP.4s,MSG1.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 40-43
+    add            TMP.4s,MSG2.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 44-47
+    add            TMP.4s,MSG3.4s,CONST2.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 48-51
+    add            TMP.4s,MSG0.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 52-55
+    add            TMP.4s,MSG1.4s,CONST2.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 56-59
+    add            TMP.4s,MSG2.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 60-63
+    add            TMP.4s,MSG3.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 64-67
+    C MSG3 gets its final schedule update here; it is consumed by rounds
+    C 76-79.  No further update of MSG0 is started: MSG0 is not read
+    C again after the add below, so the result would never be used.
+    add            TMP.4s,MSG0.4s,CONST3.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+
+    C Rounds 68-71
+    add            TMP.4s,MSG1.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+
+    C Rounds 72-75
+    add            TMP.4s,MSG2.4s,CONST3.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+
+    C Rounds 76-79
+    add            TMP.4s,MSG3.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+
+    C Combine state: add the values saved at entry.  The last sha1h
+    C wrote E0, so E0 holds the final fifth word.
+    add            E0.4s,E0.4s,E0_SAVED.4s
+    add            ABCD.4s,ABCD.4s,ABCD_SAVED.4s
+
+    C Store state: words 0-3 from ABCD, word 4 from lane 0 of E0
+    st1            {ABCD.4s},[STATE]
+    st1            {E0.s}[0],[x2]
+
+    ret
+EPILOGUE(nettle_sha1_compress)
diff --git a/arm64/machine.m4 b/arm64/machine.m4
index e69de29b..7df62bcc 100644
--- a/arm64/machine.m4
+++ b/arm64/machine.m4
@@ -0,0 +1,7 @@
+C SFP(VR)
+C Scalar 32-bit name of a vector register: replaces the first character with s
+define(`SFP',``s'substr($1,1)')
+
+C QFP(VR)
+C Full 128-bit name of a vector register: replaces the first character with q
+define(`QFP',``q'substr($1,1)')
author	Maamoun TK <maamoun.tk@googlemail.com>	2021-05-14 08:45:33 +0300
committer	Niels Möller <nisse@lysator.liu.se>	2021-06-01 18:20:52 +0200
commit	47cafcf29951b7e9c5c1d1c4f34d29c6b2bf84c6 (patch)
tree	b64fa76f44dd016326d235b032d56a2aa4606dd7 /arm64
parent	a46a17e9f57c64984d5246aa3475e45f8c562ec7 (diff)
download	nettle-47cafcf29951b7e9c5c1d1c4f34d29c6b2bf84c6.tar.gz