author     aoeu <aoeuh@yandex.ru>  2021-03-10 10:42:33 +0000
committer  aoeu <aoeuh@yandex.ru>  2021-03-10 10:42:33 +0000
commit     e8041519cd159440a6b35ef94c8d03ec4fecc429 (patch)
tree       03a8a9ef5909816eb7157cf2938c41dcab48896b
parent     ac48e507e3dabc5701706a2a4f9a59430a4137b7 (diff)
Bug 1613235 - Add POWER ChaCha20 stream cipher vector acceleration. r=bbeurdouche
Differential Revision: https://phabricator.services.mozilla.com/D107220
-rw-r--r--   lib/freebl/Makefile                  3
-rw-r--r--   lib/freebl/chacha20-ppc64le.S      546
-rw-r--r--   lib/freebl/chacha20poly1305-ppc.c  588
-rw-r--r--   lib/freebl/chacha20poly1305.c       47
-rw-r--r--   lib/freebl/freebl.gyp                9
-rw-r--r--   lib/freebl/freebl_base.gypi          6
6 files changed, 1199 insertions(+), 0 deletions(-)
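
For orientation: the patch adds a single assembly entry point, chacha20vsx, which the C code in this patch calls both for bulk ChaCha20 and for deriving the Poly1305 key in the AEAD paths. A minimal sketch of how a caller would use it, based only on the forward declaration added below (the wrapper example_encrypt is illustrative, not part of the patch):

    #include <stdint.h>

    /* Entry point implemented in chacha20-ppc64le.S (declaration as added
     * in this patch). */
    void chacha20vsx(uint32_t len, uint8_t *output, uint8_t *block, uint8_t *k,
                     uint8_t *nonce, uint32_t ctr);

    /* Illustrative wrapper: encrypt len bytes of plain into cipher with a
     * 32-byte key and 12-byte nonce, starting the block counter at 1 as the
     * AEAD code below does for the payload. */
    static void example_encrypt(uint8_t *cipher, uint8_t *plain, uint32_t len,
                                uint8_t key[32], uint8_t nonce[12])
    {
        chacha20vsx(len, cipher, plain, key, nonce, 1U);
    }
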
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index 269e34c5c..10654360c 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -298,6 +298,8 @@ ifdef USE_64
PPC_ABI := $(shell $(CC) -dM -E - < /dev/null | awk '$$2 == "_CALL_ELF" {print $$3}')
ifeq ($(PPC_ABI),2)
ASFILES += sha512-p8.s
+ EXTRA_SRCS += chacha20poly1305-ppc.c
+ ASFILES += chacha20-ppc64le.s
endif
endif # USE_64
endif # ppc
@@ -762,6 +764,7 @@ $(OBJDIR)/$(PROG_PREFIX)gcm$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx
$(OBJDIR)/$(PROG_PREFIX)rijndael$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx
$(OBJDIR)/$(PROG_PREFIX)sha512$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx \
-funroll-loops -fpeel-loops
+$(OBJDIR)/$(PROG_PREFIX)chacha20poly1305-ppc$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx
endif
endif
diff --git a/lib/freebl/chacha20-ppc64le.S b/lib/freebl/chacha20-ppc64le.S
new file mode 100644
index 000000000..241bef41f
--- /dev/null
+++ b/lib/freebl/chacha20-ppc64le.S
@@ -0,0 +1,546 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+# vs0 - vs15 : buffer for xor
+# vs32 - vs47 (v0 - v15) : 4 "converted" states
+# vs48 - vs51 (v16 - v19) : original state
+# vs52 - vs55 (v20 - v23) : "converted" constants
+# vs56 (v24) : "converted" counter
+# vs57 (v25) : increment for "converted" counter
+# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor
+
+#include <ppc-asm.h>
+
+.equ rSIZE, r3
+.equ rDST, r4
+.equ rSRC, r5
+.equ rKEY, r6
+.equ rNONCE, r7
+.equ rCNTR, r8
+
+.abiversion 2
+.section ".data"
+.align 5
+lblock: .skip 256
+cnts0: .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+cnts1: .long 0x61707865, 0x61707865, 0x61707865, 0x61707865
+cnts2: .long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
+cnts3: .long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
+cnts4: .long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
+st4: .long 0, 0, 0, 0
+cntr: .long 0, 0, 0, 0
+incr: .long 4, 4, 4, 4
+rotl1: .long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD
+rotl2: .long 12, 12, 12, 12
+rotl3: .long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC
+rotl4: .long 7, 7, 7, 7
+
+.section ".text"
+.align 5
+.globl chacha20vsx
+.type chacha20vsx, @function
+chacha20vsx:
+ # prologue
+ addis 2, r12, .TOC.-chacha20vsx@ha
+ addi 2, 2, .TOC.-chacha20vsx@l
+ .localentry chacha20vsx, .-chacha20vsx
+ std r14, -8(sp)
+ std r15, -16(sp)
+ std r16, -24(sp)
+ std r17, -32(sp)
+ std r18, -40(sp)
+ std r19, -48(sp)
+ std r20, -56(sp)
+ std r21, -64(sp)
+ std r22, -72(sp)
+ std r23, -80(sp)
+ std r24, -88(sp)
+ std r25, -96(sp)
+ std r26, -104(sp)
+ std r27, -112(sp)
+ std r28, -120(sp)
+ std r29, -128(sp)
+ std r30, -136(sp)
+ std r31, -144(sp)
+
+ addi r14, sp, -160
+
+ li r16, -16
+ li r17, -32
+ li r18, -48
+ li r19, -64
+ li r20, -80
+ li r21, -96
+ li r22, -112
+ li r23, -128
+ li r24, -144
+ li r25, -160
+ li r26, -176
+ li r27, -192
+ li r28, -208
+
+ # save f14, f15
+ stxvw4x vs14, 0, r14
+ stxvw4x vs15, r16, r14
+
+ # save v20 - v31
+ stxvw4x vs52, r17, r14
+ stxvw4x vs53, r18, r14
+ stxvw4x vs54, r19, r14
+ stxvw4x vs55, r20, r14
+ stxvw4x vs56, r21, r14
+ stxvw4x vs57, r22, r14
+ stxvw4x vs58, r23, r14
+ stxvw4x vs59, r24, r14
+ stxvw4x vs60, r25, r14
+ stxvw4x vs61, r26, r14
+ stxvw4x vs62, r27, r14
+ stxvw4x vs63, r28, r14
+
+ # offset in src/dst
+ li r17, 16
+ li r18, 32
+ li r19, 48
+ li r20, 64
+ li r21, 80
+ li r22, 96
+ li r23, 112
+ li r24, 128
+ li r25, 144
+ li r26, 160
+ li r27, 176
+ li r28, 192
+ li r29, 208
+ li r30, 224
+ li r31, 240
+
+ # load the address of the constants
+ addis r14, 2, cnts0@toc@ha
+ addi r14, r14, cnts0@toc@l
+
+ # save nonce to st4
+ lwz r15, 0(rNONCE)
+ stw r15, 84(r14)
+ lwz r15, 4(rNONCE)
+ stw r15, 88(r14)
+ lwz r15, 8(rNONCE)
+ stw r15, 92(r14)
+
+ # load state to vectors
+ lxvw4x vs48, 0, r14
+ lxvw4x vs49, 0, rKEY
+ lxvw4x vs50, r17, rKEY
+ lxvw4x vs51, r21, r14
+
+ # load consts for x4 rounds
+ lxvw4x vs52, r17, r14
+ lxvw4x vs53, r18, r14
+ lxvw4x vs54, r19, r14
+ lxvw4x vs55, r20, r14
+
+ # counter
+ stw rCNTR, 96(r14)
+ addi rCNTR, rCNTR, 1
+ stw rCNTR, 100(r14)
+ addi rCNTR, rCNTR, 1
+ stw rCNTR, 104(r14)
+ addi rCNTR, rCNTR, 1
+ stw rCNTR, 108(r14)
+ lxvw4x vs56, r22, r14
+
+ # load increment
+ lxvw4x vs57, r23, r14
+
+ # load rotl to vectors
+ lxvw4x vs60, r24, r14
+ lxvw4x vs61, r25, r14
+ lxvw4x vs62, r26, r14
+ lxvw4x vs63, r27, r14
+
+ # counter for loop = size/256
+ li r15, 256
+ divdu. r16, rSIZE, r15
+ beq lastblock
+ mtctr r16
+
+mainloop:
+ # init 16 vectors (4 states x4)
+ vor v0, v20, v20
+ vor v1, v21, v21
+ vor v2, v22, v22
+ vor v3, v23, v23
+ vspltw v4, v17, v0
+ vspltw v5, v17, v1
+ vspltw v6, v17, v2
+ vspltw v7, v17, v3
+ vspltw v8, v18, v0
+ vspltw v9, v18, v1
+ vspltw v10, v18, v2
+ vspltw v11, v18, v3
+ vor v12, v24, v24
+ vspltw v13, v19, v1
+ vspltw v14, v19, v2
+ vspltw v15, v19, v3
+
+.macro _plus a b_y b_x
+ vadduwm \a, \a, \b_y*4+(\b_x)%4
+ vadduwm \a+1, \a+1, \b_y*4+(\b_x+1)%4
+ vadduwm \a+2, \a+2, \b_y*4+(\b_x+2)%4
+ vadduwm \a+3, \a+3, \b_y*4+(\b_x+3)%4
+.endm
+
+.macro _xor a b_y b_x
+ vxor \a, \a, \b_y*4+(\b_x)%4
+ vxor \a+1, \a+1, \b_y*4+(\b_x+1)%4
+ vxor \a+2, \a+2, \b_y*4+(\b_x+2)%4
+ vxor \a+3, \a+3, \b_y*4+(\b_x+3)%4
+.endm
+
+.macro _rotl a b
+ vrlw \a, \a, \b
+ vrlw \a+1, \a+1, \b
+ vrlw \a+2, \a+2, \b
+ vrlw \a+3, \a+3, \b
+.endm
+
+.macro _pxor a b_y b_x c
+ vpermxor \a, \a, \b_y*4+(\b_x)%4, \c
+ vpermxor \a+1, \a+1, \b_y*4+(\b_x+1)%4, \c
+ vpermxor \a+2, \a+2, \b_y*4+(\b_x+2)%4, \c
+ vpermxor \a+3, \a+3, \b_y*4+(\b_x+3)%4, \c
+.endm
+
+# 00 01 02 03
+# 04 05 06 07
+# 08 09 10 11
+# 12 13 14 15
+.macro doubleround
+ # column round
+ _plus v0, v1, v0 # a+=b
+ _pxor v12, v0, v0, v28 # d^=a; d<<<=16
+ _plus v8, v3, v0 # c+=d
+ _xor v4, v2, v0 # b^=c
+ _rotl v4, v29 # b<<<=12
+ _plus v0, v1, v0 # a+=b
+ _pxor v12, v0, v0, v30 # d^=a; d<<<=8
+ _plus v8, v3, v0 # c+=d
+ _xor v4, v2, v0 # b^=c
+ _rotl v4, v31 # b<<<=7
+
+ # diagonal round
+ _plus v0, v1, v1 # a+=b
+ _pxor v12, v0, v1, v28 # d^=a; d<<<=16
+ _plus v8, v3, v1 # c+=d
+ _xor v4, v2, v1 # b^=c
+ _rotl v4, v29 # b<<<=12
+ _plus v0, v1, v1 # a+=b
+ _pxor v12, v0, v1, v30 # d^=a; d<<<=8
+ _plus v8, v3, v1 # c+=d
+ _xor v4, v2, v1 # b^=c
+ _rotl v4, v31 # b<<<=7
+.endm
+
+ doubleround # 1
+ doubleround # 2
+ doubleround # 3
+ doubleround # 4
+ doubleround # 5
+ doubleround # 6
+ doubleround # 7
+ doubleround # 8
+ doubleround # 9
+ doubleround # 10
+
+ # counter += original counter
+ vadduwm v12, v12, v24
+
+.macro convert a
+ vmrgew 26, 0+\a, 1+\a
+ vmrgew 27, 2+\a, 3+\a
+ vmrgow 0+\a, 0+\a, 1+\a
+ vmrgow 2+\a, 2+\a, 3+\a
+ xxmrghd 33+\a, 32+\a, 34+\a
+ xxmrgld 35+\a, 32+\a, 34+\a
+ xxmrghd 32+\a, 58, 59
+ xxmrgld 34+\a, 58, 59
+.endm
+
+ convert 0
+ convert 4
+ convert 8
+ convert 12
+
+.macro addition a
+ vadduwm 0+\a, 0+\a, 16
+ vadduwm 4+\a, 4+\a, 17
+ vadduwm 8+\a, 8+\a, 18
+ vadduwm 12+\a, 12+\a, 19
+.endm
+
+ addition 0
+ addition 1
+ addition 2
+ addition 3
+
+ # load text/cipher
+ lxvw4x vs0, 0, rSRC
+ lxvw4x vs1, r17, rSRC
+ lxvw4x vs2, r18, rSRC
+ lxvw4x vs3, r19, rSRC
+ lxvw4x vs4, r20, rSRC
+ lxvw4x vs5, r21, rSRC
+ lxvw4x vs6, r22, rSRC
+ lxvw4x vs7, r23, rSRC
+ lxvw4x vs8, r24, rSRC
+ lxvw4x vs9, r25, rSRC
+ lxvw4x vs10, r26, rSRC
+ lxvw4x vs11, r27, rSRC
+ lxvw4x vs12, r28, rSRC
+ lxvw4x vs13, r29, rSRC
+ lxvw4x vs14, r30, rSRC
+ lxvw4x vs15, r31, rSRC
+ # xor (encrypt/decrypt)
+ xxlxor vs0, vs0, vs32
+ xxlxor vs1, vs1, vs36
+ xxlxor vs2, vs2, vs40
+ xxlxor vs3, vs3, vs44
+ xxlxor vs4, vs4, vs33
+ xxlxor vs5, vs5, vs37
+ xxlxor vs6, vs6, vs41
+ xxlxor vs7, vs7, vs45
+ xxlxor vs8, vs8, vs34
+ xxlxor vs9, vs9, vs38
+ xxlxor vs10, vs10, vs42
+ xxlxor vs11, vs11, vs46
+ xxlxor vs12, vs12, vs35
+ xxlxor vs13, vs13, vs39
+ xxlxor vs14, vs14, vs43
+ xxlxor vs15, vs15, vs47
+ # store cipher/text
+ stxvw4x vs0, 0, rDST
+ stxvw4x vs1, r17, rDST
+ stxvw4x vs2, r18, rDST
+ stxvw4x vs3, r19, rDST
+ stxvw4x vs4, r20, rDST
+ stxvw4x vs5, r21, rDST
+ stxvw4x vs6, r22, rDST
+ stxvw4x vs7, r23, rDST
+ stxvw4x vs8, r24, rDST
+ stxvw4x vs9, r25, rDST
+ stxvw4x vs10, r26, rDST
+ stxvw4x vs11, r27, rDST
+ stxvw4x vs12, r28, rDST
+ stxvw4x vs13, r29, rDST
+ stxvw4x vs14, r30, rDST
+ stxvw4x vs15, r31, rDST
+
+ # src/dst increment
+ addi rSRC, rSRC, 256
+ addi rDST, rDST, 256
+
+ # counter increment
+ vadduwm v24, v24, v25
+
+ bdnz mainloop
+
+lastblock:
+ # remainder
+ mulld r16, r16, r15
+ subf. r16, r16, rSIZE
+
+ # check remainder
+ beq exitsub
+
+ addi r14, r14, -256
+ # last block x4
+ # init 16 vectors (4 states x4)
+ vor v0, v20, v20
+ vor v1, v21, v21
+ vor v2, v22, v22
+ vor v3, v23, v23
+ vspltw v4, v17, v0
+ vspltw v5, v17, v1
+ vspltw v6, v17, v2
+ vspltw v7, v17, v3
+ vspltw v8, v18, v0
+ vspltw v9, v18, v1
+ vspltw v10, v18, v2
+ vspltw v11, v18, v3
+ vor v12, v24, v24
+ vspltw v13, v19, v1
+ vspltw v14, v19, v2
+ vspltw v15, v19, v3
+
+ doubleround # 1
+ doubleround # 2
+ doubleround # 3
+ doubleround # 4
+ doubleround # 5
+ doubleround # 6
+ doubleround # 7
+ doubleround # 8
+ doubleround # 9
+ doubleround # 10
+
+ vadduwm v12, v12, v24
+
+ convert 0
+ convert 4
+ convert 8
+ convert 12
+
+ addition 0
+ addition 1
+ addition 2
+ addition 3
+
+ # store vectors
+ stxvw4x vs32, 0, r14
+ stxvw4x vs36, r17, r14
+ stxvw4x vs40, r18, r14
+ stxvw4x vs44, r19, r14
+ stxvw4x vs33, r20, r14
+ stxvw4x vs37, r21, r14
+ stxvw4x vs41, r22, r14
+ stxvw4x vs45, r23, r14
+ stxvw4x vs34, r24, r14
+ stxvw4x vs38, r25, r14
+ stxvw4x vs42, r26, r14
+ stxvw4x vs46, r27, r14
+ stxvw4x vs35, r28, r14
+ stxvw4x vs39, r29, r14
+ stxvw4x vs43, r30, r14
+ stxvw4x vs47, r31, r14
+
+ mtctr r16
+ addi rSIZE, r14, -1
+ addi rSRC, rSRC, -1
+ addi rDST, rDST, -1
+xorlast:
+ lbzu r15, 1(rSIZE)
+ lbzu r16, 1(rSRC)
+ xor r15, r15, r16
+ stbu r15, 1(rDST)
+ bdnz xorlast
+
+ # zeroing last block
+ xxlxor vs0, vs0, vs0
+ stxvw4x vs0, 0, r14
+ stxvw4x vs0, r17, r14
+ stxvw4x vs0, r18, r14
+ stxvw4x vs0, r19, r14
+ stxvw4x vs0, r20, r14
+ stxvw4x vs0, r21, r14
+ stxvw4x vs0, r22, r14
+ stxvw4x vs0, r23, r14
+ stxvw4x vs0, r24, r14
+ stxvw4x vs0, r25, r14
+ stxvw4x vs0, r26, r14
+ stxvw4x vs0, r27, r14
+ stxvw4x vs0, r28, r14
+ stxvw4x vs0, r29, r14
+ stxvw4x vs0, r30, r14
+ stxvw4x vs0, r31, r14
+
+exitsub:
+ # zeroing volatile registers
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+ xxlxor vs8, vs8, vs8
+ xxlxor vs9, vs9, vs9
+ xxlxor vs10, vs10, vs10
+ xxlxor vs11, vs11, vs11
+ xxlxor vs12, vs12, vs12
+ xxlxor vs13, vs13, vs13
+
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+
+ li rSIZE, 0
+ li rDST, 0
+ li rSRC, 0
+ li rKEY, 0
+ li rNONCE, 0
+ li rCNTR, 0
+
+ # epilogue
+ addi r14, sp, -160
+
+ li r16, -16
+ li r17, -32
+ li r18, -48
+ li r19, -64
+ li r20, -80
+ li r21, -96
+ li r22, -112
+ li r23, -128
+ li r24, -144
+ li r25, -160
+ li r26, -176
+ li r27, -192
+ li r28, -208
+
+ # restore f14, f15
+ lxvw4x vs14, 0, r14
+ lxvw4x vs15, r16, r14
+
+ # restore v20 - v31
+ lxvw4x vs52, r17, r14
+ lxvw4x vs53, r18, r14
+ lxvw4x vs54, r19, r14
+ lxvw4x vs55, r20, r14
+ lxvw4x vs56, r21, r14
+ lxvw4x vs57, r22, r14
+ lxvw4x vs58, r23, r14
+ lxvw4x vs59, r24, r14
+ lxvw4x vs60, r25, r14
+ lxvw4x vs61, r26, r14
+ lxvw4x vs62, r27, r14
+ lxvw4x vs63, r28, r14
+
+ ld r14, -8(sp)
+ ld r15, -16(sp)
+ ld r16, -24(sp)
+ ld r17, -32(sp)
+ ld r18, -40(sp)
+ ld r19, -48(sp)
+ ld r20, -56(sp)
+ ld r21, -64(sp)
+ ld r22, -72(sp)
+ ld r23, -80(sp)
+ ld r24, -88(sp)
+ ld r25, -96(sp)
+ ld r26, -104(sp)
+ ld r27, -112(sp)
+ ld r28, -120(sp)
+ ld r29, -128(sp)
+ ld r30, -136(sp)
+ ld r31, -144(sp)
+
+ blr
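
The doubleround macro above is the standard ChaCha20 double round (RFC 8439) applied to four interleaved states: the _plus/_pxor/_xor/_rotl macros perform a += b, d ^= a with a rotate by 16 or 8, c += d, and b ^= c with a rotate by 12 or 7 across four vectors at once, first down the columns and then down the diagonals. For reference, a scalar sketch of the quarter round that the vector code parallelizes (not part of this patch):

    #include <stdint.h>

    /* One scalar ChaCha20 quarter round (RFC 8439); the vpermxor/vrlw macros
     * in the assembly perform the same adds, xors, and rotations on four
     * states in parallel. */
    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    static inline void quarter_round(uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }
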
diff --git a/lib/freebl/chacha20poly1305-ppc.c b/lib/freebl/chacha20poly1305-ppc.c
new file mode 100644
index 000000000..55101ceb2
--- /dev/null
+++ b/lib/freebl/chacha20poly1305-ppc.c
@@ -0,0 +1,588 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "Hacl_Chacha20Poly1305_32.h"
+
+/* Forward declaration from chacha20-ppc64le.S */
+void chacha20vsx(uint32_t len, uint8_t *output, uint8_t *block, uint8_t *k,
+ uint8_t *nonce, uint32_t ctr);
+
+static inline void
+poly1305_padded_32(uint64_t *ctx, uint32_t len, uint8_t *text)
+{
+ uint32_t n = len / (uint32_t)16U;
+ uint32_t r = len % (uint32_t)16U;
+ uint8_t *blocks = text;
+ uint8_t *rem = text + n * (uint32_t)16U;
+ uint64_t *pre0 = ctx + (uint32_t)5U;
+ uint64_t *acc0 = ctx;
+ uint32_t nb = n * (uint32_t)16U / (uint32_t)16U;
+ uint32_t rem1 = n * (uint32_t)16U % (uint32_t)16U;
+ for (uint32_t i = (uint32_t)0U; i < nb; i++) {
+ uint8_t *block = blocks + i * (uint32_t)16U;
+ uint64_t e[5U] = { 0U };
+ uint64_t u0 = load64_le(block);
+ uint64_t lo = u0;
+ uint64_t u = load64_le(block + (uint32_t)8U);
+ uint64_t hi = u;
+ uint64_t f0 = lo;
+ uint64_t f1 = hi;
+ uint64_t f010 = f0 & (uint64_t)0x3ffffffU;
+ uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU;
+ uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U;
+ uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU;
+ uint64_t f40 = f1 >> (uint32_t)40U;
+ uint64_t f01 = f010;
+ uint64_t f111 = f110;
+ uint64_t f2 = f20;
+ uint64_t f3 = f30;
+ uint64_t f41 = f40;
+ e[0U] = f01;
+ e[1U] = f111;
+ e[2U] = f2;
+ e[3U] = f3;
+ e[4U] = f41;
+ uint64_t b = (uint64_t)0x1000000U;
+ uint64_t mask = b;
+ uint64_t f4 = e[4U];
+ e[4U] = f4 | mask;
+ uint64_t *r1 = pre0;
+ uint64_t *r5 = pre0 + (uint32_t)5U;
+ uint64_t r0 = r1[0U];
+ uint64_t r11 = r1[1U];
+ uint64_t r2 = r1[2U];
+ uint64_t r3 = r1[3U];
+ uint64_t r4 = r1[4U];
+ uint64_t r51 = r5[1U];
+ uint64_t r52 = r5[2U];
+ uint64_t r53 = r5[3U];
+ uint64_t r54 = r5[4U];
+ uint64_t f10 = e[0U];
+ uint64_t f11 = e[1U];
+ uint64_t f12 = e[2U];
+ uint64_t f13 = e[3U];
+ uint64_t f14 = e[4U];
+ uint64_t a0 = acc0[0U];
+ uint64_t a1 = acc0[1U];
+ uint64_t a2 = acc0[2U];
+ uint64_t a3 = acc0[3U];
+ uint64_t a4 = acc0[4U];
+ uint64_t a01 = a0 + f10;
+ uint64_t a11 = a1 + f11;
+ uint64_t a21 = a2 + f12;
+ uint64_t a31 = a3 + f13;
+ uint64_t a41 = a4 + f14;
+ uint64_t a02 = r0 * a01;
+ uint64_t a12 = r11 * a01;
+ uint64_t a22 = r2 * a01;
+ uint64_t a32 = r3 * a01;
+ uint64_t a42 = r4 * a01;
+ uint64_t a03 = a02 + r54 * a11;
+ uint64_t a13 = a12 + r0 * a11;
+ uint64_t a23 = a22 + r11 * a11;
+ uint64_t a33 = a32 + r2 * a11;
+ uint64_t a43 = a42 + r3 * a11;
+ uint64_t a04 = a03 + r53 * a21;
+ uint64_t a14 = a13 + r54 * a21;
+ uint64_t a24 = a23 + r0 * a21;
+ uint64_t a34 = a33 + r11 * a21;
+ uint64_t a44 = a43 + r2 * a21;
+ uint64_t a05 = a04 + r52 * a31;
+ uint64_t a15 = a14 + r53 * a31;
+ uint64_t a25 = a24 + r54 * a31;
+ uint64_t a35 = a34 + r0 * a31;
+ uint64_t a45 = a44 + r11 * a31;
+ uint64_t a06 = a05 + r51 * a41;
+ uint64_t a16 = a15 + r52 * a41;
+ uint64_t a26 = a25 + r53 * a41;
+ uint64_t a36 = a35 + r54 * a41;
+ uint64_t a46 = a45 + r0 * a41;
+ uint64_t t0 = a06;
+ uint64_t t1 = a16;
+ uint64_t t2 = a26;
+ uint64_t t3 = a36;
+ uint64_t t4 = a46;
+ uint64_t mask26 = (uint64_t)0x3ffffffU;
+ uint64_t z0 = t0 >> (uint32_t)26U;
+ uint64_t z1 = t3 >> (uint32_t)26U;
+ uint64_t x0 = t0 & mask26;
+ uint64_t x3 = t3 & mask26;
+ uint64_t x1 = t1 + z0;
+ uint64_t x4 = t4 + z1;
+ uint64_t z01 = x1 >> (uint32_t)26U;
+ uint64_t z11 = x4 >> (uint32_t)26U;
+ uint64_t t = z11 << (uint32_t)2U;
+ uint64_t z12 = z11 + t;
+ uint64_t x11 = x1 & mask26;
+ uint64_t x41 = x4 & mask26;
+ uint64_t x2 = t2 + z01;
+ uint64_t x01 = x0 + z12;
+ uint64_t z02 = x2 >> (uint32_t)26U;
+ uint64_t z13 = x01 >> (uint32_t)26U;
+ uint64_t x21 = x2 & mask26;
+ uint64_t x02 = x01 & mask26;
+ uint64_t x31 = x3 + z02;
+ uint64_t x12 = x11 + z13;
+ uint64_t z03 = x31 >> (uint32_t)26U;
+ uint64_t x32 = x31 & mask26;
+ uint64_t x42 = x41 + z03;
+ uint64_t o0 = x02;
+ uint64_t o1 = x12;
+ uint64_t o2 = x21;
+ uint64_t o3 = x32;
+ uint64_t o4 = x42;
+ acc0[0U] = o0;
+ acc0[1U] = o1;
+ acc0[2U] = o2;
+ acc0[3U] = o3;
+ acc0[4U] = o4;
+ }
+ if (rem1 > (uint32_t)0U) {
+ uint8_t *last = blocks + nb * (uint32_t)16U;
+ uint64_t e[5U] = { 0U };
+ uint8_t tmp[16U] = { 0U };
+ memcpy(tmp, last, rem1 * sizeof(last[0U]));
+ uint64_t u0 = load64_le(tmp);
+ uint64_t lo = u0;
+ uint64_t u = load64_le(tmp + (uint32_t)8U);
+ uint64_t hi = u;
+ uint64_t f0 = lo;
+ uint64_t f1 = hi;
+ uint64_t f010 = f0 & (uint64_t)0x3ffffffU;
+ uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU;
+ uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U;
+ uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU;
+ uint64_t f40 = f1 >> (uint32_t)40U;
+ uint64_t f01 = f010;
+ uint64_t f111 = f110;
+ uint64_t f2 = f20;
+ uint64_t f3 = f30;
+ uint64_t f4 = f40;
+ e[0U] = f01;
+ e[1U] = f111;
+ e[2U] = f2;
+ e[3U] = f3;
+ e[4U] = f4;
+ uint64_t b = (uint64_t)1U << rem1 * (uint32_t)8U % (uint32_t)26U;
+ uint64_t mask = b;
+ uint64_t fi = e[rem1 * (uint32_t)8U / (uint32_t)26U];
+ e[rem1 * (uint32_t)8U / (uint32_t)26U] = fi | mask;
+ uint64_t *r1 = pre0;
+ uint64_t *r5 = pre0 + (uint32_t)5U;
+ uint64_t r0 = r1[0U];
+ uint64_t r11 = r1[1U];
+ uint64_t r2 = r1[2U];
+ uint64_t r3 = r1[3U];
+ uint64_t r4 = r1[4U];
+ uint64_t r51 = r5[1U];
+ uint64_t r52 = r5[2U];
+ uint64_t r53 = r5[3U];
+ uint64_t r54 = r5[4U];
+ uint64_t f10 = e[0U];
+ uint64_t f11 = e[1U];
+ uint64_t f12 = e[2U];
+ uint64_t f13 = e[3U];
+ uint64_t f14 = e[4U];
+ uint64_t a0 = acc0[0U];
+ uint64_t a1 = acc0[1U];
+ uint64_t a2 = acc0[2U];
+ uint64_t a3 = acc0[3U];
+ uint64_t a4 = acc0[4U];
+ uint64_t a01 = a0 + f10;
+ uint64_t a11 = a1 + f11;
+ uint64_t a21 = a2 + f12;
+ uint64_t a31 = a3 + f13;
+ uint64_t a41 = a4 + f14;
+ uint64_t a02 = r0 * a01;
+ uint64_t a12 = r11 * a01;
+ uint64_t a22 = r2 * a01;
+ uint64_t a32 = r3 * a01;
+ uint64_t a42 = r4 * a01;
+ uint64_t a03 = a02 + r54 * a11;
+ uint64_t a13 = a12 + r0 * a11;
+ uint64_t a23 = a22 + r11 * a11;
+ uint64_t a33 = a32 + r2 * a11;
+ uint64_t a43 = a42 + r3 * a11;
+ uint64_t a04 = a03 + r53 * a21;
+ uint64_t a14 = a13 + r54 * a21;
+ uint64_t a24 = a23 + r0 * a21;
+ uint64_t a34 = a33 + r11 * a21;
+ uint64_t a44 = a43 + r2 * a21;
+ uint64_t a05 = a04 + r52 * a31;
+ uint64_t a15 = a14 + r53 * a31;
+ uint64_t a25 = a24 + r54 * a31;
+ uint64_t a35 = a34 + r0 * a31;
+ uint64_t a45 = a44 + r11 * a31;
+ uint64_t a06 = a05 + r51 * a41;
+ uint64_t a16 = a15 + r52 * a41;
+ uint64_t a26 = a25 + r53 * a41;
+ uint64_t a36 = a35 + r54 * a41;
+ uint64_t a46 = a45 + r0 * a41;
+ uint64_t t0 = a06;
+ uint64_t t1 = a16;
+ uint64_t t2 = a26;
+ uint64_t t3 = a36;
+ uint64_t t4 = a46;
+ uint64_t mask26 = (uint64_t)0x3ffffffU;
+ uint64_t z0 = t0 >> (uint32_t)26U;
+ uint64_t z1 = t3 >> (uint32_t)26U;
+ uint64_t x0 = t0 & mask26;
+ uint64_t x3 = t3 & mask26;
+ uint64_t x1 = t1 + z0;
+ uint64_t x4 = t4 + z1;
+ uint64_t z01 = x1 >> (uint32_t)26U;
+ uint64_t z11 = x4 >> (uint32_t)26U;
+ uint64_t t = z11 << (uint32_t)2U;
+ uint64_t z12 = z11 + t;
+ uint64_t x11 = x1 & mask26;
+ uint64_t x41 = x4 & mask26;
+ uint64_t x2 = t2 + z01;
+ uint64_t x01 = x0 + z12;
+ uint64_t z02 = x2 >> (uint32_t)26U;
+ uint64_t z13 = x01 >> (uint32_t)26U;
+ uint64_t x21 = x2 & mask26;
+ uint64_t x02 = x01 & mask26;
+ uint64_t x31 = x3 + z02;
+ uint64_t x12 = x11 + z13;
+ uint64_t z03 = x31 >> (uint32_t)26U;
+ uint64_t x32 = x31 & mask26;
+ uint64_t x42 = x41 + z03;
+ uint64_t o0 = x02;
+ uint64_t o1 = x12;
+ uint64_t o2 = x21;
+ uint64_t o3 = x32;
+ uint64_t o4 = x42;
+ acc0[0U] = o0;
+ acc0[1U] = o1;
+ acc0[2U] = o2;
+ acc0[3U] = o3;
+ acc0[4U] = o4;
+ }
+ uint8_t tmp[16U] = { 0U };
+ memcpy(tmp, rem, r * sizeof(rem[0U]));
+ if (r > (uint32_t)0U) {
+ uint64_t *pre = ctx + (uint32_t)5U;
+ uint64_t *acc = ctx;
+ uint64_t e[5U] = { 0U };
+ uint64_t u0 = load64_le(tmp);
+ uint64_t lo = u0;
+ uint64_t u = load64_le(tmp + (uint32_t)8U);
+ uint64_t hi = u;
+ uint64_t f0 = lo;
+ uint64_t f1 = hi;
+ uint64_t f010 = f0 & (uint64_t)0x3ffffffU;
+ uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU;
+ uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U;
+ uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU;
+ uint64_t f40 = f1 >> (uint32_t)40U;
+ uint64_t f01 = f010;
+ uint64_t f111 = f110;
+ uint64_t f2 = f20;
+ uint64_t f3 = f30;
+ uint64_t f41 = f40;
+ e[0U] = f01;
+ e[1U] = f111;
+ e[2U] = f2;
+ e[3U] = f3;
+ e[4U] = f41;
+ uint64_t b = (uint64_t)0x1000000U;
+ uint64_t mask = b;
+ uint64_t f4 = e[4U];
+ e[4U] = f4 | mask;
+ uint64_t *r1 = pre;
+ uint64_t *r5 = pre + (uint32_t)5U;
+ uint64_t r0 = r1[0U];
+ uint64_t r11 = r1[1U];
+ uint64_t r2 = r1[2U];
+ uint64_t r3 = r1[3U];
+ uint64_t r4 = r1[4U];
+ uint64_t r51 = r5[1U];
+ uint64_t r52 = r5[2U];
+ uint64_t r53 = r5[3U];
+ uint64_t r54 = r5[4U];
+ uint64_t f10 = e[0U];
+ uint64_t f11 = e[1U];
+ uint64_t f12 = e[2U];
+ uint64_t f13 = e[3U];
+ uint64_t f14 = e[4U];
+ uint64_t a0 = acc[0U];
+ uint64_t a1 = acc[1U];
+ uint64_t a2 = acc[2U];
+ uint64_t a3 = acc[3U];
+ uint64_t a4 = acc[4U];
+ uint64_t a01 = a0 + f10;
+ uint64_t a11 = a1 + f11;
+ uint64_t a21 = a2 + f12;
+ uint64_t a31 = a3 + f13;
+ uint64_t a41 = a4 + f14;
+ uint64_t a02 = r0 * a01;
+ uint64_t a12 = r11 * a01;
+ uint64_t a22 = r2 * a01;
+ uint64_t a32 = r3 * a01;
+ uint64_t a42 = r4 * a01;
+ uint64_t a03 = a02 + r54 * a11;
+ uint64_t a13 = a12 + r0 * a11;
+ uint64_t a23 = a22 + r11 * a11;
+ uint64_t a33 = a32 + r2 * a11;
+ uint64_t a43 = a42 + r3 * a11;
+ uint64_t a04 = a03 + r53 * a21;
+ uint64_t a14 = a13 + r54 * a21;
+ uint64_t a24 = a23 + r0 * a21;
+ uint64_t a34 = a33 + r11 * a21;
+ uint64_t a44 = a43 + r2 * a21;
+ uint64_t a05 = a04 + r52 * a31;
+ uint64_t a15 = a14 + r53 * a31;
+ uint64_t a25 = a24 + r54 * a31;
+ uint64_t a35 = a34 + r0 * a31;
+ uint64_t a45 = a44 + r11 * a31;
+ uint64_t a06 = a05 + r51 * a41;
+ uint64_t a16 = a15 + r52 * a41;
+ uint64_t a26 = a25 + r53 * a41;
+ uint64_t a36 = a35 + r54 * a41;
+ uint64_t a46 = a45 + r0 * a41;
+ uint64_t t0 = a06;
+ uint64_t t1 = a16;
+ uint64_t t2 = a26;
+ uint64_t t3 = a36;
+ uint64_t t4 = a46;
+ uint64_t mask26 = (uint64_t)0x3ffffffU;
+ uint64_t z0 = t0 >> (uint32_t)26U;
+ uint64_t z1 = t3 >> (uint32_t)26U;
+ uint64_t x0 = t0 & mask26;
+ uint64_t x3 = t3 & mask26;
+ uint64_t x1 = t1 + z0;
+ uint64_t x4 = t4 + z1;
+ uint64_t z01 = x1 >> (uint32_t)26U;
+ uint64_t z11 = x4 >> (uint32_t)26U;
+ uint64_t t = z11 << (uint32_t)2U;
+ uint64_t z12 = z11 + t;
+ uint64_t x11 = x1 & mask26;
+ uint64_t x41 = x4 & mask26;
+ uint64_t x2 = t2 + z01;
+ uint64_t x01 = x0 + z12;
+ uint64_t z02 = x2 >> (uint32_t)26U;
+ uint64_t z13 = x01 >> (uint32_t)26U;
+ uint64_t x21 = x2 & mask26;
+ uint64_t x02 = x01 & mask26;
+ uint64_t x31 = x3 + z02;
+ uint64_t x12 = x11 + z13;
+ uint64_t z03 = x31 >> (uint32_t)26U;
+ uint64_t x32 = x31 & mask26;
+ uint64_t x42 = x41 + z03;
+ uint64_t o0 = x02;
+ uint64_t o1 = x12;
+ uint64_t o2 = x21;
+ uint64_t o3 = x32;
+ uint64_t o4 = x42;
+ acc[0U] = o0;
+ acc[1U] = o1;
+ acc[2U] = o2;
+ acc[3U] = o3;
+ acc[4U] = o4;
+ return;
+ }
+}
+
+static inline void
+poly1305_do_32(
+ uint8_t *k,
+ uint32_t aadlen,
+ uint8_t *aad,
+ uint32_t mlen,
+ uint8_t *m,
+ uint8_t *out)
+{
+ uint64_t ctx[25U] = { 0U };
+ uint8_t block[16U] = { 0U };
+ Hacl_Poly1305_32_poly1305_init(ctx, k);
+ poly1305_padded_32(ctx, aadlen, aad);
+ poly1305_padded_32(ctx, mlen, m);
+ store64_le(block, (uint64_t)aadlen);
+ store64_le(block + (uint32_t)8U, (uint64_t)mlen);
+ uint64_t *pre = ctx + (uint32_t)5U;
+ uint64_t *acc = ctx;
+ uint64_t e[5U] = { 0U };
+ uint64_t u0 = load64_le(block);
+ uint64_t lo = u0;
+ uint64_t u = load64_le(block + (uint32_t)8U);
+ uint64_t hi = u;
+ uint64_t f0 = lo;
+ uint64_t f1 = hi;
+ uint64_t f010 = f0 & (uint64_t)0x3ffffffU;
+ uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU;
+ uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U;
+ uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU;
+ uint64_t f40 = f1 >> (uint32_t)40U;
+ uint64_t f01 = f010;
+ uint64_t f111 = f110;
+ uint64_t f2 = f20;
+ uint64_t f3 = f30;
+ uint64_t f41 = f40;
+ e[0U] = f01;
+ e[1U] = f111;
+ e[2U] = f2;
+ e[3U] = f3;
+ e[4U] = f41;
+ uint64_t b = (uint64_t)0x1000000U;
+ uint64_t mask = b;
+ uint64_t f4 = e[4U];
+ e[4U] = f4 | mask;
+ uint64_t *r = pre;
+ uint64_t *r5 = pre + (uint32_t)5U;
+ uint64_t r0 = r[0U];
+ uint64_t r1 = r[1U];
+ uint64_t r2 = r[2U];
+ uint64_t r3 = r[3U];
+ uint64_t r4 = r[4U];
+ uint64_t r51 = r5[1U];
+ uint64_t r52 = r5[2U];
+ uint64_t r53 = r5[3U];
+ uint64_t r54 = r5[4U];
+ uint64_t f10 = e[0U];
+ uint64_t f11 = e[1U];
+ uint64_t f12 = e[2U];
+ uint64_t f13 = e[3U];
+ uint64_t f14 = e[4U];
+ uint64_t a0 = acc[0U];
+ uint64_t a1 = acc[1U];
+ uint64_t a2 = acc[2U];
+ uint64_t a3 = acc[3U];
+ uint64_t a4 = acc[4U];
+ uint64_t a01 = a0 + f10;
+ uint64_t a11 = a1 + f11;
+ uint64_t a21 = a2 + f12;
+ uint64_t a31 = a3 + f13;
+ uint64_t a41 = a4 + f14;
+ uint64_t a02 = r0 * a01;
+ uint64_t a12 = r1 * a01;
+ uint64_t a22 = r2 * a01;
+ uint64_t a32 = r3 * a01;
+ uint64_t a42 = r4 * a01;
+ uint64_t a03 = a02 + r54 * a11;
+ uint64_t a13 = a12 + r0 * a11;
+ uint64_t a23 = a22 + r1 * a11;
+ uint64_t a33 = a32 + r2 * a11;
+ uint64_t a43 = a42 + r3 * a11;
+ uint64_t a04 = a03 + r53 * a21;
+ uint64_t a14 = a13 + r54 * a21;
+ uint64_t a24 = a23 + r0 * a21;
+ uint64_t a34 = a33 + r1 * a21;
+ uint64_t a44 = a43 + r2 * a21;
+ uint64_t a05 = a04 + r52 * a31;
+ uint64_t a15 = a14 + r53 * a31;
+ uint64_t a25 = a24 + r54 * a31;
+ uint64_t a35 = a34 + r0 * a31;
+ uint64_t a45 = a44 + r1 * a31;
+ uint64_t a06 = a05 + r51 * a41;
+ uint64_t a16 = a15 + r52 * a41;
+ uint64_t a26 = a25 + r53 * a41;
+ uint64_t a36 = a35 + r54 * a41;
+ uint64_t a46 = a45 + r0 * a41;
+ uint64_t t0 = a06;
+ uint64_t t1 = a16;
+ uint64_t t2 = a26;
+ uint64_t t3 = a36;
+ uint64_t t4 = a46;
+ uint64_t mask26 = (uint64_t)0x3ffffffU;
+ uint64_t z0 = t0 >> (uint32_t)26U;
+ uint64_t z1 = t3 >> (uint32_t)26U;
+ uint64_t x0 = t0 & mask26;
+ uint64_t x3 = t3 & mask26;
+ uint64_t x1 = t1 + z0;
+ uint64_t x4 = t4 + z1;
+ uint64_t z01 = x1 >> (uint32_t)26U;
+ uint64_t z11 = x4 >> (uint32_t)26U;
+ uint64_t t = z11 << (uint32_t)2U;
+ uint64_t z12 = z11 + t;
+ uint64_t x11 = x1 & mask26;
+ uint64_t x41 = x4 & mask26;
+ uint64_t x2 = t2 + z01;
+ uint64_t x01 = x0 + z12;
+ uint64_t z02 = x2 >> (uint32_t)26U;
+ uint64_t z13 = x01 >> (uint32_t)26U;
+ uint64_t x21 = x2 & mask26;
+ uint64_t x02 = x01 & mask26;
+ uint64_t x31 = x3 + z02;
+ uint64_t x12 = x11 + z13;
+ uint64_t z03 = x31 >> (uint32_t)26U;
+ uint64_t x32 = x31 & mask26;
+ uint64_t x42 = x41 + z03;
+ uint64_t o0 = x02;
+ uint64_t o1 = x12;
+ uint64_t o2 = x21;
+ uint64_t o3 = x32;
+ uint64_t o4 = x42;
+ acc[0U] = o0;
+ acc[1U] = o1;
+ acc[2U] = o2;
+ acc[3U] = o3;
+ acc[4U] = o4;
+ Hacl_Poly1305_32_poly1305_finish(out, k, ctx);
+}
+
+void
+Chacha20Poly1305_vsx_aead_encrypt(
+ uint8_t *k,
+ uint8_t *n,
+ uint32_t aadlen,
+ uint8_t *aad,
+ uint32_t mlen,
+ uint8_t *m,
+ uint8_t *cipher,
+ uint8_t *mac)
+{
+ chacha20vsx(mlen, cipher, m, k, n, (uint32_t)1U);
+ uint8_t tmp[64U] = { 0U };
+ chacha20vsx((uint32_t)64U, tmp, tmp, k, n, (uint32_t)0U);
+ uint8_t *key = tmp;
+ poly1305_do_32(key, aadlen, aad, mlen, cipher, mac);
+}
+
+uint32_t
+Chacha20Poly1305_vsx_aead_decrypt(
+ uint8_t *k,
+ uint8_t *n,
+ uint32_t aadlen,
+ uint8_t *aad,
+ uint32_t mlen,
+ uint8_t *m,
+ uint8_t *cipher,
+ uint8_t *mac)
+{
+ uint8_t computed_mac[16U] = { 0U };
+ uint8_t tmp[64U] = { 0U };
+ chacha20vsx((uint32_t)64U, tmp, tmp, k, n, (uint32_t)0U);
+ uint8_t *key = tmp;
+ poly1305_do_32(key, aadlen, aad, mlen, cipher, computed_mac);
+ uint8_t res = (uint8_t)255U;
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)16U; i++) {
+ uint8_t uu____0 = FStar_UInt8_eq_mask(computed_mac[i], mac[i]);
+ res = uu____0 & res;
+ }
+ uint8_t z = res;
+ if (z == (uint8_t)255U) {
+ chacha20vsx(mlen, m, cipher, k, n, (uint32_t)1U);
+ return (uint32_t)0U;
+ }
+ return (uint32_t)1U;
+}
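
In the decrypt path above, the computed tag is compared with the received tag in constant time: FStar_UInt8_eq_mask yields 0xff for matching bytes and 0x00 otherwise, and the per-byte results are AND-ed so a single mismatch clears the accumulator. A self-contained sketch of the same idea (eq_mask and tags_equal are local stand-ins, not names from this patch):

    #include <stdint.h>

    /* Returns 0xff when a == b, 0x00 otherwise, without branching on data. */
    static uint8_t eq_mask(uint8_t a, uint8_t b)
    {
        uint8_t x = a ^ b;                      /* 0 iff the bytes are equal */
        /* (x - 1) wraps to 0xffff only when x == 0, so the high byte flags
         * equality. */
        return (uint8_t)(((uint16_t)x - 1U) >> 8);
    }

    /* Constant-time 16-byte tag comparison: 1 = match, 0 = mismatch. */
    static int tags_equal(const uint8_t a[16], const uint8_t b[16])
    {
        uint8_t acc = 0xffU;
        for (int i = 0; i < 16; i++) {
            acc &= eq_mask(a[i], b[i]);
        }
        return acc == 0xffU;
    }
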
diff --git a/lib/freebl/chacha20poly1305.c b/lib/freebl/chacha20poly1305.c
index 5c294a9ea..aa1a63fe4 100644
--- a/lib/freebl/chacha20poly1305.c
+++ b/lib/freebl/chacha20poly1305.c
@@ -69,6 +69,20 @@ Hacl_Chacha20Poly1305_32_aead_decrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen,
uint8_t *aad, uint32_t mlen, uint8_t *m,
uint8_t *cipher, uint8_t *mac);
+// Forward declaration from chacha20-ppc64le.S
+void chacha20vsx(uint32_t len, uint8_t *output, uint8_t *block, uint8_t *k,
+ uint8_t *nonce, uint32_t ctr);
+
+// Forward declaration from chacha20poly1305-ppc.c
+extern void
+Chacha20Poly1305_vsx_aead_encrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen,
+ uint8_t *aad, uint32_t mlen, uint8_t *m,
+ uint8_t *cipher, uint8_t *mac);
+extern uint32_t
+Chacha20Poly1305_vsx_aead_decrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen,
+ uint8_t *aad, uint32_t mlen, uint8_t *m,
+ uint8_t *cipher, uint8_t *mac);
+
SECStatus
ChaCha20Poly1305_InitContext(ChaCha20Poly1305Context *ctx,
const unsigned char *key, unsigned int keyLen,
@@ -144,6 +158,11 @@ ChaCha20Xor(uint8_t *output, uint8_t *block, uint32_t len, uint8_t *k,
}
#endif
} else
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
+ !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
+ if (__builtin_cpu_supports ("vsx")) {
+ chacha20vsx(len, output, block, k, nonce, ctr);
+ } else
#endif
{
Hacl_Chacha20_chacha20_encrypt(len, output, block, k, nonce, ctr);
@@ -212,6 +231,13 @@ ChaCha20Poly1305_Seal(const ChaCha20Poly1305Context *ctx, unsigned char *output,
}
#endif
} else
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
+ !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
+ if (__builtin_cpu_supports ("vsx")) {
+ Chacha20Poly1305_vsx_aead_encrypt(
+ (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, inputLen,
+ (uint8_t *)input, output, output + inputLen);
+ } else
#endif
{
Hacl_Chacha20Poly1305_32_aead_encrypt(
@@ -274,6 +300,13 @@ ChaCha20Poly1305_Open(const ChaCha20Poly1305Context *ctx, unsigned char *output,
}
#endif
} else
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
+ !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
+ if (__builtin_cpu_supports ("vsx")) {
+ res = Chacha20Poly1305_vsx_aead_decrypt(
+ (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen,
+ (uint8_t *)output, (uint8_t *)input, (uint8_t *)input + ciphertextLen);
+ } else
#endif
{
res = Hacl_Chacha20Poly1305_32_aead_decrypt(
@@ -323,6 +356,13 @@ ChaCha20Poly1305_Encrypt(const ChaCha20Poly1305Context *ctx,
(uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, inputLen,
(uint8_t *)input, output, outTag);
} else
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
+ !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
+ if (__builtin_cpu_supports ("vsx")) {
+ Chacha20Poly1305_vsx_aead_encrypt(
+ (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, inputLen,
+ (uint8_t *)input, output, outTag);
+ } else
#endif
{
Hacl_Chacha20Poly1305_32_aead_encrypt(
@@ -370,6 +410,13 @@ ChaCha20Poly1305_Decrypt(const ChaCha20Poly1305Context *ctx,
(uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen,
(uint8_t *)output, (uint8_t *)input, (uint8_t *)tagIn);
} else
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
+ !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
+ if (__builtin_cpu_supports ("vsx")) {
+ res = Chacha20Poly1305_vsx_aead_decrypt(
+ (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen,
+ (uint8_t *)output, (uint8_t *)input, (uint8_t *)tagIn);
+ } else
#endif
{
res = Hacl_Chacha20Poly1305_32_aead_decrypt(
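
Each call site in this file uses the same two-level gate: the preprocessor limits the new branch to little-endian ppc64 builds with AltiVec/VSX enabled, and __builtin_cpu_supports("vsx") confirms at run time that the CPU actually has VSX before the accelerated routine is taken; otherwise the portable HACL* path runs. A stripped-down sketch of the pattern (do_vsx and do_fallback are placeholders, not functions from this patch):

    #include <stdio.h>

    static void do_vsx(void)      { puts("VSX-accelerated path"); }
    static void do_fallback(void) { puts("portable HACL* path"); }

    int main(void)
    {
    #if defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \
        !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX)
        if (__builtin_cpu_supports("vsx")) {
            do_vsx();
        } else
    #endif
        {
            do_fallback();
        }
        return 0;
    }
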
diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp
index 6578fac6a..19807e401 100644
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -325,6 +325,14 @@
],
},
{
+ 'target_name': 'chacha20-ppc_lib',
+ 'type': 'static_library',
+ 'sources': [
+ 'chacha20poly1305-ppc.c',
+ 'chacha20-ppc64le.S',
+ ]
+ },
+ {
'target_name': 'armv8_c_lib',
'type': 'static_library',
'sources': [
@@ -410,6 +418,7 @@
'dependencies': [
'gcm-aes-ppc_c_lib',
'gcm-sha512-ppc_c_lib',
+ 'chacha20-ppc_lib',
],
}],
[ 'disable_altivec==1 and (target_arch=="ppc64" or target_arch=="ppc64le")', {
diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi
index 39ec14982..afbffac72 100644
--- a/lib/freebl/freebl_base.gypi
+++ b/lib/freebl/freebl_base.gypi
@@ -95,6 +95,12 @@
'mpi/mpi_arm.c',
],
}],
+ [ 'target_arch=="ppc64le"', {
+ 'sources': [
+ 'chacha20poly1305-ppc.c',
+ 'chacha20-ppc64le.S',
+ ],
+ }]
],
}],
[ 'OS=="win"', {