diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-12-30 17:46:07 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-12-30 17:46:07 +0200 |
commit | 1f75681cbba895ea2f7ea0637900721f4522e729 (patch) | |
tree | 19eb7a48b5513f9f5811b1e515a3d4c8e637641c /cipher/chacha20-s390x.S | |
parent | 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (diff) | |
download | libgcrypt-1f75681cbba895ea2f7ea0637900721f4522e729.tar.gz |
Add s390x/zSeries implementation of Poly1305 [branch: cipher-s390x-optimizations]
* cipher/Makefile.am: Add 'poly1305-s390x.S' and
'asm-poly1305-s390x.h'.
* cipher/asm-poly1305-s390x.h: New.
* cipher/chacha20-s390x.S (_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New, stitched
chacha20-poly1305 implementation.
* cipher/chacha20.c (USE_S390X_VX_POLY1305): New.
(_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New prototypes.
(_gcry_chacha20_poly1305_encrypt, _gcry_chacha20_poly1305_decrypt): Add
s390x/VX stitched chacha20-poly1305 code-path.
* cipher/poly1305-s390x.S: New.
* cipher/poly1305.c (USE_S390X_ASM, HAVE_ASM_POLY1305_BLOCKS): New.
[USE_S390X_ASM] (_gcry_poly1305_s390x_blocks1, poly1305_blocks): New.
* configure.ac (gcry_cv_gcc_inline_asm_s390x): Check for 'risbgn' and
'algrk' instructions.
* tests/basic.c (_check_poly1305_cipher): Add large chacha20-poly1305
test vector.
--
Patch adds Poly1305 and stitched ChaCha20-Poly1305 implementation
for zSeries. Stitched implementation interleaves ChaCha20 and Poly1305
processing for higher instruction level parallelism and better
utilization of execution units.
Benchmark on z15 (4504 Mhz):
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.16 ns/B 823.2 MiB/s 5.22 c/B
POLY1305 dec | 1.16 ns/B 823.2 MiB/s 5.22 c/B
POLY1305 auth | 0.736 ns/B 1295 MiB/s 3.32 c/B
After (chacha20-poly1305 ~71% faster, poly1305 ~29% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.677 ns/B 1409 MiB/s 3.05 c/B
POLY1305 dec | 0.655 ns/B 1456 MiB/s 2.95 c/B
POLY1305 auth | 0.569 ns/B 1675 MiB/s 2.56 c/B
GnuPG-bug-id: 5202
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20-s390x.S')
-rw-r--r-- | cipher/chacha20-s390x.S | 673 |
1 file changed, 673 insertions, 0 deletions
diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S index 2cd38330..9b1d59c6 100644 --- a/cipher/chacha20-s390x.S +++ b/cipher/chacha20-s390x.S @@ -23,6 +23,7 @@ #if defined(HAVE_GCC_INLINE_ASM_S390X_VX) #include "asm-common-s390x.h" +#include "asm-poly1305-s390x.h" .machine "z13+vx" .text @@ -574,6 +575,393 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1, .-_gcry_chacha20_s390x_vx_blocks4_2_1;) /********************************************************************** + 4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1 +ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;) + +_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks + * %r6: poly1305 state + * 160(%r15): poly1305 src + */ + CFI_STARTPROC(); + + START_STACK(%r14); + lgr NBLKS, %r5; + + /* Load constants. */ + larl %r8, .Lconsts; + vl TMP0, (.Lwordswap - .Lconsts)(%r8); + vl TMP1, (.Lone - .Lconsts)(%r8); + vl TMP2, (.Lbswap128 - .Lconsts)(%r8); + + /* Load state. */ + vlm S0, S3, 0(INPUT); + vperm S0, S0, S0, TMP0; + vperm S1, S1, S1, TMP0; + vperm S2, S2, S2, TMP0; + vperm S3, S3, S3, TMP0; + + /* Store parameters to stack. */ + stmg %r2, %r6, STACK_INPUT(%r15); + + lgr POLY_RSTATE, %r6; + lgr NBLKS, %r5; + + lg POLY_RSRC, 0(%r15); + lg POLY_RSRC, 160(POLY_RSRC); + stg POLY_RSRC, STACK_POSRC(%r15); + + /* Load poly1305 state */ + POLY1305_LOAD_STATE(); + + clgijl NBLKS, 4, .Lloop2_poly; + +.balign 4 +.Lloop4_poly: + /* Process four chacha20 blocks and 16 poly1305 blocks. 
*/ + vlr TMP3, S3; + lghi ROUND, (20 / 4); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, TMP3; + vag TMP3, TMP3, TMP1; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vlr B3, TMP3; + vag TMP3, TMP3, TMP1; + vlr C0, S0; + vlr C1, S1; + vlr C2, S2; + vlr C3, TMP3; + vlr D0, S0; + vlr D1, S1; + vlr D2, S2; + vag D3, TMP3, TMP1; + + slgfi NBLKS, 4; + +.balign 4 +.Lround4_4_poly: + /* Total 15 poly1305 blocks processed by this loop. */ + QUARTERROUND4_4_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6()); + QUARTERROUND4_4_POLY(1, 2, 3, + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_4_POLY(3, 2, 1, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(2 * 16); + INC_POLY1305_SRC(3 * 16), + POLY1305_BLOCK_PART2()); + QUARTERROUND4_4_POLY(1, 2, 3, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround4_4_poly; + + POLY1305_BLOCK_PART1(0 * 16); + INC_POLY1305_SRC(1 * 16); + stg POLY_RSRC, STACK_POSRC(%r15); + + lg %r14, STACK_SRC(%r15); + vlm IO0, IO7, 0(%r14); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + POLY1305_BLOCK_PART2(); + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. 
*/ + POLY1305_BLOCK_PART3(); + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + POLY1305_BLOCK_PART4(); + PLUS(C0, S0); + PLUS(C1, S1); + PLUS(C2, S2); + PLUS(C3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(D0, S0); + PLUS(D1, S1); + PLUS(D2, S2); + PLUS(D3, S3); + vag S3, S3, TMP1; /* Update counter. */ + POLY1305_BLOCK_PART5(); + vperm C0, C0, C0, TMP2; + vperm C1, C1, C1, TMP2; + vperm C2, C2, C2, TMP2; + vperm C3, C3, C3, TMP2; + vperm D0, D0, D0, TMP2; + vperm D1, D1, D1, TMP2; + vperm D2, D2, D2, TMP2; + vperm D3, D3, D3, TMP2; + + POLY1305_BLOCK_PART6(); + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vlm A0, B3, 128(%r14); + aghi %r14, 256; + stg %r14, STACK_SRC(%r15); + + lg %r14, STACK_DST(%r15); + POLY1305_BLOCK_PART7(); + vstm IO0, IO7, 0(%r14); + XOR(A0, C0); + XOR(A1, C1); + XOR(A2, C2); + XOR(A3, C3); + XOR(B0, D0); + XOR(B1, D1); + XOR(B2, D2); + XOR(B3, D3); + POLY1305_BLOCK_PART8(); + vstm A0, B3, 128(%r14); + aghi %r14, 256; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 4, .Lloop4_poly; + + CLEAR(C0); + CLEAR(C1); + CLEAR(C2); + CLEAR(C3); + CLEAR(D0); + CLEAR(D1); + CLEAR(D2); + CLEAR(D3); + +.balign 4 +.Lloop2_poly: + clgijl NBLKS, 2, .Lloop1_poly; + + /* Process two chacha20 and eight poly1305 blocks. */ + lghi ROUND, ((20 - 4) / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vag B3, S3, TMP1; + + slgfi NBLKS, 2; + +.balign 4 +.Lround4_2_poly: + /* Total eight poly1305 blocks processed by this loop. 
*/ + QUARTERROUND4_2_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + INC_POLY1305_SRC(1 * 16); + QUARTERROUND4_2_POLY(1, 2, 3, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround4_2_poly; + + stg POLY_RSRC, STACK_POSRC(%r15); + lg %r14, STACK_SRC(%r15); + + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + + vlm IO0, IO7, 0(%r14); + aghi %r14, 128; + stg %r14, STACK_SRC(%r15); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + + lg %r14, STACK_DST(%r15); + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vstm IO0, IO7, 0(%r14); + aghi %r14, 128; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 2, .Lloop2_poly; + + CLEAR(B0); + CLEAR(B1); + CLEAR(B2); + CLEAR(B3); + +.balign 4 +.Lloop1_poly: + clgijl NBLKS, 1, .Ldone_poly; + + /* Process one chacha20 block and four poly1305 blocks.*/ + lghi ROUND, ((20 - 4) / 4); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + + slgfi NBLKS, 1; + +.balign 4 +.Lround4_1_poly: + /* Total four poly1305 blocks processed by this loop. 
*/ + QUARTERROUND4_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2()); + INC_POLY1305_SRC(1 * 16); + QUARTERROUND4_POLY(1, 2, 3, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_POLY(3, 2, 1, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6()); + QUARTERROUND4_POLY(1, 2, 3, + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brct ROUND, .Lround4_1_poly; + + stg POLY_RSRC, STACK_POSRC(%r15); + lg %r14, STACK_SRC(%r15); + + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + + vlm IO0, IO3, 0(%r14); + aghi %r14, 64; + stg %r14, STACK_SRC(%r15); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + + lg %r14, STACK_DST(%r15); + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + vstm IO0, IO3, 0(%r14); + aghi %r14, 64; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 1, .Lloop1_poly; + +.balign 4 +.Ldone_poly: + /* Store poly1305 state */ + lg POLY_RSTATE, STACK_POCTX(%r15); + POLY1305_STORE_STATE(); + + /* Store counter. */ + lg INPUT, STACK_INPUT(%r15); + vperm S3, S3, S3, TMP0; + vst S3, (48)(INPUT); + + /* Clear the used vector registers. 
*/ + CLEAR(A0); + CLEAR(A1); + CLEAR(A2); + CLEAR(A3); + CLEAR(IO0); + CLEAR(IO1); + CLEAR(IO2); + CLEAR(IO3); + CLEAR(IO4); + CLEAR(IO5); + CLEAR(IO6); + CLEAR(IO7); + CLEAR(TMP0); + CLEAR(TMP1); + CLEAR(TMP2); + + END_STACK(%r14); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1, + .-_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1;) + +/********************************************************************** 8-way chacha20 ("vertical") **********************************************************************/ @@ -884,5 +1272,290 @@ _gcry_chacha20_s390x_vx_blocks8: ELF(.size _gcry_chacha20_s390x_vx_blocks8, .-_gcry_chacha20_s390x_vx_blocks8;) +/********************************************************************** + 8-way stitched chacha20-poly1305 ("vertical") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_poly1305_s390x_vx_blocks8 +ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;) + +_gcry_chacha20_poly1305_s390x_vx_blocks8: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks (multiple of 8) + * %r6: poly1305 state + * 160(%r15): poly1305 src + */ + CFI_STARTPROC(); + + START_STACK(%r14); + + /* Store parameters to stack. */ + stmg %r2, %r6, STACK_INPUT(%r15); + + lgr POLY_RSTATE, %r6; + lgr NBLKS, %r5; + + lg POLY_RSRC, 0(%r15); + lg POLY_RSRC, 160(POLY_RSRC); + stg POLY_RSRC, STACK_POSRC(%r15); + + /* Load poly1305 state */ + POLY1305_LOAD_STATE(); + +.balign 4 + /* Process eight chacha20 blocks and 32 poly1305 blocks per loop. */ +.Lloop8_poly: + lg INPUT, STACK_INPUT(%r15); + larl %r8, .Lconsts; + + vlm Y0, Y3, 0(INPUT); + + slgfi NBLKS, 8; + lghi ROUND, (20 / 2); + + /* Construct counter vectors X12/X13 & Y12/Y13. */ + vl X4, (.Ladd_counter_0123 - .Lconsts)(%r8); + vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r8); + lg %r8, (12 * 4)(INPUT); /* Update counter. 
*/ + vrepf Y12, Y3, 0; + vrepf Y13, Y3, 1; + vaccf X5, Y12, X4; + vaccf Y5, Y12, Y4; + vaf X12, Y12, X4; + vaf Y12, Y12, Y4; + vaf X13, Y13, X5; + vaf Y13, Y13, Y5; + rllg %r8, %r8, 32; + + vrepf X0, Y0, 0; + vrepf X1, Y0, 1; + vrepf X2, Y0, 2; + vrepf X3, Y0, 3; + vrepf X4, Y1, 0; + vrepf X5, Y1, 1; + vrepf X6, Y1, 2; + vrepf X7, Y1, 3; + vrepf X8, Y2, 0; + vrepf X9, Y2, 1; + vrepf X10, Y2, 2; + vrepf X11, Y2, 3; + vrepf X14, Y3, 2; + vrepf X15, Y3, 3; + agfi %r8, 8; + + /* Store counters for blocks 0-7. */ + vstm X12, X13, (STACK_CTR + 0 * 16)(%r15); + vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15); + rllg %r8, %r8, 32; + + vlr Y0, X0; + vlr Y1, X1; + vlr Y2, X2; + vlr Y3, X3; + vlr Y4, X4; + vlr Y5, X5; + vlr Y6, X6; + vlr Y7, X7; + vlr Y8, X8; + vlr Y9, X9; + vlr Y10, X10; + vlr Y11, X11; + vlr Y14, X14; + vlr Y15, X15; + stg %r8, (12 * 4)(INPUT); + +.balign 4 +.Lround2_8_poly: + /* Total 30 poly1305 blocks processed by this loop. */ + QUARTERROUND4_V8_POLY(X0, X4, X8, X12, X1, X5, X9, X13, + X2, X6, X10, X14, X3, X7, X11, X15, + Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13, + Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_V8_POLY(X0, X5, X10, X15, X1, X6, X11, X12, + X2, X7, X8, X13, X3, X4, X9, X14, + Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12, + Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(2 * 16); + INC_POLY1305_SRC(3 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround2_8_poly; + + 
POLY1305_BLOCK_PART1(0 * 16); + + /* Store blocks 4-7. */ + vstm Y0, Y15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 0-3. */ + vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15); + + stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */ + + lghi ROUND, 1; + j .Lfirst_output_4blks_8_poly; + +.balign 4 +.Lsecond_output_4blks_8_poly: + + POLY1305_BLOCK_PART1(1 * 16); + + /* Load blocks 4-7. */ + vlm X0, X15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 4-7. */ + vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15); + + INC_POLY1305_SRC(2 * 16); + stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */ + + lghi ROUND, 0; + +.balign 4 + /* Output four chacha20 blocks and one poly1305 block per loop. */ +.Lfirst_output_4blks_8_poly: + lg %r14, STACK_INPUT(%r15); + vlm Y12, Y15, 0(%r14); + POLY1305_BLOCK_PART2(); + PLUS(X12, Y0); + PLUS(X13, Y1); + vrepf Y0, Y12, 0; + vrepf Y1, Y12, 1; + vrepf Y2, Y12, 2; + vrepf Y3, Y12, 3; + vrepf Y4, Y13, 0; + vrepf Y5, Y13, 1; + vrepf Y6, Y13, 2; + vrepf Y7, Y13, 3; + vrepf Y8, Y14, 0; + vrepf Y9, Y14, 1; + vrepf Y10, Y14, 2; + vrepf Y11, Y14, 3; + vrepf Y14, Y15, 2; + vrepf Y15, Y15, 3; + POLY1305_BLOCK_PART3(); + PLUS(X0, Y0); + PLUS(X1, Y1); + PLUS(X2, Y2); + PLUS(X3, Y3); + PLUS(X4, Y4); + PLUS(X5, Y5); + PLUS(X6, Y6); + PLUS(X7, Y7); + PLUS(X8, Y8); + PLUS(X9, Y9); + PLUS(X10, Y10); + PLUS(X11, Y11); + PLUS(X14, Y14); + PLUS(X15, Y15); + POLY1305_BLOCK_PART4(); + + larl %r14, .Lconsts; + vl Y15, (.Lbswap32 - .Lconsts)(%r14); + TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7, + Y9, Y10, Y11, Y12, Y13, Y14); + lg %r14, STACK_SRC(%r15); + POLY1305_BLOCK_PART5(); + TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15, + Y9, Y10, Y11, Y12, Y13, Y14); + + vlm Y0, Y14, 0(%r14); + POLY1305_BLOCK_PART6(); + vperm X0, X0, X0, Y15; + vperm X1, X1, X1, Y15; + vperm X2, X2, X2, Y15; + vperm X3, X3, X3, Y15; + vperm X4, X4, X4, Y15; + vperm X5, X5, X5, Y15; + vperm X6, X6, X6, Y15; + vperm X7, X7, X7, Y15; + 
vperm X8, X8, X8, Y15; + vperm X9, X9, X9, Y15; + vperm X10, X10, X10, Y15; + vperm X11, X11, X11, Y15; + vperm X12, X12, X12, Y15; + vperm X13, X13, X13, Y15; + vperm X14, X14, X14, Y15; + vperm X15, X15, X15, Y15; + vl Y15, (15 * 16)(%r14); + POLY1305_BLOCK_PART7(); + + aghi %r14, 256; + stg %r14, STACK_SRC(%r15); + lg %r14, STACK_DST(%r15); + + XOR(Y0, X0); + XOR(Y1, X4); + XOR(Y2, X8); + XOR(Y3, X12); + XOR(Y4, X1); + XOR(Y5, X5); + XOR(Y6, X9); + XOR(Y7, X13); + XOR(Y8, X2); + XOR(Y9, X6); + XOR(Y10, X10); + XOR(Y11, X14); + XOR(Y12, X3); + XOR(Y13, X7); + XOR(Y14, X11); + XOR(Y15, X15); + POLY1305_BLOCK_PART8(); + vstm Y0, Y15, 0(%r14); + + aghi %r14, 256; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgije ROUND, 1, .Lsecond_output_4blks_8_poly; + + clgijhe NBLKS, 8, .Lloop8_poly; + + /* Store poly1305 state */ + lg POLY_RSTATE, STACK_POCTX(%r15); + POLY1305_STORE_STATE(); + + /* Clear the used vector registers */ + DST_8(CLEAR, 0, _); + DST_8(CLEAR, 1, _); + DST_8(CLEAR, 2, _); + DST_8(CLEAR, 3, _); + + /* Clear sensitive data in stack. */ + vlm Y0, Y15, STACK_Y0_Y15(%r15); + vlm Y0, Y3, STACK_CTR(%r15); + + END_STACK(%r14); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks8, + .-_gcry_chacha20_poly1305_s390x_vx_blocks8;) + #endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/ #endif /*__s390x__*/ |