# Copyright 2021- IBM Inc. All rights reserved
#
# This file is part of Libgcrypt.
#
# Libgcrypt is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
#
# Libgcrypt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, see <https://www.gnu.org/licenses/>.
#
#===================================================================================
# Written by Danny Tsen
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#  - 26-bit limbs
#  - Handles multiple 64-byte blocks, but needs at least two 64-byte blocks
#
# Improve performance by breaking the polynomial down into a sum of products:
#     h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products.
# Set up r^4, r^3, r^2, r and s3, s2, s1, s0 in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
# Each word in a vector consists of a member of "r/s" in [a * r/s].
# # r0, r4*5, r3*5, r2*5, r1*5; # r1, r0, r4*5, r3*5, r2*5; # r2, r1, r0, r4*5, r3*5; # r3, r2, r1, r0, r4*5; # r4, r3, r2, r1, r0 ; # # # gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) # k = 32 bytes key # r3 = k (r, s) # r4 = mlen # r5 = m # .text # Block size 16 bytes # key = (r, s) # clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF # p = 2^130 - 5 # a += m # a = (r + a) % p # a += s # 16 bytes (a) # # p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; # p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; # p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; # p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; # p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; # # [r^2, r^3, r^1, r^4] # [m3, m2, m4, m1] # # multiply odd and even words .macro mul_odd vmulouw 14, 4, 26 vmulouw 10, 5, 3 vmulouw 11, 6, 2 vmulouw 12, 7, 1 vmulouw 13, 8, 0 vmulouw 15, 4, 27 vaddudm 14, 14, 10 vaddudm 14, 14, 11 vmulouw 10, 5, 26 vmulouw 11, 6, 3 vaddudm 14, 14, 12 vaddudm 14, 14, 13 # x0 vaddudm 15, 15, 10 vaddudm 15, 15, 11 vmulouw 12, 7, 2 vmulouw 13, 8, 1 vaddudm 15, 15, 12 vaddudm 15, 15, 13 # x1 vmulouw 16, 4, 28 vmulouw 10, 5, 27 vmulouw 11, 6, 26 vaddudm 16, 16, 10 vaddudm 16, 16, 11 vmulouw 12, 7, 3 vmulouw 13, 8, 2 vaddudm 16, 16, 12 vaddudm 16, 16, 13 # x2 vmulouw 17, 4, 29 vmulouw 10, 5, 28 vmulouw 11, 6, 27 vaddudm 17, 17, 10 vaddudm 17, 17, 11 vmulouw 12, 7, 26 vmulouw 13, 8, 3 vaddudm 17, 17, 12 vaddudm 17, 17, 13 # x3 vmulouw 18, 4, 30 vmulouw 10, 5, 29 vmulouw 11, 6, 28 vaddudm 18, 18, 10 vaddudm 18, 18, 11 vmulouw 12, 7, 27 vmulouw 13, 8, 26 vaddudm 18, 18, 12 vaddudm 18, 18, 13 # x4 .endm .macro mul_even vmuleuw 9, 4, 26 vmuleuw 10, 5, 3 vmuleuw 11, 6, 2 vmuleuw 12, 7, 1 vmuleuw 13, 8, 0 vaddudm 14, 14, 9 vaddudm 14, 14, 10 vaddudm 14, 14, 11 vaddudm 14, 14, 12 vaddudm 14, 14, 13 # x0 vmuleuw 9, 4, 27 vmuleuw 10, 5, 26 vmuleuw 11, 6, 3 vmuleuw 12, 7, 2 vmuleuw 13, 8, 1 vaddudm 15, 15, 9 vaddudm 15, 15, 10 vaddudm 15, 15, 11 vaddudm 15, 15, 12 vaddudm 
15, 15, 13 # x1 vmuleuw 9, 4, 28 vmuleuw 10, 5, 27 vmuleuw 11, 6, 26 vmuleuw 12, 7, 3 vmuleuw 13, 8, 2 vaddudm 16, 16, 9 vaddudm 16, 16, 10 vaddudm 16, 16, 11 vaddudm 16, 16, 12 vaddudm 16, 16, 13 # x2 vmuleuw 9, 4, 29 vmuleuw 10, 5, 28 vmuleuw 11, 6, 27 vmuleuw 12, 7, 26 vmuleuw 13, 8, 3 vaddudm 17, 17, 9 vaddudm 17, 17, 10 vaddudm 17, 17, 11 vaddudm 17, 17, 12 vaddudm 17, 17, 13 # x3 vmuleuw 9, 4, 30 vmuleuw 10, 5, 29 vmuleuw 11, 6, 28 vmuleuw 12, 7, 27 vmuleuw 13, 8, 26 vaddudm 18, 18, 9 vaddudm 18, 18, 10 vaddudm 18, 18, 11 vaddudm 18, 18, 12 vaddudm 18, 18, 13 # x4 .endm # setup r^4, r^3, r^2, r vectors # [r, r^3, r^2, r^4] # vs0 = [r0,...] # vs1 = [r1,...] # vs2 = [r2,...] # vs3 = [r3,...] # vs4 = [r4,...] # vs5 = [r4*5,...] # vs6 = [r3*5,...] # vs7 = [r2*5,...] # vs8 = [r1*5,...] # # r0, r4*5, r3*5, r2*5, r1*5; # r1, r0, r4*5, r3*5, r2*5; # r2, r1, r0, r4*5, r3*5; # r3, r2, r1, r0, r4*5; # r4, r3, r2, r1, r0 ; # .macro poly1305_setup_r # save r xxlor 26, 58, 58 xxlor 27, 59, 59 xxlor 28, 60, 60 xxlor 29, 61, 61 xxlor 30, 62, 62 xxlxor 31, 31, 31 # [r, r^3, r^2, r^4] # compute r^2 vmr 4, 26 vmr 5, 27 vmr 6, 28 vmr 7, 29 vmr 8, 30 bl do_mul # r^2 r^1 xxpermdi 58, 58, 36, 0x3 # r0 xxpermdi 59, 59, 37, 0x3 # r1 xxpermdi 60, 60, 38, 0x3 # r2 xxpermdi 61, 61, 39, 0x3 # r3 xxpermdi 62, 62, 40, 0x3 # r4 xxpermdi 36, 36, 36, 0x3 xxpermdi 37, 37, 37, 0x3 xxpermdi 38, 38, 38, 0x3 xxpermdi 39, 39, 39, 0x3 xxpermdi 40, 40, 40, 0x3 vspltisb 13, 2 vsld 9, 27, 13 vsld 10, 28, 13 vsld 11, 29, 13 vsld 12, 30, 13 vaddudm 0, 9, 27 vaddudm 1, 10, 28 vaddudm 2, 11, 29 vaddudm 3, 12, 30 bl do_mul # r^4 r^3 vmrgow 26, 26, 4 vmrgow 27, 27, 5 vmrgow 28, 28, 6 vmrgow 29, 29, 7 vmrgow 30, 30, 8 vspltisb 13, 2 vsld 9, 27, 13 vsld 10, 28, 13 vsld 11, 29, 13 vsld 12, 30, 13 vaddudm 0, 9, 27 vaddudm 1, 10, 28 vaddudm 2, 11, 29 vaddudm 3, 12, 30 # r^2 r^4 xxlor 0, 58, 58 xxlor 1, 59, 59 xxlor 2, 60, 60 xxlor 3, 61, 61 xxlor 4, 62, 62 xxlor 5, 32, 32 xxlor 6, 33, 33 xxlor 7, 34, 34 xxlor 8, 
35, 35 vspltw 9, 26, 3 vspltw 10, 26, 2 vmrgow 26, 10, 9 vspltw 9, 27, 3 vspltw 10, 27, 2 vmrgow 27, 10, 9 vspltw 9, 28, 3 vspltw 10, 28, 2 vmrgow 28, 10, 9 vspltw 9, 29, 3 vspltw 10, 29, 2 vmrgow 29, 10, 9 vspltw 9, 30, 3 vspltw 10, 30, 2 vmrgow 30, 10, 9 vsld 9, 27, 13 vsld 10, 28, 13 vsld 11, 29, 13 vsld 12, 30, 13 vaddudm 0, 9, 27 vaddudm 1, 10, 28 vaddudm 2, 11, 29 vaddudm 3, 12, 30 .endm do_mul: mul_odd # do reduction ( h %= p ) # carry reduction vspltisb 9, 2 vsrd 10, 14, 31 vsrd 11, 17, 31 vand 7, 17, 25 vand 4, 14, 25 vaddudm 18, 18, 11 vsrd 12, 18, 31 vaddudm 15, 15, 10 vsrd 11, 15, 31 vand 8, 18, 25 vand 5, 15, 25 vaddudm 4, 4, 12 vsld 10, 12, 9 vaddudm 6, 16, 11 vsrd 13, 6, 31 vand 6, 6, 25 vaddudm 4, 4, 10 vsrd 10, 4, 31 vaddudm 7, 7, 13 vsrd 11, 7, 31 vand 7, 7, 25 vand 4, 4, 25 vaddudm 5, 5, 10 vaddudm 8, 8, 11 blr # # init key # do_poly1305_init: ld 10, rmask@got(2) ld 11, 0(10) ld 12, 8(10) li 14, 16 li 15, 32 ld 10, cnum@got(2) lvx 25, 0, 10 # v25 - mask lvx 31, 14, 10 # v31 = 1a lvx 19, 15, 10 # v19 = 1 << 24 lxv 24, 48(10) # vs24 lxv 25, 64(10) # vs25 # initialize # load key from r3 to vectors ld 9, 16(3) ld 10, 24(3) ld 11, 0(3) ld 12, 8(3) # break 26 bits extrdi 14, 9, 26, 38 extrdi 15, 9, 26, 12 extrdi 16, 9, 12, 0 mtvsrdd 58, 0, 14 insrdi 16, 10, 14, 38 mtvsrdd 59, 0, 15 extrdi 17, 10, 26, 24 mtvsrdd 60, 0, 16 extrdi 18, 10, 24, 0 mtvsrdd 61, 0, 17 mtvsrdd 62, 0, 18 # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 li 9, 5 mtvsrdd 36, 0, 9 vmulouw 0, 27, 4 # v0 = rr0 vmulouw 1, 28, 4 # v1 = rr1 vmulouw 2, 29, 4 # v2 = rr2 vmulouw 3, 30, 4 # v3 = rr3 blr # # gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) # k = 32 bytes key # r3 = k (r, s) # r4 = mlen # r5 = m # .global gcry_poly1305_p10le_4blocks .align 5 gcry_poly1305_p10le_4blocks: _gcry_poly1305_p10le_4blocks: cmpdi 5, 128 blt Out_no_poly1305 stdu 1,-1024(1) mflr 0 std 14,112(1) std 15,120(1) std 16,128(1) std 17,136(1) std 18,144(1) std 19,152(1) std 20,160(1) std 
21,168(1) std 31,248(1) li 14, 256 stvx 20, 14, 1 addi 14, 14, 16 stvx 21, 14, 1 addi 14, 14, 16 stvx 22, 14, 1 addi 14, 14, 16 stvx 23, 14, 1 addi 14, 14, 16 stvx 24, 14, 1 addi 14, 14, 16 stvx 25, 14, 1 addi 14, 14, 16 stvx 26, 14, 1 addi 14, 14, 16 stvx 27, 14, 1 addi 14, 14, 16 stvx 28, 14, 1 addi 14, 14, 16 stvx 29, 14, 1 addi 14, 14, 16 stvx 30, 14, 1 addi 14, 14, 16 stvx 31, 14, 1 addi 14, 14, 16 stxvx 14, 14, 1 addi 14, 14, 16 stxvx 15, 14, 1 addi 14, 14, 16 stxvx 16, 14, 1 addi 14, 14, 16 stxvx 17, 14, 1 addi 14, 14, 16 stxvx 18, 14, 1 addi 14, 14, 16 stxvx 19, 14, 1 addi 14, 14, 16 stxvx 20, 14, 1 addi 14, 14, 16 stxvx 21, 14, 1 addi 14, 14, 16 stxvx 22, 14, 1 addi 14, 14, 16 stxvx 23, 14, 1 addi 14, 14, 16 stxvx 24, 14, 1 addi 14, 14, 16 stxvx 25, 14, 1 addi 14, 14, 16 stxvx 26, 14, 1 addi 14, 14, 16 stxvx 27, 14, 1 addi 14, 14, 16 stxvx 28, 14, 1 addi 14, 14, 16 stxvx 29, 14, 1 addi 14, 14, 16 stxvx 30, 14, 1 addi 14, 14, 16 stxvx 31, 14, 1 std 0, 1040(1) bl do_poly1305_init li 21, 0 # counter to message poly1305_setup_r # load previous state # break/convert r6 to 26 bits ld 9, 32(3) ld 10, 40(3) lwz 19, 48(3) sldi 19, 19, 24 mtvsrdd 41, 0, 19 extrdi 14, 9, 26, 38 extrdi 15, 9, 26, 12 extrdi 16, 9, 12, 0 mtvsrdd 36, 0, 14 insrdi 16, 10, 14, 38 mtvsrdd 37, 0, 15 extrdi 17, 10, 26, 24 mtvsrdd 38, 0, 16 extrdi 18, 10, 24, 0 mtvsrdd 39, 0, 17 mtvsrdd 40, 0, 18 vor 8, 8, 9 # input m1 m2 add 20, 4, 21 xxlor 49, 24, 24 xxlor 50, 25, 25 lxvw4x 43, 0, 20 addi 17, 20, 16 lxvw4x 44, 0, 17 vperm 14, 11, 12, 17 vperm 15, 11, 12, 18 vand 9, 14, 25 # a0 vsrd 10, 14, 31 # >> 26 vsrd 11, 10, 31 # 12 bits left vand 10, 10, 25 # a1 vspltisb 13, 12 vand 16, 15, 25 vsld 12, 16, 13 vor 11, 11, 12 vand 11, 11, 25 # a2 vspltisb 13, 14 vsrd 12, 15, 13 # >> 14 vsrd 13, 12, 31 # >> 26, a4 vand 12, 12, 25 # a3 vaddudm 20, 4, 9 vaddudm 21, 5, 10 vaddudm 22, 6, 11 vaddudm 23, 7, 12 vaddudm 24, 8, 13 # m3 m4 addi 17, 17, 16 lxvw4x 43, 0, 17 addi 17, 17, 16 lxvw4x 44, 0, 17 vperm 14, 
11, 12, 17 vperm 15, 11, 12, 18 vand 9, 14, 25 # a0 vsrd 10, 14, 31 # >> 26 vsrd 11, 10, 31 # 12 bits left vand 10, 10, 25 # a1 vspltisb 13, 12 vand 16, 15, 25 vsld 12, 16, 13 vspltisb 13, 14 vor 11, 11, 12 vand 11, 11, 25 # a2 vsrd 12, 15, 13 # >> 14 vsrd 13, 12, 31 # >> 26, a4 vand 12, 12, 25 # a3 # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] vmrgow 4, 9, 20 vmrgow 5, 10, 21 vmrgow 6, 11, 22 vmrgow 7, 12, 23 vmrgow 8, 13, 24 vaddudm 8, 8, 19 addi 5, 5, -64 addi 21, 21, 64 li 9, 64 divdu 31, 5, 9 mtctr 31 # h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r # Rewrite the polynominal sum of product as follows, # h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 # h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 # .... Repeat # h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> # h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r # loop_4blocks: # Multiply odd words and even words mul_odd mul_even # carry reduction vspltisb 9, 2 vsrd 10, 14, 31 vsrd 11, 17, 31 vand 7, 17, 25 vand 4, 14, 25 vaddudm 18, 18, 11 vsrd 12, 18, 31 vaddudm 15, 15, 10 vsrd 11, 15, 31 vand 8, 18, 25 vand 5, 15, 25 vaddudm 4, 4, 12 vsld 10, 12, 9 vaddudm 6, 16, 11 vsrd 13, 6, 31 vand 6, 6, 25 vaddudm 4, 4, 10 vsrd 10, 4, 31 vaddudm 7, 7, 13 vsrd 11, 7, 31 vand 7, 7, 25 vand 4, 4, 25 vaddudm 5, 5, 10 vaddudm 8, 8, 11 # input m1 m2 m3 m4 add 20, 4, 21 xxlor 49, 24, 24 xxlor 50, 25, 25 lxvw4x 43, 0, 20 addi 17, 20, 16 lxvw4x 44, 0, 17 vperm 14, 11, 12, 17 vperm 15, 11, 12, 18 addi 17, 17, 16 lxvw4x 43, 0, 17 addi 17, 17, 16 lxvw4x 44, 0, 17 vperm 17, 11, 12, 17 vperm 18, 11, 12, 18 vand 20, 14, 25 # a0 vand 9, 17, 25 # a0 vsrd 21, 14, 31 # >> 26 vsrd 22, 21, 31 # 12 bits left vsrd 10, 17, 31 # >> 26 vsrd 11, 10, 31 # 12 bits left vand 21, 21, 25 # a1 vand 10, 10, 25 # a1 vspltisb 13, 12 vand 16, 15, 25 vsld 23, 16, 13 vor 22, 22, 23 vand 22, 22, 25 # a2 vand 16, 18, 25 vsld 12, 16, 13 vor 11, 11, 12 vand 11, 
11, 25 # a2 vspltisb 13, 14 vsrd 23, 15, 13 # >> 14 vsrd 24, 23, 31 # >> 26, a4 vand 23, 23, 25 # a3 vsrd 12, 18, 13 # >> 14 vsrd 13, 12, 31 # >> 26, a4 vand 12, 12, 25 # a3 vaddudm 4, 4, 20 vaddudm 5, 5, 21 vaddudm 6, 6, 22 vaddudm 7, 7, 23 vaddudm 8, 8, 24 # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] vmrgow 4, 9, 4 vmrgow 5, 10, 5 vmrgow 6, 11, 6 vmrgow 7, 12, 7 vmrgow 8, 13, 8 vaddudm 8, 8, 19 addi 5, 5, -64 addi 21, 21, 64 bdnz loop_4blocks xxlor 58, 0, 0 xxlor 59, 1, 1 xxlor 60, 2, 2 xxlor 61, 3, 3 xxlor 62, 4, 4 xxlor 32, 5, 5 xxlor 33, 6, 6 xxlor 34, 7, 7 xxlor 35, 8, 8 # Multiply odd words and even words mul_odd mul_even # Sum the products. xxpermdi 41, 31, 46, 0 xxpermdi 42, 31, 47, 0 vaddudm 4, 14, 9 xxpermdi 36, 31, 36, 3 vaddudm 5, 15, 10 xxpermdi 37, 31, 37, 3 xxpermdi 43, 31, 48, 0 vaddudm 6, 16, 11 xxpermdi 38, 31, 38, 3 xxpermdi 44, 31, 49, 0 vaddudm 7, 17, 12 xxpermdi 39, 31, 39, 3 xxpermdi 45, 31, 50, 0 vaddudm 8, 18, 13 xxpermdi 40, 31, 40, 3 # carry reduction vspltisb 9, 2 vsrd 10, 4, 31 vsrd 11, 7, 31 vand 7, 7, 25 vand 4, 4, 25 vaddudm 8, 8, 11 vsrd 12, 8, 31 vaddudm 5, 5, 10 vsrd 11, 5, 31 vand 8, 8, 25 vand 5, 5, 25 vaddudm 4, 4, 12 vsld 10, 12, 9 vaddudm 6, 6, 11 vsrd 13, 6, 31 vand 6, 6, 25 vaddudm 4, 4, 10 vsrd 10, 4, 31 vaddudm 7, 7, 13 vsrd 11, 7, 31 vand 7, 7, 25 vand 4, 4, 25 vaddudm 5, 5, 10 vaddudm 8, 8, 11 b do_final_update do_final_update: # v4, v5, v6, v7 and v8 are 26 bit vectors vsld 5, 5, 31 vor 20, 4, 5 vspltisb 11, 12 vsrd 12, 6, 11 vsld 6, 6, 31 vsld 6, 6, 31 vor 20, 20, 6 vspltisb 11, 14 vsld 7, 7, 11 vor 21, 7, 12 mfvsrld 16, 40 # save last 2 bytes vsld 8, 8, 11 vsld 8, 8, 31 vor 21, 21, 8 mfvsrld 17, 52 mfvsrld 19, 53 srdi 16, 16, 24 std 17, 32(3) std 19, 40(3) stw 16, 48(3) Out_loop: li 3, 0 li 14, 256 lvx 20, 14, 1 addi 14, 14, 16 lvx 21, 14, 1 addi 14, 14, 16 lvx 22, 14, 1 addi 14, 14, 16 lvx 23, 14, 1 addi 14, 14, 16 lvx 24, 14, 1 addi 14, 14, 16 lvx 25, 14, 1 addi 14, 14, 16 lvx 26, 14, 1 addi 14, 14, 
16 lvx 27, 14, 1 addi 14, 14, 16 lvx 28, 14, 1 addi 14, 14, 16 lvx 29, 14, 1 addi 14, 14, 16 lvx 30, 14, 1 addi 14, 14, 16 lvx 31, 14, 1 addi 14, 14, 16 lxvx 14, 14, 1 addi 14, 14, 16 lxvx 15, 14, 1 addi 14, 14, 16 lxvx 16, 14, 1 addi 14, 14, 16 lxvx 17, 14, 1 addi 14, 14, 16 lxvx 18, 14, 1 addi 14, 14, 16 lxvx 19, 14, 1 addi 14, 14, 16 lxvx 20, 14, 1 addi 14, 14, 16 lxvx 21, 14, 1 addi 14, 14, 16 lxvx 22, 14, 1 addi 14, 14, 16 lxvx 23, 14, 1 addi 14, 14, 16 lxvx 24, 14, 1 addi 14, 14, 16 lxvx 25, 14, 1 addi 14, 14, 16 lxvx 26, 14, 1 addi 14, 14, 16 lxvx 27, 14, 1 addi 14, 14, 16 lxvx 28, 14, 1 addi 14, 14, 16 lxvx 29, 14, 1 addi 14, 14, 16 lxvx 30, 14, 1 addi 14, 14, 16 lxvx 31, 14, 1 ld 0, 1040(1) ld 14,112(1) ld 15,120(1) ld 16,128(1) ld 17,136(1) ld 18,144(1) ld 19,152(1) ld 20,160(1) ld 21,168(1) ld 31,248(1) mtlr 0 addi 1, 1, 1024 blr Out_no_poly1305: li 3, 0 blr .data .align 5 rmask: .byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f cnum: .long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 .long 0x1a, 0x00, 0x1a, 0x00 .long 0x01000000, 0x01000000, 0x01000000, 0x01000000 .long 0x00010203, 0x04050607, 0x10111213, 0x14151617 .long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f .long 0x05, 0x00, 0x00, 0x00 .long 0x02020202, 0x02020202, 0x02020202, 0x02020202 .long 0xffffffff, 0xffffffff, 0x00000000, 0x00000000