Diffstat (limited to 'sparc32')
-rw-r--r--  sparc32/aes-decrypt-internal.asm  132
-rw-r--r--  sparc32/aes-encrypt-internal.asm  156
-rw-r--r--  sparc32/aes.m4                     83
-rw-r--r--  sparc32/arcfour-crypt.asm         230
-rw-r--r--  sparc32/machine.m4                  0
5 files changed, 601 insertions, 0 deletions
diff --git a/sparc32/aes-decrypt-internal.asm b/sparc32/aes-decrypt-internal.asm
new file mode 100644
index 00000000..750e3d21
--- /dev/null
+++ b/sparc32/aes-decrypt-internal.asm
@@ -0,0 +1,132 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+include_src(<sparc32/aes.m4>)
+
+C Arguments
+define(<CTX>, <%i0>)
+define(<T>, <%i1>)
+define(<LENGTH>,<%i2>)
+define(<DST>, <%i3>)
+define(<SRC>, <%i4>)
+
+C AES state, two copies for unrolling
+
+define(<W0>, <%l0>)
+define(<W1>, <%l1>)
+define(<W2>, <%l2>)
+define(<W3>, <%l3>)
+
+define(<X0>, <%l4>)
+define(<X1>, <%l5>)
+define(<X2>, <%l6>)
+define(<X3>, <%l7>)
+
+C %o0-%o3 are used for loop invariants T0-T3
+define(<KEY>, <%o4>)
+define(<ROUND>, <%o5>)
+
+C %g1, %g2, %g3 are TMP1, TMP2 and TMP3
+
+C The sparc32 stack frame looks like
+C
+C %fp - 4: OS-dependent link field
+C %fp - 8: OS-dependent link field
+C %fp - 104: OS register save area.
+define(<FRAME_SIZE>, 104)
+
+ .file "aes-decrypt-internal.asm"
+
+ C _aes_decrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(_nettle_aes_decrypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+
+ C Loop invariants
+ add T, AES_TABLE0, T0
+ add T, AES_TABLE1, T1
+ add T, AES_TABLE2, T2
+ add T, AES_TABLE3, T3
+
+.Lblock_loop:
+ C Read src, and add initial subkey
+ add CTX, AES_KEYS, KEY
+ AES_LOAD(0, SRC, KEY, W0)
+ AES_LOAD(1, SRC, KEY, W1)
+ AES_LOAD(2, SRC, KEY, W2)
+ AES_LOAD(3, SRC, KEY, W3)
+
+ C Must be even, and includes the final round
+ ld [AES_NROUNDS + CTX], ROUND
+ add SRC, 16, SRC
+ add KEY, 16, KEY
+
+ srl ROUND, 1, ROUND
+ C Last two rounds handled specially
+ sub ROUND, 1, ROUND
+.Lround_loop:
+ C The AES_ROUND macro uses T0,... T3
+ C Transform W -> X
+ AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
+ AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
+ AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
+ AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+
+ C Transform X -> W
+ AES_ROUND(4, X0, X3, X2, X1, KEY, W0)
+ AES_ROUND(5, X1, X0, X3, X2, KEY, W1)
+ AES_ROUND(6, X2, X1, X0, X3, KEY, W2)
+ AES_ROUND(7, X3, X2, X1, X0, KEY, W3)
+
+ subcc ROUND, 1, ROUND
+ bne .Lround_loop
+ add KEY, 32, KEY
+
+ C Penultimate round
+ AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
+ AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
+ AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
+ AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+
+ add KEY, 16, KEY
+ C Final round
+ AES_FINAL_ROUND(0, T, X0, X3, X2, X1, KEY, DST)
+ AES_FINAL_ROUND(1, T, X1, X0, X3, X2, KEY, DST)
+ AES_FINAL_ROUND(2, T, X2, X1, X0, X3, KEY, DST)
+ AES_FINAL_ROUND(3, T, X3, X2, X1, X0, KEY, DST)
+
+ subcc LENGTH, 16, LENGTH
+ bne .Lblock_loop
+ add DST, 16, DST
+
+.Lend:
+ ret
+ restore
+EPILOGUE(_nettle_aes_decrypt)
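For orientation, the AES_LOAD macro used above (defined in sparc32/aes.m4, added later in this commit) assembles one 32-bit state word from four source octets, least-significant byte first, and XORs in the matching subkey word. A minimal C sketch under that reading; aes_load_word is an illustrative name, not part of nettle's API:

#include <stdint.h>

/* Sketch of one AES_LOAD(i, src, key, w) expansion: a byte-at-a-time
   little-endian word load (src need not be aligned), followed by the
   initial AddRoundKey XOR. */
static uint32_t
aes_load_word(const uint8_t *src, const uint32_t *key, unsigned i)
{
  uint32_t w = (uint32_t) src[4*i]
    | ((uint32_t) src[4*i + 1] << 8)
    | ((uint32_t) src[4*i + 2] << 16)
    | ((uint32_t) src[4*i + 3] << 24);
  return w ^ key[i];
}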
diff --git a/sparc32/aes-encrypt-internal.asm b/sparc32/aes-encrypt-internal.asm
new file mode 100644
index 00000000..92d6fc0e
--- /dev/null
+++ b/sparc32/aes-encrypt-internal.asm
@@ -0,0 +1,156 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+include_src(<sparc32/aes.m4>)
+
+C Arguments
+define(<CTX>, <%i0>)
+define(<T>, <%i1>)
+define(<LENGTH>,<%i2>)
+define(<DST>, <%i3>)
+define(<SRC>, <%i4>)
+
+C AES state, two copies for unrolling
+
+define(<W0>, <%l0>)
+define(<W1>, <%l1>)
+define(<W2>, <%l2>)
+define(<W3>, <%l3>)
+
+define(<X0>, <%l4>)
+define(<X1>, <%l5>)
+define(<X2>, <%l6>)
+define(<X3>, <%l7>)
+
+C %o0-%o3 are used for loop invariants T0-T3
+define(<KEY>, <%o4>)
+define(<ROUND>, <%o5>)
+
+C %g1, %g2, %g3 are TMP1, TMP2 and TMP3
+
+C I'm still slightly confused by the frame layout, specified in
+C "SYSTEM V APPLICATION BINARY INTERFACE SPARC Processor Supplement".
+C However, Sun's cc generates a 104 byte stack frame for a function
+C with no local variables, so that should be good enough for us too.
+
+C The sparc32 stack frame looks like
+C
+C %fp - 4: OS-dependent link field
+C %fp - 8: OS-dependent link field
+C %fp - 104: OS register save area
+define(<FRAME_SIZE>, 104)
+
+ .file "aes-encrypt-internal.asm"
+
+ C _aes_encrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(_nettle_aes_encrypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+
+ C Loop invariants
+ add T, AES_TABLE0, T0
+ add T, AES_TABLE1, T1
+ add T, AES_TABLE2, T2
+ add T, AES_TABLE3, T3
+
+.Lblock_loop:
+ C Read src, and add initial subkey
+ add CTX, AES_KEYS, KEY
+ AES_LOAD(0, SRC, KEY, W0)
+ AES_LOAD(1, SRC, KEY, W1)
+ AES_LOAD(2, SRC, KEY, W2)
+ AES_LOAD(3, SRC, KEY, W3)
+
+ C Must be even, and includes the final round
+ ld [AES_NROUNDS + CTX], ROUND
+ add SRC, 16, SRC
+ add KEY, 16, KEY
+
+ srl ROUND, 1, ROUND
+ C Last two rounds handled specially
+ sub ROUND, 1, ROUND
+.Lround_loop:
+ C The AES_ROUND macro uses T0,... T3
+ C Transform W -> X
+ AES_ROUND(0, W0, W1, W2, W3, KEY, X0)
+ AES_ROUND(1, W1, W2, W3, W0, KEY, X1)
+ AES_ROUND(2, W2, W3, W0, W1, KEY, X2)
+ AES_ROUND(3, W3, W0, W1, W2, KEY, X3)
+
+ C Transform X -> W
+ AES_ROUND(4, X0, X1, X2, X3, KEY, W0)
+ AES_ROUND(5, X1, X2, X3, X0, KEY, W1)
+ AES_ROUND(6, X2, X3, X0, X1, KEY, W2)
+ AES_ROUND(7, X3, X0, X1, X2, KEY, W3)
+
+ subcc ROUND, 1, ROUND
+ bne .Lround_loop
+ add KEY, 32, KEY
+
+ C Penultimate round
+ AES_ROUND(0, W0, W1, W2, W3, KEY, X0)
+ AES_ROUND(1, W1, W2, W3, W0, KEY, X1)
+ AES_ROUND(2, W2, W3, W0, W1, KEY, X2)
+ AES_ROUND(3, W3, W0, W1, W2, KEY, X3)
+
+ add KEY, 16, KEY
+ C Final round
+ AES_FINAL_ROUND(0, T, X0, X1, X2, X3, KEY, DST)
+ AES_FINAL_ROUND(1, T, X1, X2, X3, X0, KEY, DST)
+ AES_FINAL_ROUND(2, T, X2, X3, X0, X1, KEY, DST)
+ AES_FINAL_ROUND(3, T, X3, X0, X1, X2, KEY, DST)
+
+ subcc LENGTH, 16, LENGTH
+ bne .Lblock_loop
+ add DST, 16, DST
+
+.Lend:
+ ret
+ restore
+EPILOGUE(_nettle_aes_encrypt)
+
+C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for AES 128
+
+C 1: nettle-1.13 C-code
+C 2: nettle-1.13 assembler
+C 3: New C-code
+C 4: New assembler, first correct version
+C 5: New assembler, with basic scheduling of AES_ROUND.
+C 6: New assembler, with loop invariants T0-T3.
+C 7: New assembler, with basic scheduling also of AES_FINAL_ROUND.
+
+C MB/s cycles/block Code size (bytes)
+C 1 1.2 1107 592
+C 2 2.3 572 1032
+C 3 2.1 627
+C 4 1.8 722
+C 5 2.6 496
+C 6 3.0 437
+C 7 3.1 415 1448
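Each AES_ROUND invocation above computes one output word from four table lookups, one per input byte, plus a subkey word; the srl/and pairs in the macro (srl by 6, then and with 0x3fc, and so on) form the same byte indices pre-scaled to 4-byte table offsets. A hedged C sketch of the per-word computation — round_word and the t0-t3 parameters are illustrative stand-ins for the tables T0-T3 point at:

#include <stdint.h>

/* One word of a table-driven AES round: the tables carry S-box plus
   MixColumns, and picking successive bytes from a, b, c, d
   implements ShiftRows (encrypt rotation shown). */
static uint32_t
round_word(const uint32_t *t0, const uint32_t *t1,
           const uint32_t *t2, const uint32_t *t3,
           uint32_t a, uint32_t b, uint32_t c, uint32_t d,
           uint32_t subkey)
{
  return t0[a & 0xff]
    ^ t1[(b >> 8) & 0xff]
    ^ t2[(c >> 16) & 0xff]
    ^ t3[d >> 24]
    ^ subkey;
}

Under this reading, AES_ROUND(0, W0, W1, W2, W3, KEY, X0) corresponds to X0 = round_word(T0, T1, T2, T3, W0, W1, W2, W3, key[0]); the decrypt variant only rotates the b, c, d arguments the other way.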
diff --git a/sparc32/aes.m4 b/sparc32/aes.m4
new file mode 100644
index 00000000..05f465e0
--- /dev/null
+++ b/sparc32/aes.m4
@@ -0,0 +1,83 @@
+C Used as temporaries by the AES macros
+define(<TMP1>, <%g1>)
+define(<TMP2>, <%g2>)
+define(<TMP3>, <%g3>)
+
+C Loop invariants used by AES_ROUND
+define(<T0>, <%o0>)
+define(<T1>, <%o1>)
+define(<T2>, <%o2>)
+define(<T3>, <%o3>)
+
+C AES_LOAD(i, src, key, res)
+define(<AES_LOAD>, <
+ ldub [$2 + 4*$1], $4
+ ldub [$2 + 4*$1 + 1], TMP1
+ ldub [$2 + 4*$1 + 2], TMP2
+ sll TMP1, 8, TMP1
+
+ or $4, TMP1, $4
+ ldub [$2 + 4*$1+3], TMP1
+ sll TMP2, 16, TMP2
+ or $4, TMP2, $4
+
+ sll TMP1, 24, TMP1
+ C Get subkey
+ ld [$3 + 4*$1], TMP2
+ or $4, TMP1, $4
+ xor $4, TMP2, $4>)dnl
+
+C AES_ROUND(i, a, b, c, d, key, res)
+C Computes one word of the AES round
+C FIXME: Could use registers pointing directly to the four tables
+C FIXME: Needs better instruction scheduling, and perhaps more temporaries
+C Alternatively, we can use a single table and some rotations
+define(<AES_ROUND>, <
+ and $2, 0xff, TMP1 C 0
+ srl $3, 6, TMP2 C 1
+ sll TMP1, 2, TMP1 C 0
+ and TMP2, 0x3fc, TMP2 C 1
+ ld [T0 + TMP1], $7 C 0 E0
+ srl $4, 14, TMP1 C 2
+ ld [T1 + TMP2], TMP2 C 1
+ and TMP1, 0x3fc, TMP1 C 2
+ xor $7, TMP2, $7 C 1 E1
+ srl $5, 22, TMP2 C 3
+ ld [T2 + TMP1], TMP1 C 2
+ and TMP2, 0x3fc, TMP2 C 3
+ xor $7, TMP1, $7 C 2 E2
+ ld [$6 + 4*$1], TMP1 C 4
+ ld [T3 + TMP2], TMP2 C 3
+ xor $7, TMP1, $7 C 4 E4
+ xor $7, TMP2, $7 C 3 E3
+>)dnl
+
+C AES_FINAL_ROUND(i, T, a, b, c, d, key, dst)
+C Compute one word in the final round function. Output is converted to
+C octets and stored at dst. Relies on AES_SBOX being zero.
+define(<AES_FINAL_ROUND>, <
+ C Load subkey
+ ld [$7 + 4*$1], TMP3
+
+ and $3, 0xff, TMP1 C 0
+ srl $4, 8, TMP2 C 1
+ ldub [T + TMP1], TMP1 C 0
+ and TMP2, 0xff, TMP2 C 1
+ xor TMP3, TMP1, TMP1 C 0
+ ldub [T + TMP2], TMP2 C 1
+ stb TMP1, [$8 + 4*$1] C 0 E0
+ srl $5, 16, TMP1 C 2
+ srl TMP3, 8, TMP3 C 1
+ and TMP1, 0xff, TMP1 C 2
+ xor TMP3, TMP2, TMP2 C 1
+ ldub [T + TMP1], TMP1 C 2
+ stb TMP2, [$8 + 4*$1 + 1] C 1 E1
+ srl $6, 24, TMP2 C 3
+ srl TMP3, 8, TMP3 C 2
+ ldub [T + TMP2], TMP2 C 3
+ xor TMP3, TMP1, TMP1 C 2
+ srl TMP3, 8, TMP3 C 3
+ stb TMP1, [$8 + 4*$1 + 2] C 2 E2
+ xor TMP3, TMP2, TMP2 C 3
+ stb TMP2, [$8 + 4*$1 + 3] C 3 E3
+>)
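A matching C sketch of one AES_FINAL_ROUND expansion: the last round uses plain S-box lookups (no MixColumns), folds the subkey in one byte at a time — mirroring how the macro shifts TMP3 right by 8 between stores — and writes individual octets, so dst needs no alignment. Here sbox stands for the byte table at offset AES_SBOX (zero, as the comment above notes) from T; the function name is illustrative:

#include <stdint.h>

/* Sketch of AES_FINAL_ROUND(i, T, a, b, c, d, key, dst) for one
   output word, stored as four octets at dst. */
static void
final_round_word(const uint8_t *sbox, uint32_t a, uint32_t b,
                 uint32_t c, uint32_t d, uint32_t k, uint8_t *dst)
{
  dst[0] = sbox[a & 0xff] ^ (k & 0xff);
  dst[1] = sbox[(b >> 8) & 0xff] ^ ((k >> 8) & 0xff);
  dst[2] = sbox[(c >> 16) & 0xff] ^ ((k >> 16) & 0xff);
  dst[3] = sbox[d >> 24] ^ (k >> 24);
}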
diff --git a/sparc32/arcfour-crypt.asm b/sparc32/arcfour-crypt.asm
new file mode 100644
index 00000000..4d8dac94
--- /dev/null
+++ b/sparc32/arcfour-crypt.asm
@@ -0,0 +1,230 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C Define to YES to enable the more complex code that special-cases
+C SRC and DST with compatible alignment.
+
+define(<WITH_ALIGN>, <YES>)
+
+C Registers
+
+define(<CTX>, <%i0>)
+define(<LENGTH>,<%i1>)
+define(<DST>, <%i2>)
+define(<SRC>, <%i3>)
+
+define(<I1>, <%i4>)
+define(<I2>, <%i5>)
+define(<J>, <%g1>)
+define(<SI>, <%g2>)
+define(<SJ>, <%g3>)
+define(<TMP>, <%o0>)
+define(<TMP2>, <%o1>)
+define(<N>, <%o2>)
+define(<DATA>, <%o3>)
+
+C Computes the next byte of the key stream. As input, i must
+C already point to the index for the current access; the index
+C for the next access is stored in ni. The resulting key byte is
+C stored in res.
+C ARCFOUR_BYTE(i, ni, res)
+define(<ARCFOUR_BYTE>, <
+ ldub [CTX + $1], SI
+ add $1, 1, $2
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ and $2, 0xff, $2
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ stb SJ, [CTX + $1]
+ ldub [CTX + SI], $3
+>)dnl
+
+C FIXME: Consider using the caller's window
+define(<FRAME_SIZE>, 104)
+
+ .file "arcfour-crypt.asm"
+
+ C arcfour_crypt(struct arcfour_ctx *ctx,
+ C unsigned length, uint8_t *dst,
+ C const uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(nettle_arcfour_crypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+ nop
+
+ C Load both I and J
+ lduh [CTX + ARCFOUR_I], I1
+ and I1, 0xff, J
+ srl I1, 8, I1
+
+ C We want an even address for DST
+ andcc DST, 1, %g0
+ add I1, 1, I1
+ beq .Laligned2
+ and I1, 0xff, I1
+
+ mov I1, I2
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I2, I1, TMP)
+ subcc LENGTH, 1, LENGTH
+ add SRC, 1, SRC
+ xor DATA, TMP, DATA
+ stb DATA, [DST]
+ beq .Ldone
+ add DST, 1, DST
+
+.Laligned2:
+
+ cmp LENGTH, 2
+ blu .Lfinal1
+ C Harmless delay slot instruction
+ andcc DST, 2, %g0
+ beq .Laligned4
+ nop
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ add SRC, 2, SRC
+ xor DATA, TMP, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ subcc LENGTH, 2, LENGTH
+ or DATA, TMP, DATA
+
+ sth DATA, [DST]
+ beq .Ldone
+ add DST, 2, DST
+
+.Laligned4:
+ cmp LENGTH, 4
+ blu .Lfinal2
+ C Harmless delay slot instruction
+ srl LENGTH, 2, N
+
+.Loop:
+ C Main loop, with aligned writes
+
+ C FIXME: Could check if SRC is aligned, and
+ C use 32-bit reads in that case.
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ xor TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ ldub [SRC + 2], TMP2
+ or TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I1, I2, TMP)
+ xor TMP2, TMP, TMP
+ ldub [SRC + 3], TMP2
+ or TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ or TMP, DATA, DATA
+ subcc N, 1, N
+ add SRC, 4, SRC
+ st DATA, [DST]
+ bne .Loop
+ add DST, 4, DST
+
+ andcc LENGTH, 3, LENGTH
+ beq .Ldone
+ nop
+
+.Lfinal2:
+ C DST address must be 2-aligned
+ cmp LENGTH, 2
+ blu .Lfinal1
+ nop
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ add SRC, 2, SRC
+ xor DATA, TMP, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ or DATA, TMP, DATA
+
+ sth DATA, [DST]
+ beq .Ldone
+ add DST, 2, DST
+
+.Lfinal1:
+ mov I1, I2
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor DATA, TMP, DATA
+ stb DATA, [DST]
+
+.Ldone:
+ C Save back I and J
+ sll I2, 8, I2
+ or I2, J, I2
+ sth I2, [CTX + ARCFOUR_I]
+
+.Lend:
+ ret
+ restore
+
+EPILOGUE(nettle_arcfour_crypt)
+
+C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for arcfour
+
+C 1: nettle-1.13 C-code
+C 2: First working version of the assembler code
+C 3: Moved load of source byte
+C 4: Better instruction scheduling
+C 5: Special case SRC and DST with compatible alignment
+C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
+C 7: Unrolled only twice, with byte-accesses
+C 8: Unrolled, using 8-bit reads and aligned 32-bit writes.
+
+C MB/s cycles/byte Code size (bytes)
+C 1: 6.6 12.4 132
+C 2: 5.6 14.5 116
+C 3: 6.0 13.5 116
+C 4: 6.5 12.4 116
+C 5: 7.9 10.4 496
+C 6: 8.3 9.7 496
+C 7: 6.7 12.1 268
+C 8: 8.3 9.8 768
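For reference, ARCFOUR_BYTE implements the standard RC4 keystream step. A C sketch under that reading (arcfour_next is an illustrative name; the asm hoists the increment of i into the caller so the index is ready one access early). The ordering is the point of bugfix 6 in the stats above: both swap stores must land before S[si + sj] is read, because si + sj may equal i or j:

#include <stdint.h>

/* One RC4 keystream byte. S is the 256-byte permutation in
   struct arcfour_ctx; i and j are the indices kept in I1/I2 and J. */
static uint8_t
arcfour_next(uint8_t S[256], uint8_t *i, uint8_t *j)
{
  uint8_t si, sj;

  *i = (uint8_t)(*i + 1);
  si = S[*i];
  *j = (uint8_t)(*j + si);
  sj = S[*j];
  S[*j] = si;                   /* swap S[i] and S[j] */
  S[*i] = sj;
  return S[(uint8_t)(si + sj)]; /* keystream byte */
}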
diff --git a/sparc32/machine.m4 b/sparc32/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/sparc32/machine.m4