Diffstat (limited to 'sparc32')
-rw-r--r--  sparc32/aes-decrypt-internal.asm  132
-rw-r--r--  sparc32/aes-encrypt-internal.asm  156
-rw-r--r--  sparc32/aes.m4                     83
-rw-r--r--  sparc32/arcfour-crypt.asm         230
-rw-r--r--  sparc32/machine.m4                  0
5 files changed, 601 insertions, 0 deletions
diff --git a/sparc32/aes-decrypt-internal.asm b/sparc32/aes-decrypt-internal.asm
new file mode 100644
index 00000000..750e3d21
--- /dev/null
+++ b/sparc32/aes-decrypt-internal.asm
@@ -0,0 +1,132 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+include_src(<sparc32/aes.m4>)
+
+C Arguments
+define(<CTX>, <%i0>)
+define(<T>, <%i1>)
+define(<LENGTH>,<%i2>)
+define(<DST>, <%i3>)
+define(<SRC>, <%i4>)
+
+C AES state, two copies for unrolling
+
+define(<W0>, <%l0>)
+define(<W1>, <%l1>)
+define(<W2>, <%l2>)
+define(<W3>, <%l3>)
+
+define(<X0>, <%l4>)
+define(<X1>, <%l5>)
+define(<X2>, <%l6>)
+define(<X3>, <%l7>)
+
+C %o0-%o3 are used for loop invariants T0-T3
+define(<KEY>, <%o4>)
+define(<ROUND>, <%o5>)
+
+C %g1, %g2, %g3 are TMP1, TMP2 and TMP3
+
+C The sparc32 stack frame looks like
+C
+C %fp - 4: OS-dependent link field
+C %fp - 8: OS-dependent link field
+C %fp - 104: OS register save area.
+define(<FRAME_SIZE>, 104)
+
+ .file "aes-decrypt-internal.asm"
+
+ C _aes_decrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(_nettle_aes_decrypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+
+ C Loop invariants
+ add T, AES_TABLE0, T0
+ add T, AES_TABLE1, T1
+ add T, AES_TABLE2, T2
+ add T, AES_TABLE3, T3
+
+.Lblock_loop:
+ C Read src, and add initial subkey
+ add CTX, AES_KEYS, KEY
+ AES_LOAD(0, SRC, KEY, W0)
+ AES_LOAD(1, SRC, KEY, W1)
+ AES_LOAD(2, SRC, KEY, W2)
+ AES_LOAD(3, SRC, KEY, W3)
+
+ C Must be even, and includes the final round
+ ld [AES_NROUNDS + CTX], ROUND
+ add SRC, 16, SRC
+ add KEY, 16, KEY
+
+ srl ROUND, 1, ROUND
+ C Last two rounds handled specially
+ sub ROUND, 1, ROUND
+.Lround_loop:
+ C The AES_ROUND macro uses T0,... T3
+ C Transform W -> X
+ AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
+ AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
+ AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
+ AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+
+ C Transform X -> W
+ AES_ROUND(4, X0, X3, X2, X1, KEY, W0)
+ AES_ROUND(5, X1, X0, X3, X2, KEY, W1)
+ AES_ROUND(6, X2, X1, X0, X3, KEY, W2)
+ AES_ROUND(7, X3, X2, X1, X0, KEY, W3)
+
+ subcc ROUND, 1, ROUND
+ bne .Lround_loop
+ add KEY, 32, KEY
+
+ C Penultimate round
+ AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
+ AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
+ AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
+ AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+
+ add KEY, 16, KEY
+ C Final round
+ AES_FINAL_ROUND(0, T, X0, X3, X2, X1, KEY, DST)
+ AES_FINAL_ROUND(1, T, X1, X0, X3, X2, KEY, DST)
+ AES_FINAL_ROUND(2, T, X2, X1, X0, X3, KEY, DST)
+ AES_FINAL_ROUND(3, T, X3, X2, X1, X0, KEY, DST)
+
+ subcc LENGTH, 16, LENGTH
+ bne .Lblock_loop
+ add DST, 16, DST
+
+.Lend:
+ ret
+ restore
+EPILOGUE(_nettle_aes_decrypt)
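For orientation, the AES_LOAD macro used above (defined in sparc32/aes.m4, added later in this commit) assembles one 32-bit state word from four source octets, least-significant byte first, and XORs in the matching subkey word. A minimal C sketch under that reading; aes_load_word is an illustrative name, not part of nettle's API:

#include <stdint.h>

/* Sketch of one AES_LOAD(i, src, key, w) expansion: a byte-at-a-time
   little-endian word load (src need not be aligned), followed by the
   initial AddRoundKey XOR. */
static uint32_t
aes_load_word(const uint8_t *src, const uint32_t *key, unsigned i)
{
  uint32_t w = (uint32_t) src[4*i]
    | ((uint32_t) src[4*i + 1] << 8)
    | ((uint32_t) src[4*i + 2] << 16)
    | ((uint32_t) src[4*i + 3] << 24);
  return w ^ key[i];
}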
diff --git a/sparc32/aes-encrypt-internal.asm b/sparc32/aes-encrypt-internal.asm
new file mode 100644
index 00000000..92d6fc0e
--- /dev/null
+++ b/sparc32/aes-encrypt-internal.asm
@@ -0,0 +1,156 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+include_src(<sparc32/aes.m4>)
+
+C Arguments
+define(<CTX>, <%i0>)
+define(<T>, <%i1>)
+define(<LENGTH>,<%i2>)
+define(<DST>, <%i3>)
+define(<SRC>, <%i4>)
+
+C AES state, two copies for unrolling
+
+define(<W0>, <%l0>)
+define(<W1>, <%l1>)
+define(<W2>, <%l2>)
+define(<W3>, <%l3>)
+
+define(<X0>, <%l4>)
+define(<X1>, <%l5>)
+define(<X2>, <%l6>)
+define(<X3>, <%l7>)
+
+C %o0-%o3 are used for loop invariants T0-T3
+define(<KEY>, <%o4>)
+define(<ROUND>, <%o5>)
+
+C %g1, %g2, %g3 are TMP1, TMP2 and TMP3
+
+C I'm still slightly confused by the frame layout, specified in
+C "SYSTEM V APPLICATION BINARY INTERFACE SPARC Processor Supplement".
+C However, Sun's cc generates a 104 byte stack frame for a function
+C with no local variables, so that should be good enough for us too.
+
+C The sparc32 stack frame looks like
+C
+C %fp - 4: OS-dependent link field
+C %fp - 8: OS-dependent link field
+C %fp - 104: OS register save area
+define(<FRAME_SIZE>, 104)
+
+ .file "aes-encrypt-internal.asm"
+
+ C _aes_encrypt(struct aes_context *ctx,
+ C const struct aes_table *T,
+ C unsigned length, uint8_t *dst,
+ C uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(_nettle_aes_encrypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+
+ C Loop invariants
+ add T, AES_TABLE0, T0
+ add T, AES_TABLE1, T1
+ add T, AES_TABLE2, T2
+ add T, AES_TABLE3, T3
+
+.Lblock_loop:
+ C Read src, and add initial subkey
+ add CTX, AES_KEYS, KEY
+ AES_LOAD(0, SRC, KEY, W0)
+ AES_LOAD(1, SRC, KEY, W1)
+ AES_LOAD(2, SRC, KEY, W2)
+ AES_LOAD(3, SRC, KEY, W3)
+
+ C Must be even, and includes the final round
+ ld [AES_NROUNDS + CTX], ROUND
+ add SRC, 16, SRC
+ add KEY, 16, KEY
+
+ srl ROUND, 1, ROUND
+ C Last two rounds handled specially
+ sub ROUND, 1, ROUND
+.Lround_loop:
+ C The AES_ROUND macro uses T0,... T3
+ C Transform W -> X
+ AES_ROUND(0, W0, W1, W2, W3, KEY, X0)
+ AES_ROUND(1, W1, W2, W3, W0, KEY, X1)
+ AES_ROUND(2, W2, W3, W0, W1, KEY, X2)
+ AES_ROUND(3, W3, W0, W1, W2, KEY, X3)
+
+ C Transform X -> W
+ AES_ROUND(4, X0, X1, X2, X3, KEY, W0)
+ AES_ROUND(5, X1, X2, X3, X0, KEY, W1)
+ AES_ROUND(6, X2, X3, X0, X1, KEY, W2)
+ AES_ROUND(7, X3, X0, X1, X2, KEY, W3)
+
+ subcc ROUND, 1, ROUND
+ bne .Lround_loop
+ add KEY, 32, KEY
+
+ C Penultimate round
+ AES_ROUND(0, W0, W1, W2, W3, KEY, X0)
+ AES_ROUND(1, W1, W2, W3, W0, KEY, X1)
+ AES_ROUND(2, W2, W3, W0, W1, KEY, X2)
+ AES_ROUND(3, W3, W0, W1, W2, KEY, X3)
+
+ add KEY, 16, KEY
+ C Final round
+ AES_FINAL_ROUND(0, T, X0, X1, X2, X3, KEY, DST)
+ AES_FINAL_ROUND(1, T, X1, X2, X3, X0, KEY, DST)
+ AES_FINAL_ROUND(2, T, X2, X3, X0, X1, KEY, DST)
+ AES_FINAL_ROUND(3, T, X3, X0, X1, X2, KEY, DST)
+
+ subcc LENGTH, 16, LENGTH
+ bne .Lblock_loop
+ add DST, 16, DST
+
+.Lend:
+ ret
+ restore
+EPILOGUE(_nettle_aes_encrypt)
+
+C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for AES 128
+
+C 1: nettle-1.13 C-code
+C 2: nettle-1.13 assembler
+C 3: New C-code
+C 4: New assembler, first correct version
+C 5: New assembler, with basic scheduling of AES_ROUND.
+C 6: New assembler, with loop invariants T0-T3.
+C 7: New assembler, with basic scheduling also of AES_FINAL_ROUND.
+
+C MB/s cycles/block Code size (bytes)
+C 1 1.2 1107 592
+C 2 2.3 572 1032
+C 3 2.1 627
+C 4 1.8 722
+C 5 2.6 496
+C 6 3.0 437
+C 7 3.1 415 1448
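Each AES_ROUND invocation above computes one output word from four table lookups, one per input byte, plus a subkey word; the srl/and pairs in the macro (srl by 6, then and with 0x3fc, and so on) form the same byte indices pre-scaled to 4-byte table offsets. A hedged C sketch of the per-word computation — round_word and the t0-t3 parameters are illustrative stand-ins for the tables T0-T3 point at:

#include <stdint.h>

/* One word of a table-driven AES round: the tables carry S-box plus
   MixColumns, and picking successive bytes from a, b, c, d
   implements ShiftRows (encrypt rotation shown). */
static uint32_t
round_word(const uint32_t *t0, const uint32_t *t1,
           const uint32_t *t2, const uint32_t *t3,
           uint32_t a, uint32_t b, uint32_t c, uint32_t d,
           uint32_t subkey)
{
  return t0[a & 0xff]
    ^ t1[(b >> 8) & 0xff]
    ^ t2[(c >> 16) & 0xff]
    ^ t3[d >> 24]
    ^ subkey;
}

Under this reading, AES_ROUND(0, W0, W1, W2, W3, KEY, X0) corresponds to X0 = round_word(T0, T1, T2, T3, W0, W1, W2, W3, key[0]); the decrypt variant only rotates the b, c, d arguments the other way.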
diff --git a/sparc32/aes.m4 b/sparc32/aes.m4
new file mode 100644
index 00000000..05f465e0
--- /dev/null
+++ b/sparc32/aes.m4
@@ -0,0 +1,83 @@
+C Used as temporaries by the AES macros
+define(<TMP1>, <%g1>)
+define(<TMP2>, <%g2>)
+define(<TMP3>, <%g3>)
+
+C Loop invariants used by AES_ROUND
+define(<T0>, <%o0>)
+define(<T1>, <%o1>)
+define(<T2>, <%o2>)
+define(<T3>, <%o3>)
+
+C AES_LOAD(i, src, key, res)
+define(<AES_LOAD>, <
+ ldub [$2 + 4*$1], $4
+ ldub [$2 + 4*$1 + 1], TMP1
+ ldub [$2 + 4*$1 + 2], TMP2
+ sll TMP1, 8, TMP1
+
+ or $4, TMP1, $4
+ ldub [$2 + 4*$1+3], TMP1
+ sll TMP2, 16, TMP2
+ or $4, TMP2, $4
+
+ sll TMP1, 24, TMP1
+ C Get subkey
+ ld [$3 + 4*$1], TMP2
+ or $4, TMP1, $4
+ xor $4, TMP2, $4>)dnl
+
+C AES_ROUND(i, a, b, c, d, key, res)
+C Computes one word of the AES round
+C FIXME: Could use registers pointing directly to the four tables
+C FIXME: Needs better instruction scheduling, and perhaps more temporaries
+C Alternatively, we can use a single table and some rotations
+define(<AES_ROUND>, <
+ and $2, 0xff, TMP1 C 0
+ srl $3, 6, TMP2 C 1
+ sll TMP1, 2, TMP1 C 0
+ and TMP2, 0x3fc, TMP2 C 1
+ ld [T0 + TMP1], $7 C 0 E0
+ srl $4, 14, TMP1 C 2
+ ld [T1 + TMP2], TMP2 C 1
+ and TMP1, 0x3fc, TMP1 C 2
+ xor $7, TMP2, $7 C 1 E1
+ srl $5, 22, TMP2 C 3
+ ld [T2 + TMP1], TMP1 C 2
+ and TMP2, 0x3fc, TMP2 C 3
+ xor $7, TMP1, $7 C 2 E2
+ ld [$6 + 4*$1], TMP1 C 4
+ ld [T3 + TMP2], TMP2 C 3
+ xor $7, TMP1, $7 C 4 E4
+ xor $7, TMP2, $7 C 3 E3
+>)dnl
+
+C AES_FINAL_ROUND(i, T, a, b, c, d, key, dst)
+C Compute one word in the final round function. Output is converted to
+C octets and stored at dst. Relies on AES_SBOX being zero.
+define(<AES_FINAL_ROUND>, <
+ C Load subkey
+ ld [$7 + 4*$1], TMP3
+
+ and $3, 0xff, TMP1 C 0
+ srl $4, 8, TMP2 C 1
+ ldub [T + TMP1], TMP1 C 0
+ and TMP2, 0xff, TMP2 C 1
+ xor TMP3, TMP1, TMP1 C 0
+ ldub [T + TMP2], TMP2 C 1
+ stb TMP1, [$8 + 4*$1] C 0 E0
+ srl $5, 16, TMP1 C 2
+ srl TMP3, 8, TMP3 C 1
+ and TMP1, 0xff, TMP1 C 2
+ xor TMP3, TMP2, TMP2 C 1
+ ldub [T + TMP1], TMP1 C 2
+ stb TMP2, [$8 + 4*$1 + 1] C 1 E1
+ srl $6, 24, TMP2 C 3
+ srl TMP3, 8, TMP3 C 2
+ ldub [T + TMP2], TMP2 C 3
+ xor TMP3, TMP1, TMP1 C 2
+ srl TMP3, 8, TMP3 C 3
+ stb TMP1, [$8 + 4*$1 + 2] C 2 E2
+ xor TMP3, TMP2, TMP2 C 3
+ stb TMP2, [$8 + 4*$1 + 3] C 3 E3
+>)
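A matching C sketch of one AES_FINAL_ROUND expansion: the last round uses plain S-box lookups (no MixColumns), folds the subkey in one byte at a time — mirroring how the macro shifts TMP3 right by 8 between stores — and writes individual octets, so dst needs no alignment. Here sbox stands for the byte table at offset AES_SBOX (zero, as the comment above notes) from T; the function name is illustrative:

#include <stdint.h>

/* Sketch of AES_FINAL_ROUND(i, T, a, b, c, d, key, dst) for one
   output word, stored as four octets at dst. */
static void
final_round_word(const uint8_t *sbox, uint32_t a, uint32_t b,
                 uint32_t c, uint32_t d, uint32_t k, uint8_t *dst)
{
  dst[0] = sbox[a & 0xff] ^ (k & 0xff);
  dst[1] = sbox[(b >> 8) & 0xff] ^ ((k >> 8) & 0xff);
  dst[2] = sbox[(c >> 16) & 0xff] ^ ((k >> 16) & 0xff);
  dst[3] = sbox[d >> 24] ^ (k >> 24);
}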
diff --git a/sparc32/arcfour-crypt.asm b/sparc32/arcfour-crypt.asm
new file mode 100644
index 00000000..4d8dac94
--- /dev/null
+++ b/sparc32/arcfour-crypt.asm
@@ -0,0 +1,230 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C Define to YES to enable the more complex code that special-cases
+C SRC and DST with compatible alignment.
+
+define(<WITH_ALIGN>, <YES>)
+
+C Registers
+
+define(<CTX>, <%i0>)
+define(<LENGTH>,<%i1>)
+define(<DST>, <%i2>)
+define(<SRC>, <%i3>)
+
+define(<I1>, <%i4>)
+define(<I2>, <%i5>)
+define(<J>, <%g1>)
+define(<SI>, <%g2>)
+define(<SJ>, <%g3>)
+define(<TMP>, <%o0>)
+define(<TMP2>, <%o1>)
+define(<N>, <%o2>)
+define(<DATA>, <%o3>)
+
+C Computes the next byte of the key stream. As input, i must
+C already point to the index for the current access; the index
+C for the next access is stored in ni. The resulting key byte is
+C stored in res.
+C ARCFOUR_BYTE(i, ni, res)
+define(<ARCFOUR_BYTE>, <
+ ldub [CTX + $1], SI
+ add $1, 1, $2
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ and $2, 0xff, $2
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ stb SJ, [CTX + $1]
+ ldub [CTX + SI], $3
+>)dnl
+
+C FIXME: Consider using the caller's window
+define(<FRAME_SIZE>, 104)
+
+ .file "arcfour-crypt.asm"
+
+ C arcfour_crypt(struct arcfour_ctx *ctx,
+ C unsigned length, uint8_t *dst,
+ C const uint8_t *src)
+
+ .section ".text"
+ .align 16
+ .proc 020
+
+PROLOGUE(nettle_arcfour_crypt)
+
+ save %sp, -FRAME_SIZE, %sp
+ cmp LENGTH, 0
+ be .Lend
+ nop
+
+ C Load both I and J
+ lduh [CTX + ARCFOUR_I], I1
+ and I1, 0xff, J
+ srl I1, 8, I1
+
+ C We want an even address for DST
+ andcc DST, 1, %g0
+ add I1, 1, I1
+ beq .Laligned2
+ and I1, 0xff, I1
+
+ mov I1, I2
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I2, I1, TMP)
+ subcc LENGTH, 1, LENGTH
+ add SRC, 1, SRC
+ xor DATA, TMP, DATA
+ stb DATA, [DST]
+ beq .Ldone
+ add DST, 1, DST
+
+.Laligned2:
+
+ cmp LENGTH, 2
+ blu .Lfinal1
+ C Harmless delay slot instruction
+ andcc DST, 2, %g0
+ beq .Laligned4
+ nop
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ add SRC, 2, SRC
+ xor DATA, TMP, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ subcc LENGTH, 2, LENGTH
+ or DATA, TMP, DATA
+
+ sth DATA, [DST]
+ beq .Ldone
+ add DST, 2, DST
+
+.Laligned4:
+ cmp LENGTH, 4
+ blu .Lfinal2
+ C Harmless delay slot instruction
+ srl LENGTH, 2, N
+
+.Loop:
+ C Main loop, with aligned writes
+
+ C FIXME: Could check if SRC is aligned, and
+ C use 32-bit reads in that case.
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ xor TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ ldub [SRC + 2], TMP2
+ or TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I1, I2, TMP)
+ xor TMP2, TMP, TMP
+ ldub [SRC + 3], TMP2
+ or TMP, DATA, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ or TMP, DATA, DATA
+ subcc N, 1, N
+ add SRC, 4, SRC
+ st DATA, [DST]
+ bne .Loop
+ add DST, 4, DST
+
+ andcc LENGTH, 3, LENGTH
+ beq .Ldone
+ nop
+
+.Lfinal2:
+ C DST address must be 2-aligned
+ cmp LENGTH, 2
+ blu .Lfinal1
+ nop
+
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I1, I2, TMP)
+ ldub [SRC + 1], TMP2
+ add SRC, 2, SRC
+ xor DATA, TMP, DATA
+ sll DATA, 8, DATA
+
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor TMP2, TMP, TMP
+ or DATA, TMP, DATA
+
+ sth DATA, [DST]
+ beq .Ldone
+ add DST, 2, DST
+
+.Lfinal1:
+ mov I1, I2
+ ldub [SRC], DATA
+ ARCFOUR_BYTE(I2, I1, TMP)
+ xor DATA, TMP, DATA
+ stb DATA, [DST]
+
+.Ldone:
+ C Save back I and J
+ sll I2, 8, I2
+ or I2, J, I2
+ sth I2, [CTX + ARCFOUR_I]
+
+.Lend:
+ ret
+ restore
+
+EPILOGUE(nettle_arcfour_crypt)
+
+C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for arcfour
+
+C 1: nettle-1.13 C-code
+C 2: First working version of the assembler code
+C 3: Moved load of source byte
+C 4: Better instruction scheduling
+C 5: Special case SRC and DST with compatible alignment
+C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
+C 7: Unrolled only twice, with byte-accesses
+C 8: Unrolled, using 8-bit reads and aligned 32-bit writes.
+
+C MB/s cycles/byte Code size (bytes)
+C 1: 6.6 12.4 132
+C 2: 5.6 14.5 116
+C 3: 6.0 13.5 116
+C 4: 6.5 12.4 116
+C 5: 7.9 10.4 496
+C 6: 8.3 9.7 496
+C 7: 6.7 12.1 268
+C 8: 8.3 9.8 768
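For reference, ARCFOUR_BYTE implements the standard RC4 keystream step. A C sketch under that reading (arcfour_next is an illustrative name; the asm hoists the increment of i into the caller so the index is ready one access early). The ordering is the point of bugfix 6 in the stats above: both swap stores must land before S[si + sj] is read, because si + sj may equal i or j:

#include <stdint.h>

/* One RC4 keystream byte. S is the 256-byte permutation in
   struct arcfour_ctx; i and j are the indices kept in I1/I2 and J. */
static uint8_t
arcfour_next(uint8_t S[256], uint8_t *i, uint8_t *j)
{
  uint8_t si, sj;

  *i = (uint8_t)(*i + 1);
  si = S[*i];
  *j = (uint8_t)(*j + si);
  sj = S[*j];
  S[*j] = si;                   /* swap S[i] and S[j] */
  S[*i] = sj;
  return S[(uint8_t)(si + sj)]; /* keystream byte */
}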
diff --git a/sparc32/machine.m4 b/sparc32/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/sparc32/machine.m4