summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Niels Möller <nisse@lysator.liu.se> 2005-10-19 09:46:54 +0200
committer Niels Möller <nisse@lysator.liu.se> 2005-10-19 09:46:54 +0200
commit 1128d5a6195290466e157665cd113b05aa485cdd (patch)
tree 416ecb334a3002ef8f3befc27ecbcf57cea47037
parent 5865594aaa3e4e331ffe80b038d31ba7f1f4e820 (diff)
download nettle-1128d5a6195290466e157665cd113b05aa485cdd.tar.gz
* sparc/arcfour-crypt.asm: Special unrolled code if SRC and DST
have compatible alignment. Improves performance by 20%, but I'm not sure it's worth the extra complexity. Rev: src/nettle/sparc/arcfour-crypt.asm:1.5
-rw-r--r-- sparc/arcfour-crypt.asm 161
1 files changed, 140 insertions, 21 deletions
diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm
index beadd91e..0dd9e363 100644
--- a/sparc/arcfour-crypt.asm
+++ b/sparc/arcfour-crypt.asm
@@ -18,7 +18,12 @@ C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
C MA 02111-1307, USA.
-C Registers
+C Define to YES to enable the more complex code that special-cases
+C SRC and DST with compatible alignment.
+
+define(<WITH_ALIGN>, <NO>)
+
+C Registers
define(<CTX>, <%i0>)
define(<LENGTH>,<%i1>)
@@ -30,9 +35,103 @@ define(<J>, <%i5>)
define(<SI>, <%g1>)
define(<SJ>, <%g2>)
define(<TMP>, <%g3>)
+define(<N>, <%o0>)
+define(<WORD>, <%o1>)
-C FIXME: Consider using the callers window
+C Encrypts n bytes (n must be nonzero), one byte at a time.
+C ARCFOUR_BYTE_LOOP(n, label)
+define(<ARCFOUR_BYTE_LOOP>, <
+$2:
+ add I, 1, I
+ and I, 0xff, I
+ ldub [CTX + I], SI
+ subcc $1,1,$1
+ ldub [SRC], TMP
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ add SRC, 1, SRC
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ ldub [CTX + SI], SI
+ stb SJ, [CTX + I]
+ xor TMP, SI, TMP
+ stb TMP, [DST]
+ bne $2
+ add DST, 1, DST
+>)dnl
+
+C Encrypts 4n bytes (n must be nonzero), four at a time. Requires
+C proper (word) alignment of SRC and DST.
+C ARCFOUR_WORD_LOOP(n, label)
+define(<ARCFOUR_WORD_LOOP>, <
+$2:
+ add I, 1, I
+ and I, 0xff, I
+ ldub [CTX + I], SI
+ ld [SRC], WORD
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ ldub [CTX + SI], TMP
+ stb SJ, [CTX + I]
+ add I, 1, I
+ and I, 0xff, I
+ ldub [CTX + I], SI
+ add SRC, 4, SRC
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ ldub [CTX + SI], SI
+ sll TMP, 8, TMP
+ stb SJ, [CTX + I]
+ or TMP, SI, TMP
+
+ add I, 1, I
+ and I, 0xff, I
+ ldub [CTX + I], SI
+ subcc $1, 1, $1
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ ldub [CTX + SI], SI
+ sll TMP, 8, TMP
+ stb SJ, [CTX + I]
+ or TMP, SI, TMP
+
+ add I, 1, I
+ and I, 0xff, I
+ ldub [CTX + I], SI
+ C empty slot
+ add J, SI, J
+ and J, 0xff, J
+ ldub [CTX + J], SJ
+ stb SI, [CTX + J]
+ add SI, SJ, SI
+ and SI, 0xff, SI
+ ldub [CTX + SI], SI
+ sll TMP, 8, TMP
+ stb SJ, [CTX + I]
+ or TMP, SI, TMP
+ xor WORD, TMP, WORD
+ st WORD, [DST]
+
+ bne $2
+ add DST, 4, DST
+>)dnl
+
+C FIXME: Consider using the caller's window
define(<FRAME_SIZE>, 104)
.file "arcfour-crypt.asm"
@@ -56,26 +155,44 @@ PROLOGUE(nettle_arcfour_crypt)
and I, 0xff, J
srl I, 8, I
-.Loop:
- add I, 1, I
- and I, 0xff, I
- ldub [CTX + I], SI
- subcc LENGTH,1,LENGTH
- ldub [SRC], TMP
- add J, SI, J
- and J, 0xff, J
- ldub [CTX + J], SJ
- add SRC, 1, SRC
- stb SI, [CTX + J]
- add SI, SJ, SI
- and SI, 0xff, SI
- ldub [CTX + SI], SI
- stb SJ, [CTX + I]
- xor TMP, SI, TMP
- stb TMP, [DST]
- bne .Loop
- add DST, 1, DST
+ifelse(WITH_ALIGN, YES, <
+ C Check if SRC and DST have compatible alignment
+ xor SRC, DST, TMP
+ andcc TMP, 3, TMP
+
+ bne .Lrest
+ nop
+
+ andcc DST, 3, N
+ bz .Laligned
+ nop
+
+ sub N, 4, N
+ neg N
+ cmp N, LENGTH
+ bgeu .Lrest
+ nop
+
+ sub LENGTH, N, LENGTH
+
+ ARCFOUR_BYTE_LOOP(N, .Lunalignedloop)
+
+.Laligned:
+ srl LENGTH, 2, N
+ cmp N, 0
+ be .Lrest
+ nop
+
+ ARCFOUR_WORD_LOOP(N, .Lalignedloop)
+
+ andcc LENGTH, 3, LENGTH
+ bz .Ldone
+ nop
+>)
+.Lrest:
+ ARCFOUR_BYTE_LOOP(LENGTH, .Loop)
+.Ldone:
C Save back I and J
sll I, 8, I
or I, J, I
@@ -93,9 +210,11 @@ C 1: nettle-1.13 C-code
C 2: First working version of the assembler code
C 3: Moved load of source byte
C 4: Better instruction scheduling
+C 5: Special case SRC and DST with compatible alignment
C MB/s cycles/byte Code size (bytes)
C 1: 6.6 12.4 132
C 2: 5.6 14.5 116
C 3: 6.0 13.5 116
C 4: 6.5 12.4 116
+C 5: 7.9 10.4 496