path: root/security/nss/lib/freebl/mpi/mpi_amd64_gas.s
author    julien.pierre.bugs%sun.com <devnull@localhost>  2005-02-25 04:30:11 +0000
committer julien.pierre.bugs%sun.com <devnull@localhost>  2005-02-25 04:30:11 +0000
commit    1d3bc8d5e15d59f6e97b0a2cb9efc94a2470acc6 (patch)
tree      53c2c736a07f6aa9409908da3e7e9105211774a0 /security/nss/lib/freebl/mpi/mpi_amd64_gas.s
parent    c6bfac456009704606b42fb7c6e33b0b69017555 (diff)
download  nss-hg-1d3bc8d5e15d59f6e97b0a2cb9efc94a2470acc6.tar.gz
Fix for bug 272327. AMD64 assembly optimization for bignum multiply. r=nelson
Diffstat (limited to 'security/nss/lib/freebl/mpi/mpi_amd64_gas.s')
-rw-r--r--  security/nss/lib/freebl/mpi/mpi_amd64_gas.s  418
1 file changed, 418 insertions(+), 0 deletions(-)
diff --git a/security/nss/lib/freebl/mpi/mpi_amd64_gas.s b/security/nss/lib/freebl/mpi/mpi_amd64_gas.s
new file mode 100644
index 000000000..7515ac20a
--- /dev/null
+++ b/security/nss/lib/freebl/mpi/mpi_amd64_gas.s
@@ -0,0 +1,418 @@
+# ***** BEGIN LICENSE BLOCK *****
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is the Solaris software cryptographic token.
+#
+# The Initial Developer of the Original Code is
+# Sun Microsystems, Inc.
+# Portions created by the Initial Developer are Copyright (C) 2005
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Sun Microsystems, Inc.
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ***** END LICENSE BLOCK *****
+
+
+# ------------------------------------------------------------------------
+#
+# Implementation of s_mpv_mul_set_vec which exploits
+# the 64X64->128 bit unsigned multiply instruction.
+#
+# ------------------------------------------------------------------------
+
+# r = a * digit, r and a are vectors of length len
+# returns the carry digit
+# r and a are 64 bit aligned.
+#
+# uint64_t
+# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
+#
+
+.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
+
+ xorq %rax, %rax # if (len == 0) return (0)
+ testq %rdx, %rdx
+ jz .L17
+
+ movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
+ xorq %r9, %r9 # cy = 0
+
+.L15:
+	cmpq	$8, %r8			# len - 8 (jump below if len < 8)
+ jb .L16
+ movq 0(%rsi), %rax # rax = a[0]
+ movq 8(%rsi), %r11 # prefetch a[1]
+ mulq %rcx # p = a[0] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 0(%rdi) # r[0] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 16(%rsi), %r11 # prefetch a[2]
+ mulq %rcx # p = a[1] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 8(%rdi) # r[1] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 24(%rsi), %r11 # prefetch a[3]
+ mulq %rcx # p = a[2] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 16(%rdi) # r[2] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 32(%rsi), %r11 # prefetch a[4]
+ mulq %rcx # p = a[3] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 24(%rdi) # r[3] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 40(%rsi), %r11 # prefetch a[5]
+ mulq %rcx # p = a[4] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 32(%rdi) # r[4] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 48(%rsi), %r11 # prefetch a[6]
+ mulq %rcx # p = a[5] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 40(%rdi) # r[5] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 56(%rsi), %r11 # prefetch a[7]
+ mulq %rcx # p = a[6] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 48(%rdi) # r[6] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ mulq %rcx # p = a[7] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 56(%rdi) # r[7] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ addq $64, %rsi
+ addq $64, %rdi
+ subq $8, %r8
+
+ jz .L17
+ jmp .L15
+
+.L16:
+ movq 0(%rsi), %rax
+ mulq %rcx # p = a[0] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 0(%rdi) # r[0] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 8(%rsi), %rax
+ mulq %rcx # p = a[1] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 8(%rdi) # r[1] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 16(%rsi), %rax
+ mulq %rcx # p = a[2] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 16(%rdi) # r[2] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 24(%rsi), %rax
+ mulq %rcx # p = a[3] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 24(%rdi) # r[3] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 32(%rsi), %rax
+ mulq %rcx # p = a[4] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 32(%rdi) # r[4] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 40(%rsi), %rax
+ mulq %rcx # p = a[5] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 40(%rdi) # r[5] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+ movq 48(%rsi), %rax
+ mulq %rcx # p = a[6] * digit
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 48(%rdi) # r[6] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L17
+
+
+.L17:
+ movq %r9, %rax
+ ret
+
+.size	s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
+
+# ------------------------------------------------------------------------
+#
+# Implementation of s_mpv_mul_add_vec which exploits
+# the 64X64->128 bit unsigned multiply instruction.
+#
+# ------------------------------------------------------------------------
+
+# r += a * digit, r and a are vectors of length len
+# returns the carry digit
+# r and a are 64 bit aligned.
+#
+# uint64_t
+# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
+#
+
+.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
+
+ xorq %rax, %rax # if (len == 0) return (0)
+ testq %rdx, %rdx
+ jz .L27
+
+ movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
+ xorq %r9, %r9 # cy = 0
+
+.L25:
+	cmpq	$8, %r8			# len - 8 (jump below if len < 8)
+ jb .L26
+ movq 0(%rsi), %rax # rax = a[0]
+ movq 0(%rdi), %r10 # r10 = r[0]
+ movq 8(%rsi), %r11 # prefetch a[1]
+ mulq %rcx # p = a[0] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[0]
+ movq 8(%rdi), %r10 # prefetch r[1]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 0(%rdi) # r[0] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 16(%rsi), %r11 # prefetch a[2]
+ mulq %rcx # p = a[1] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[1]
+ movq 16(%rdi), %r10 # prefetch r[2]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 8(%rdi) # r[1] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 24(%rsi), %r11 # prefetch a[3]
+ mulq %rcx # p = a[2] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[2]
+ movq 24(%rdi), %r10 # prefetch r[3]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 16(%rdi) # r[2] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 32(%rsi), %r11 # prefetch a[4]
+ mulq %rcx # p = a[3] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[3]
+ movq 32(%rdi), %r10 # prefetch r[4]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 24(%rdi) # r[3] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 40(%rsi), %r11 # prefetch a[5]
+ mulq %rcx # p = a[4] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[4]
+ movq 40(%rdi), %r10 # prefetch r[5]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 32(%rdi) # r[4] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 48(%rsi), %r11 # prefetch a[6]
+ mulq %rcx # p = a[5] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[5]
+ movq 48(%rdi), %r10 # prefetch r[6]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 40(%rdi) # r[5] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ movq 56(%rsi), %r11 # prefetch a[7]
+ mulq %rcx # p = a[6] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[6]
+ movq 56(%rdi), %r10 # prefetch r[7]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 48(%rdi) # r[6] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ movq %r11, %rax
+ mulq %rcx # p = a[7] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[7]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 56(%rdi) # r[7] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+
+ addq $64, %rsi
+ addq $64, %rdi
+ subq $8, %r8
+
+ jz .L27
+ jmp .L25
+
+.L26:
+ movq 0(%rsi), %rax
+ movq 0(%rdi), %r10
+ mulq %rcx # p = a[0] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[0]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 0(%rdi) # r[0] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 8(%rsi), %rax
+ movq 8(%rdi), %r10
+ mulq %rcx # p = a[1] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[1]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 8(%rdi) # r[1] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 16(%rsi), %rax
+ movq 16(%rdi), %r10
+ mulq %rcx # p = a[2] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[2]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 16(%rdi) # r[2] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 24(%rsi), %rax
+ movq 24(%rdi), %r10
+ mulq %rcx # p = a[3] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[3]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 24(%rdi) # r[3] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 32(%rsi), %rax
+ movq 32(%rdi), %r10
+ mulq %rcx # p = a[4] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[4]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 32(%rdi) # r[4] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 40(%rsi), %rax
+ movq 40(%rdi), %r10
+ mulq %rcx # p = a[5] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[5]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 40(%rdi) # r[5] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+ movq 48(%rsi), %rax
+ movq 48(%rdi), %r10
+ mulq %rcx # p = a[6] * digit
+ addq %r10, %rax
+ adcq $0, %rdx # p += r[6]
+ addq %r9, %rax
+ adcq $0, %rdx # p += cy
+ movq %rax, 48(%rdi) # r[6] = lo(p)
+ movq %rdx, %r9 # cy = hi(p)
+ decq %r8
+ jz .L27
+
+
+.L27:
+ movq %r9, %rax
+ ret
+
+.size	s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
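
For reference, the 8-way unrolled s_mpv_mul_set_vec64 loop in the diff above is equivalent to the following minimal C sketch. This is not part of the commit; it assumes a compiler that provides the unsigned __int128 extension to stand in for the 64x64 -> 128-bit mulq instruction, and the _ref suffix is illustrative naming only.

#include <stdint.h>

/* Hypothetical reference model of s_mpv_mul_set_vec64:
 * r = a * digit for vectors of length len, returning the carry digit. */
uint64_t
s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
    uint64_t cy = 0;                                   /* cy = 0 */
    for (int i = 0; i < len; i++) {
        unsigned __int128 p =
            (unsigned __int128)a[i] * digit + cy;      /* p = a[i] * digit + cy */
        r[i] = (uint64_t)p;                            /* r[i] = lo(p) */
        cy = (uint64_t)(p >> 64);                      /* cy = hi(p) */
    }
    return cy;                                         /* final carry digit */
}

The assembly processes this loop eight words at a time, preloading the next a[i] into %r11 while the current mulq is in flight, and the .L16 tail handles the remaining len % 8 words one at a time.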
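The s_mpv_mul_add_vec64 variant differs only in folding the existing r[i] into the product before the carry, which is why each unrolled step carries a second addq/adcq pair. A matching sketch under the same unsigned __int128 assumption (again illustrative naming, not part of the commit):

#include <stdint.h>

/* Hypothetical reference model of s_mpv_mul_add_vec64:
 * r += a * digit, returning the carry digit. Note that
 * a[i]*digit + r[i] + cy <= (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1,
 * so the 128-bit accumulator cannot overflow. */
uint64_t
s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
    uint64_t cy = 0;                                   /* cy = 0 */
    for (int i = 0; i < len; i++) {
        unsigned __int128 p =
            (unsigned __int128)a[i] * digit + r[i] + cy;
        r[i] = (uint64_t)p;                            /* r[i] = lo(p) */
        cy = (uint64_t)(p >> 64);                      /* cy = hi(p) */
    }
    return cy;                                         /* final carry digit */
}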