author     cvs2svn <cvs2svn>    2004-06-17 20:28:29 +0000
committer  cvs2svn <cvs2svn>    2004-06-17 20:28:29 +0000
commit     07e88f6269b97909ab3964e3a0f33f3cd8b0674e (patch)
tree       ad350d2fb19ae171f3a2bfd17481ee077d2be27f
parent     603804732a927ce77cc59fa5a09b6ac55bfe1817 (diff)
parent     f18ea6cae92d0529de521b9c0a79f2846e1b4c29 (diff)
download   openssl-new-07e88f6269b97909ab3964e3a0f33f3cd8b0674e.tar.gz

This commit was manufactured by cvs2svn to create branch 'BRANCH_VMS_64BIT'.
-rw-r--r--  VMS/VMSify-conf.pl              |   34
-rw-r--r--  certs/eng1.pem                  |   23
-rw-r--r--  certs/eng2.pem                  |   23
-rw-r--r--  certs/eng3.pem                  |   34
-rw-r--r--  certs/eng4.pem                  |   23
-rw-r--r--  certs/eng5.pem                  |   23
-rw-r--r--  crypto/bn/asm/ppc.pl            | 2081
-rw-r--r--  crypto/evp/e_old.c              |  114
-rw-r--r--  crypto/sha/asm/sha512-sse2.pl   |  391
-rw-r--r--  crypto/sha/sha256.c             |  309
-rw-r--r--  crypto/sha/sha256t.c            |  130
-rw-r--r--  crypto/sha/sha512.c             |  478
-rw-r--r--  crypto/sha/sha512t.c            |  168
-rw-r--r--  doc/crypto/OPENSSL_Applink.pod  |   21
-rw-r--r--  ms/applink.c                    |   45
-rw-r--r--  ms/uplink.c                     |  168
-rw-r--r--  ms/uplink.h                     |   14
-rwxr-xr-x  ms/uplink.pl                    |  177

18 files changed, 4256 insertions(+), 0 deletions(-)
diff --git a/VMS/VMSify-conf.pl b/VMS/VMSify-conf.pl
new file mode 100644
index 0000000000..d3be6a29e7
--- /dev/null
+++ b/VMS/VMSify-conf.pl
@@ -0,0 +1,34 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+
+my @directory_vars = ( "dir", "certs", "crl_dir", "new_certs_dir" );
+my @file_vars = ( "database", "certificate", "serial", "crlnumber",
+ "crl", "private_key", "RANDFILE" );
+while(<STDIN>) {
+ chomp;
+ foreach my $d (@directory_vars) {
+ if (/^(\s*\#?\s*${d}\s*=\s*)\.\/([^\s\#]*)([\s\#].*)$/) {
+ $_ = "$1sys\\\$disk:\[.$2$3";
+ } elsif (/^(\s*\#?\s*${d}\s*=\s*)(\w[^\s\#]*)([\s\#].*)$/) {
+ $_ = "$1sys\\\$disk:\[.$2$3";
+ }
+ s/^(\s*\#?\s*${d}\s*=\s*\$\w+)\/([^\s\#]*)([\s\#].*)$/$1.$2\]$3/;
+ while(/^(\s*\#?\s*${d}\s*=\s*(\$\w+\.|sys\\\$disk:\[\.)[\w\.]+)\/([^\]]*)\](.*)$/) {
+ $_ = "$1.$3]$4";
+ }
+ }
+ foreach my $f (@file_vars) {
+ s/^(\s*\#?\s*${f}\s*=\s*)\.\/(.*)$/$1sys\\\$disk:\[\/$2/;
+ while(/^(\s*\#?\s*${f}\s*=\s*(\$\w+|sys\\\$disk:\[)[^\/]*)\/(\w+\/[^\s\#]*)([\s\#].*)$/) {
+ $_ = "$1.$3$4";
+ }
+ if (/^(\s*\#?\s*${f}\s*=\s*(\$\w+|sys\\\$disk:\[)[^\/]*)\/(\w+)([\s\#].*)$/) {
+ $_ = "$1]$3.$4";
+ } elsif (/^(\s*\#?\s*${f}\s*=\s*(\$\w+|sys\\\$disk:\[)[^\/]*)\/([^\s\#]*)([\s\#].*)$/) {
+ $_ = "$1]$3$4";
+ }
+ }
+ print $_,"\n";
+}
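
VMSify-conf.pl is a stdin-to-stdout filter: it rewrites the Unix-style paths of the well-known CA directory and file variables in an OpenSSL config file into VMS path syntax. As a usage sketch (the input and output file names are illustrative, not taken from the build rules):

    perl VMS/VMSify-conf.pl < apps/openssl.cnf > openssl-vms.cnf

Tracing the substitutions above, a typical config line such as

    database = ./index.txt # database index file

would come out as

    database = sys\$disk:[]index.txt # database index file
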
diff --git a/certs/eng1.pem b/certs/eng1.pem
new file mode 100644
index 0000000000..7ed8b1b5e6
--- /dev/null
+++ b/certs/eng1.pem
@@ -0,0 +1,23 @@
+-----BEGIN CERTIFICATE-----
+MIID3TCCAsWgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBqDELMAkGA1UEBhMCQ0Ex
+CzAJBgNVBAgTAk9OMRAwDgYDVQQHEwdUb3JvbnRvMRgwFgYDVQQKEw9CYW5rRW5n
+aW5lIEluYy4xKTAnBgNVBAsTIENlcnRpZmljYXRpb24gQXV0aG9yaXR5IERpdmlz
+aW9uMRMwEQYDVQQDEwpiYW5rZW5naW5lMSAwHgYJKoZIhvcNAQkBFhFjYUBiYW5r
+ZW5naW5lLmNvbTAeFw05ODAxMDEwMDAwMDBaFw0zODAxMTcwMDAwMDBaMIGoMQsw
+CQYDVQQGEwJDQTELMAkGA1UECBMCT04xEDAOBgNVBAcTB1Rvcm9udG8xGDAWBgNV
+BAoTD0JhbmtFbmdpbmUgSW5jLjEpMCcGA1UECxMgQ2VydGlmaWNhdGlvbiBBdXRo
+b3JpdHkgRGl2aXNpb24xEzARBgNVBAMTCmJhbmtlbmdpbmUxIDAeBgkqhkiG9w0B
+CQEWEWNhQGJhbmtlbmdpbmUuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB
+CgKCAQEA14LoTUAl1/hEy+Kh1kLHiBdW2zD3V4IhM7xxTVKsYsIH56nr69ATTIxU
+P36eRzeZ137qt1AxHFjDCidk3m1Ul6l59ProPexdslLLM2npM3f2cteg+toyiYiS
+EJKjyzIu1xF1j9qzGkymSY/4DsXLZNk9FaczxMk/Ooc6Os1M3AverL4VG4rYIb6f
+eR32cIKJ9Q1fGuyKk7ipq1XQfPW8a8TgZdbHbe7U9Gk3iasGMHHvpR9Ep3mGbgdT
+uQ98SBEuIwe1BUCGg/MXpVy48MNXfAMotBgGw4pl9yqSjMni2FB+E9Q9DHFs2RgX
+MqzKuo8zcPxKx2kZ6Arj8+27dw2clQIDAQABoxAwDjAMBgNVHRMEBTADAQH/MA0G
+CSqGSIb3DQEBBQUAA4IBAQBauupHX9EhpC/r57d6b5kkeWvognxIP9//TO4iw3qb
+zIXEkPXmJmwVzlzoKJWqiya+aw19SP0+G6CzsFOBo/9ehmz+hZ8bhYX4MjlWzX5u
+Tnkhz172j9fOBUmrTVPkcRIs6zjCD5PQAGoBPP1/Zdy2N36lZ0U7lg07Opirj/yJ
+PSJeM2j0fwIFAroiVckvdT0BVwB6S/cPaAQGPghbbr1YGSmYrMriSv825ILJUfxz
+rJYunGR9FiY9Ob7+jwJwiZMS4CxSPktutxr/3hOvr1+ALS7IcVakhhA3PuZAJbdH
+FRclR9qMM8aBnBZmf+Uv3K3uhT+UBzzY654U9Yi1JYnA
+-----END CERTIFICATE-----
diff --git a/certs/eng2.pem b/certs/eng2.pem
new file mode 100644
index 0000000000..73066309b1
--- /dev/null
+++ b/certs/eng2.pem
@@ -0,0 +1,23 @@
+-----BEGIN CERTIFICATE-----
+MIID3TCCAsWgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBqDELMAkGA1UEBhMCQ0Ex
+CzAJBgNVBAgTAk9OMRAwDgYDVQQHEwdUb3JvbnRvMRgwFgYDVQQKEw9DZXJ0RW5n
+aW5lIEluYy4xKTAnBgNVBAsTIENlcnRpZmljYXRpb24gQXV0aG9yaXR5IERpdmlz
+aW9uMRMwEQYDVQQDEwpjZXJ0ZW5naW5lMSAwHgYJKoZIhvcNAQkBFhFjYUBjZXJ0
+ZW5naW5lLmNvbTAeFw05ODAxMDEwMDAwMDBaFw0zODAxMTcwMDAwMDBaMIGoMQsw
+CQYDVQQGEwJDQTELMAkGA1UECBMCT04xEDAOBgNVBAcTB1Rvcm9udG8xGDAWBgNV
+BAoTD0NlcnRFbmdpbmUgSW5jLjEpMCcGA1UECxMgQ2VydGlmaWNhdGlvbiBBdXRo
+b3JpdHkgRGl2aXNpb24xEzARBgNVBAMTCmNlcnRlbmdpbmUxIDAeBgkqhkiG9w0B
+CQEWEWNhQGNlcnRlbmdpbmUuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB
+CgKCAQEA7aTXURShaeVt9u/dP3Q2dVib3jTCZvEyc6yfpGgaYWewXWuP4HOSfI4h
+GZblbpl+dzJc6RjhR+pguIRtbT5FJB8SJGjRqoujBEOQOxtVtc2fjM9Dqh0iOvMW
+WS6buxHG55GVrHAQaO5HXEScKQBa9ZyNmpSXPTEBrDMej1OAGOkc524/TZrgFPF4
+AiJLLkxCcP8NuzUKlW3WzNMSSoCtjkUKy4wjSLlAWCFM0T9Df6/+Z8ZUQTzHoKCD
+ncH5Qnynd7DlOwKQ2JwwxRhYGiGVTUN0GUq7qA11kW3+vnbFesKQXoF6o2PVx9s2
+YXviI2NXXUjZ0pVnsnFCc45Pm8XojwIDAQABoxAwDjAMBgNVHRMEBTADAQH/MA0G
+CSqGSIb3DQEBBQUAA4IBAQBP/aHOKJ00Akzc9HWM1X30hlWZFBaQi4pqD4Uhk8+p
+KzzwFP5DRLBOz8TYBbtdXrS6hxVMr2sqWmhVkuyepWhHZazKGyHY/y0FbOXsewAV
+1QxxSyx7ve89pCKv4/w0rQcP916iHc8Y/TCpmz7eITa3GId+8H/XTaBi8GBp9X9O
+w8m25FmEB1NT+eJwefvfdKowjy4tSorKdW/eJspxNuTSRGmUy8G71W5dYvgpAlx6
+mdnHyzxEGvRYNNI2bS0ifXgbEFNWqSas9q34ea5KOpkJu8T/KyXfSb6rPOsBSb0t
+wMowwGtCVH2C4Lw/8zo0EjhMpTOsPaub408PrZ+NQ2bl
+-----END CERTIFICATE-----
diff --git a/certs/eng3.pem b/certs/eng3.pem
new file mode 100644
index 0000000000..28bcce2dfd
--- /dev/null
+++ b/certs/eng3.pem
@@ -0,0 +1,34 @@
+-----BEGIN CERTIFICATE-----
+MIIF3TCCA8WgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBqDELMAkGA1UEBhMCQ0Ex
+CzAJBgNVBAgTAk9OMRAwDgYDVQQHEwdUb3JvbnRvMRgwFgYDVQQKEw9Gb3J0RW5n
+aW5lIEluYy4xKTAnBgNVBAsTIENlcnRpZmljYXRpb24gQXV0aG9yaXR5IERpdmlz
+aW9uMRMwEQYDVQQDEwpmb3J0ZW5naW5lMSAwHgYJKoZIhvcNAQkBFhFjYUBmb3J0
+ZW5naW5lLmNvbTAeFw05ODAxMDEwMDAwMDBaFw0zODAxMTcwMDAwMDBaMIGoMQsw
+CQYDVQQGEwJDQTELMAkGA1UECBMCT04xEDAOBgNVBAcTB1Rvcm9udG8xGDAWBgNV
+BAoTD0ZvcnRFbmdpbmUgSW5jLjEpMCcGA1UECxMgQ2VydGlmaWNhdGlvbiBBdXRo
+b3JpdHkgRGl2aXNpb24xEzARBgNVBAMTCmZvcnRlbmdpbmUxIDAeBgkqhkiG9w0B
+CQEWEWNhQGZvcnRlbmdpbmUuY29tMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIIC
+CgKCAgEAyr7GbpwDxx1v3EYbo0gcO+ligEhlDqG2e7u/AbWGoVAqc8+q6auUJUtz
+4i7oh0yNadu1o9kpXW+znkgO0zlrgjGskqqMO1ooppzTJdFy/P8gR6x1Iuv3kWtX
+OuzwPPEjv09LWlhyJsN+oU4ztTVf07I0Q9zYupcoDQ58XKRheI9KdDB2DYSmxywA
+WSLQwIeG0Qa7gvokeQlpkgkEC7viEecJ3752KXBJHnh7As51mxnlpmG6sDy67Eli
+HDw5tHETRqbtnscGBjskGQBqR5xt7+QnnthZrN8HJHDoa9zgGephwizhkL44lXLF
+YK9W5XhFbblw2c+mAcHkokRiwD7CPeIoyD2a/Jcw3n5hegKTlNhd4BFGVF6JR7gF
+OFk2QfHXit5uthsij9Xhl7WAgQUqLgggD9MphqPf4nY66OZUJV9ZsmB+Qfp8UizB
+0WAOegactKVyRqHtRa+KIEXQXNtZgjcmMk9CYkP0nIbKtgKXaH6+9VMHNOryCnFE
+7pSsuPUkypncFWCHGSeiFO3w4w4J4csltxBADQzxfRu5KZnlToQN7bVpI/Q31tVX
+E5bjrJcq6Oj/OTqZ3ID+OqbkUdAg0ggjRKcTgxnLHd/AbMzJ6PsclDDf7cLs0WSl
+xMxQR/z5bNST1rNtT9rsiv2TOhfvCBxO9AOjBioO8PLO032HTNECAwEAAaMQMA4w
+DAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQUFAAOCAgEAVyBpPWfT2VOyvVpslGKx
+8h0+CWP8cilygGRtZJ5dAJzc//1REAHdvK+TgZ4Foz3dqHhXI+RNN0FpzuWaYMjW
+ZTS0kAmcOQuGY1Oo4PGlPHI21pNz29oFDTJr0ZmLBJ4JKVsE2soJg55jdk9MZHA7
+K//7HH9RsmrWZOE5DZDlrxp6+naixhMwnlPKKisIy9GNZUPqGdUWABMdB/BUVVNl
+NU5TtWpIXUClMd8a+eoKcItBeYXowkHOBpinPkDX3clFDIUfWiw0Ro08s8SrrFqR
+8Szwbrj52Xv1RM56oGqCjnkvJctxihODV7NcpxoAFjIZokDom0q6zPrrTUsLFQov
+Plovc3w5hmALiDMshaTvE1nm3Psn4yQ+FlRE8epTZrQiIGypZkZC6lcz0mYawueW
+cThYWGFhVG4ktQzOjjNRsNxopW+W7cF1zQTxiWUDnxIKSj7gtdQ2jiubxEEhfVag
+r8DMtAccNVTZVURpGi56TptOOuotrTqqC+2GviW4hlxvdvmuQN0OlXlUwzz2Trxc
+FamNnuA54lZw/8arLtxsFmHrcnPw53+1spumLD0S5UkxHNu40h6LIVpZz3H+0rLz
+uFofTfiyMjcfK2AyHQTgUCbsrvgNuLDQUbyFGVchdFUkhztX3DhEVnxnnrpY4BVj
+QdTqWIvw7lGlSuDCjxEQAOc=
+-----END CERTIFICATE-----
diff --git a/certs/eng4.pem b/certs/eng4.pem
new file mode 100644
index 0000000000..9a7b156226
--- /dev/null
+++ b/certs/eng4.pem
@@ -0,0 +1,23 @@
+-----BEGIN CERTIFICATE-----
+MIID3TCCAsWgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBqDELMAkGA1UEBhMCQ0Ex
+CzAJBgNVBAgTAk9OMRAwDgYDVQQHEwdUb3JvbnRvMRgwFgYDVQQKEw9NYWlsRW5n
+aW5lIEluYy4xKTAnBgNVBAsTIENlcnRpZmljYXRpb24gQXV0aG9yaXR5IERpdmlz
+aW9uMRMwEQYDVQQDEwptYWlsZW5naW5lMSAwHgYJKoZIhvcNAQkBFhFjYUBtYWls
+ZW5naW5lLmNvbTAeFw05ODAxMDEwMDAwMDBaFw0zODAxMTcwMDAwMDBaMIGoMQsw
+CQYDVQQGEwJDQTELMAkGA1UECBMCT04xEDAOBgNVBAcTB1Rvcm9udG8xGDAWBgNV
+BAoTD01haWxFbmdpbmUgSW5jLjEpMCcGA1UECxMgQ2VydGlmaWNhdGlvbiBBdXRo
+b3JpdHkgRGl2aXNpb24xEzARBgNVBAMTCm1haWxlbmdpbmUxIDAeBgkqhkiG9w0B
+CQEWEWNhQG1haWxlbmdpbmUuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB
+CgKCAQEAqXmfsU+lx+NFmn6tN17RTOyaddHqLnr/3rzEDIyT9TN+tF9TG7jmK7lJ
+Jrj5arQ3nTFaLF8JuND2U1z/cLPw6/TX+1tE3v3CNUDSjaisyUDiUyp3TE8hMMMz
+zfZQn0JsGgNhhWxqyzjhRQGtKL4+xtn8VsF/8zGgZYke7nlmVKz/FslDFTnNoodL
+BAEGiu9JQS9qqpbSs20NdZ6LXPL2A4iTjnsNFBW3jIMVIn/JVVyaycU7ue2oFviD
+vLNpkVZcR7A+jjIdIumOc5VSF0y7y74cQC5YwkR2mLK7UBYDK6NCY3ta/C4M8NsM
+0FpmvRl0+A1ivZtVwqI98dxDtp7HeQIDAQABoxAwDjAMBgNVHRMEBTADAQH/MA0G
+CSqGSIb3DQEBBQUAA4IBAQAjfNn5BCzxylBDakFQGWKE/P43PRibMOEzfd7+DzbY
+WIekoz3i00DwoH3b6j4gwlDJRAOq4dF6/Pt/uBOHDo/op+ef+9ErmKPd+ehXN9h3
+7QbccTgz7DtVwA4iRlDRLru+JuXzT+OsCHuFZMOLJ+KD2JAGh3W68JjdcLkrlcpt
+AU0wc5aOHPPfEBdIah8y8QtNzXRVzoBt8zzvgCARkXxTS2u/9QaXR1hML0JtDgQS
+SdZ6Kd8SN6yzqxD+buYD5sOfJmjBF/n3lqFHNMHnnGXy2TAXZtIAWzffU3A0cGPB
+N6FZ026a86HbF1X4k+xszhbJu/ikczyuWnCJIg3fTYSD
+-----END CERTIFICATE-----
diff --git a/certs/eng5.pem b/certs/eng5.pem
new file mode 100644
index 0000000000..3416ccad24
--- /dev/null
+++ b/certs/eng5.pem
@@ -0,0 +1,23 @@
+-----BEGIN CERTIFICATE-----
+MIID6TCCAtGgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBrjELMAkGA1UEBhMCQ0Ex
+CzAJBgNVBAgTAk9OMRAwDgYDVQQHEwdUb3JvbnRvMRowGAYDVQQKExFUcmFkZXJF
+bmdpbmUgSW5jLjEpMCcGA1UECxMgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgRGl2
+aXNpb24xFTATBgNVBAMTDHRyYWRlcmVuZ2luZTEiMCAGCSqGSIb3DQEJARYTY2FA
+dHJhZGVyZW5naW5lLmNvbTAeFw05ODAxMDEwMDAwMDBaFw0zODAxMTcwMDAwMDBa
+MIGuMQswCQYDVQQGEwJDQTELMAkGA1UECBMCT04xEDAOBgNVBAcTB1Rvcm9udG8x
+GjAYBgNVBAoTEVRyYWRlckVuZ2luZSBJbmMuMSkwJwYDVQQLEyBDZXJ0aWZpY2F0
+aW9uIEF1dGhvcml0eSBEaXZpc2lvbjEVMBMGA1UEAxMMdHJhZGVyZW5naW5lMSIw
+IAYJKoZIhvcNAQkBFhNjYUB0cmFkZXJlbmdpbmUuY29tMIIBIjANBgkqhkiG9w0B
+AQEFAAOCAQ8AMIIBCgKCAQEAzyX5QE+5SN+zgNn1v3zp9HmP4hQOWW8WuEVItZVP
+9bt/xj5NeJd1kyPL/SqnF2qHcL3o/74r0Ga55aKHniwKYgQTlp5ELGfQ568QQeN9
+xNIHtUXeStI9zCNZyZC+4YqObdMR/ivKA/WsLfUVMl2lV5JzJJz1BOE0gKEYiEyz
+gIq5oLzkP/mOXoHRvWSZD2D0eHYIO7ovV2epVFK7g7p+dC4QoeIUEli+GF/Myg88
+dV/qmi+Sybck2RLPXa8Nh27/ETVQ7kE1Eafmx7EyCqIhG+5lwJAy3HwHUBwAYuzj
+iuZz5lD8aQmr8SKuvy3eOH9SVN5wh3YBlrNGwTStkESVLwIDAQABoxAwDjAMBgNV
+HRMEBTADAQH/MA0GCSqGSIb3DQEBBQUAA4IBAQAWOPAUhZd3x9EQiFJcuxFTMd9q
+axgcriCzJsM6D96sYGko9xTeLhX/lr1bliVYI5AlupoLXAdMzGHJkOgaTirKjQXr
+F9nymDdUWKe3TmwGob5016nQlH7qRKvGO3hka0rOGRK2U/2JT/4Qp8iH/DFi6cyM
+uP0q8n64SAkxZXLzUuFQXqf7U/SNjzb9XJQEIAdjp7eYd3Qb4jDsDcX0FrKMF1aV
+r0dCDnS7am7WTXPYCDGdSkPgEHEtLYIYH3lZp5sKdVZ9wl4F0WNFkRWRUr7AXPjw
+50uLmUNmKCd8JZLMGA1TRNSTi7U9EcrWt0OkMWm74T2WVnAgNsDv2WrWsGfj
+-----END CERTIFICATE-----
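
The five certs/eng*.pem files are self-signed test CA certificates. The PEM blobs themselves are not human-readable, but their contents can be checked with the standard openssl x509 command, e.g.:

    openssl x509 -in certs/eng1.pem -noout -subject -issuer -dates
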
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
new file mode 100644
index 0000000000..307c7ccb35
--- /dev/null
+++ b/crypto/bn/asm/ppc.pl
@@ -0,0 +1,2081 @@
+#!/usr/bin/env perl
+#
+# Implemented as a Perl wrapper as we want to support several different
+# architectures with a single file. We pick up the target based on the
+# file name we are asked to generate.
+#
+# It should be noted though that this perl code is nothing like
+# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
+# as a pre-processor to cover for platform differences in name decoration,
+# linker tables, 32-/64-bit instruction sets...
+#
+# As you might know, there are several PowerPC ABIs in use. Most notably,
+# Linux and AIX use different 32-bit ABIs. The good news is that these
+# ABIs are similar enough to implement leaf(!) functions, which would be
+# ABI neutral. And that's what you find here: ABI-neutral leaf functions.
+# In case you wonder what that is...
+#
+# AIX performance
+#
+# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
+#
+# The following is the performance of 32-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6c 21 dec 2001
+# built on: Tue Jun 11 11:06:51 EDT 2002
+# options:bn(64,32) ...
+#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
+#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
+#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
+#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
+#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
+#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
+#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
+#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
+#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
+#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
+#
+# Number of operations increases by almost 75%.
+#
+# Here are performance numbers for 64-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6g [engine] 9 Aug 2002
+# built on: Fri Apr 18 16:59:20 EDT 2003
+# options:bn(64,64) ...
+# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
+#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
+#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
+#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
+#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
+#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
+#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
+#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
+#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
+#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
+#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
+#
+# Again, performance increases by about 75%.
+#
+# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
+# OpenSSL 0.9.7c 30 Sep 2003
+#
+# Original code.
+#
+#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
+#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
+#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
+#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
+#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
+#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
+#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
+#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
+#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
+#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
+#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
+#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
+#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
+#
+# Performance increase of ~60%
+#
+# If you have comments or suggestions to improve the code, send
+# me a note at schari@us.ibm.com
+#
+
+$opf = shift;
+
+if ($opf =~ /32\.s/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc\"";
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UDIV= "divwu"; # unsigned divide
+ $UCMPI= "cmplwi"; # unsigned compare with immediate
+ $UCMP= "cmplw"; # unsigned compare
+ $COUNTZ="cntlzw"; # count leading zeros
+ $SHL= "slw"; # shift left
+ $SHR= "srw"; # unsigned shift right
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHLI= "slwi"; # shift left by immediate
+ $CLRU= "clrlwi"; # clear upper bits
+ $INSR= "insrwi"; # insert right
+ $ROTL= "rotlwi"; # rotate left by immediate
+} elsif ($opf =~ /64\.s/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc64\"";
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UDIV= "divdu"; # unsigned divide
+ $UCMPI= "cmpldi"; # unsigned compare with immediate
+ $UCMP= "cmpld"; # unsigned compare
+ $COUNTZ="cntlzd"; # count leading zeros
+ $SHL= "sld"; # shift left
+ $SHR= "srd"; # unsigned shift right
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHLI= "sldi"; # shift left by immediate
+ $CLRU= "clrldi"; # clear upper bits
+ $INSR= "insrdi"; # insert right
+ $ROTL= "rotldi"; # rotate left by immediate
+} else { die "nonsense $opf"; }
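+# For example (illustrative names; the real ones come from the build
+# rules): an $opf of "linux_ppc32.s" selects the 32-bit mnemonics above
+# and the Linux symbol treatment below, while "aix_ppc64.s" selects the
+# 64-bit mnemonics and the AIX path. A name matching neither /32\.s/
+# nor /64\.s/ dies here.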
+
+( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+
+# function entry points from the AIX code
+#
+# There are other, more elegant, ways to handle this. We (IBM) chose
+# this approach as it plays well with scripts we run to 'namespace'
+# OpenSSL, i.e. we add a prefix to all the public symbols so we can
+# co-exist in the same process with other implementations of OpenSSL.
+# 'Cleverer' ways of doing these substitutions tend to hide information
+# we need to keep obvious.
+#
+my @items = ("bn_sqr_comba4",
+ "bn_sqr_comba8",
+ "bn_mul_comba4",
+ "bn_mul_comba8",
+ "bn_sub_words",
+ "bn_add_words",
+ "bn_div_words",
+ "bn_sqr_words",
+ "bn_mul_words",
+ "bn_mul_add_words");
+
+if ($opf =~ /linux/) { do_linux(); }
+elsif ($opf =~ /aix/) { do_aix(); }
+elsif ($opf =~ /osx/) { do_osx(); }
+else { do_bsd(); }
+
+sub do_linux {
+ $d=&data();
+
+ if ($BITS==64) {
+ foreach $t (@items) {
+ $d =~ s/\.$t:/\
+\t.section\t".opd","aw"\
+\t.align\t3\
+\t.globl\t$t\
+$t:\
+\t.quad\t.$t,.TOC.\@tocbase,0\
+\t.size\t$t,24\
+\t.previous\n\
+\t.type\t.$t,\@function\
+\t.globl\t.$t\
+.$t:/g;
+ }
+ }
+ else {
+ foreach $t (@items) {
+ $d=~s/\.$t/$t/g;
+ }
+ }
+ # hide internal labels to avoid polluting the name table...
+ $d=~s/Lppcasm_/.Lppcasm_/gm;
+ print $d;
+}
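+# (The .opd entries emitted in the 64-bit branch above are the PowerPC64
+# ELF function descriptors: each ".quad .$t,.TOC.@tocbase,0" pairs the
+# code entry point with its TOC base, plus a zero environment pointer.)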
+
+sub do_aix {
+ # AIX assembler is smart enough to please the linker without
+ # making us do something special...
+ print &data();
+}
+
+# MacOSX 32 bit
+sub do_osx {
+ $d=&data();
+ # Change the bn symbol prefix from '.' to '_'
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ # Change .machine to something OS X asm will accept
+ $d=~s/\.machine.*/.text/g;
+ $d=~s/\#/;/g; # change comment from '#' to ';'
+ print $d;
+}
+
+# BSD (Untested)
+sub do_bsd {
+ $d=&data();
+ foreach $t (@items) {
+ $d=~s/\.$t/_$t/g;
+ }
+ print $d;
+}
+
+sub data {
+ local($data)=<<EOF;
+#--------------------------------------------------------------------
+#
+#
+#
+#
+# File: ppc32.s
+#
+# Created by: Suresh Chari
+# IBM Thomas J. Watson Research Library
+# Hawthorne, NY
+#
+#
+# Description: Optimized assembly routines for OpenSSL crypto
+# on the 32-bit PowerPC platform.
+#
+#
+# Version History
+#
+# 2. Fixed bn_add, bn_sub and bn_div_words, added comments,
+# cleaned up code. Also made a single version which can
+# be used for both the AIX and Linux compilers. See NOTE
+# below.
+# 12/05/03 Suresh Chari
+# (with lots of help from) Andy Polyakov
+##
+# 1. Initial version 10/20/02 Suresh Chari
+#
+#
+# The following file works with the xlc, cc
+# and gcc compilers.
+#
+# NOTE: To get the file to link correctly with the gcc compiler
+# you have to change the names of the routines and remove
+# the first .(dot) character. This should automatically
+# be done in the build process.
+#
+# Hand optimized assembly code for the following routines
+#
+# bn_sqr_comba4
+# bn_sqr_comba8
+# bn_mul_comba4
+# bn_mul_comba8
+# bn_sub_words
+# bn_add_words
+# bn_div_words
+# bn_sqr_words
+# bn_mul_words
+# bn_mul_add_words
+#
+# NOTE: It is possible to optimize this code more for
+# specific PowerPC or Power architectures. On the Northstar
+# architecture the optimizations in this file do
+# NOT provide much improvement.
+#
+# If you have comments or suggestions to improve the code, send
+# me a note at schari\@us.ibm.com
+#
+#--------------------------------------------------------------------------
+#
+# Defines to be used in the assembly code.
+#
+.set r0,0 # we use it as storage for value of 0
+.set SP,1 # preserved
+.set RTOC,2 # preserved
+.set r3,3 # 1st argument/return value
+.set r4,4 # 2nd argument/volatile register
+.set r5,5 # 3rd argument/volatile register
+.set r6,6 # ...
+.set r7,7
+.set r8,8
+.set r9,9
+.set r10,10
+.set r11,11
+.set r12,12
+.set r13,13 # not used, nor any other "below" it...
+
+.set BO_IF_NOT,4
+.set BO_IF,12
+.set BO_dCTR_NZERO,16
+.set BO_dCTR_ZERO,18
+.set BO_ALWAYS,20
+.set CR0_LT,0;
+.set CR0_GT,1;
+.set CR0_EQ,2
+.set CR1_FX,4;
+.set CR1_FEX,5;
+.set CR1_VX,6
+.set LR,8
+
+# Declare function names to be global
+# NOTE: For gcc these names MUST be changed to remove
+# the first . i.e. for example change ".bn_sqr_comba4"
+# to "bn_sqr_comba4". This should be automatically done
+# in the build.
+
+ .globl .bn_sqr_comba4
+ .globl .bn_sqr_comba8
+ .globl .bn_mul_comba4
+ .globl .bn_mul_comba8
+ .globl .bn_sub_words
+ .globl .bn_add_words
+ .globl .bn_div_words
+ .globl .bn_sqr_words
+ .globl .bn_mul_words
+ .globl .bn_mul_add_words
+
+# .text section
+
+ .machine $ISA
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba4:
+#
+# Optimized version of bn_sqr_comba4.
+#
+# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 hold the two halves of the 32x32 -> 64-bit multiply result.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+# Here's the assembly
+#
+#
+ xor r0,r0,r0 # set r0 = 0. Used in the addze
+ # instructions below
+
+ #sqr_add_c(a,0,c1,c2,c3)
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5
+ $UMULH r10,r5,r5 #in first iteration. No need
+ #to add since c1=c2=c3=0.
+ # Note c3(r11) is NOT set to 0
+ # but will be.
+
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ # sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
+ adde r8,r8,r8
+ addze r9,r0 # catch carry if any.
+ # r9= r0(=0) and carry
+
+ addc r10,r7,r10 # now add to temp result.
+ addze r11,r8 # r8 added to r11 which is 0
+ addze r9,r9
+
+ $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+ #sqr_add_c(a,1,c3,c1,c2)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5] = c3
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba8:
+#
+# This is an optimized version of the bn_sqr_comba8 routine.
+# Tightly uses the adde instruction
+#
+#
+# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 hold the two halves of the 32x32 -> 64-bit multiply result.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+#
+# A possible optimization of loading all 8 longs of a into registers
+# doesn't provide any speedup.
+#
+
+ xor r0,r0,r0 #set r0 = 0.Used in addze
+ #instructions below.
+
+ #sqr_add_c(a,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5 #1st iteration: no carries.
+ $UMULH r10,r5,r5
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ #sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10 #add the two-register number
+ adde r11,r8,r0 # (r8,r7) to the three-register
+ addze r9,r0 # number (r9,r11,r10). NOTE: r0=0
+
+ addc r10,r7,r10 #add the two-register number
+ adde r11,r8,r11 # (r8,r7) to the three-register
+ addze r9,r9 # number (r9,r11,r10).
+
+ $ST r10,`1*$BNSZ`(r3) # r[1]=c2
+
+ #sqr_add_c(a,1,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,0,c2,c3,c1);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
+ #sqr_add_c2(a,5,0,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,4,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,4,2,c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
+ #sqr_add_c2(a,7,0,c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,6,1,c2,c3,c1);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,5,2,c2,c3,c1);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
+ #sqr_add_c(a,4,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,5,3,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,7,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
+ #sqr_add_c2(a,7,2,c1,c2,c3);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,3,c1,c2,c3);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,4,c1,c2,c3);
+ $LD r5,`4*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
+ #sqr_add_c(a,5,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,6,4,c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,7,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
+ #sqr_add_c2(a,7,4,c3,c1,c2);
+ $LD r5,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,5,c3,c1,c2);
+ $LD r5,`5*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
+ #sqr_add_c(a,6,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,7,5,c1,c2,c3)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
+
+ #sqr_add_c2(a,7,6,c2,c3,c1)
+ $LD r5,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
+ #sqr_add_c(a,7,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
+ $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
+
+
+ bclr BO_ALWAYS,CR0_LT
+
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba4:
+#
+# This is an optimized version of the bn_mul_comba4 routine.
+#
+# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 hold the two halves of the 32x32 -> 64-bit multiply result.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
+ xor r0,r0,r0 #r0=0. Used in addze below.
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r0
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6, `1*$BNSZ`(r4)
+ $LD r7, `0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r0
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3
+ #mul_add_c(a[3],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+
+ $ST r10,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r11,`7*$BNSZ`(r3) #r[7]=c2
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba8:
+#
+# Optimized version of the bn_mul_comba8 routine.
+#
+# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 hold the two halves of the 32x32 -> 64-bit multiply result.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
+ xor r0,r0,r0 #r0=0. Used in addze below.
+
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4) #a[0]
+ $LD r7,`0*$BNSZ`(r5) #b[0]
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ addze r12,r9 # since we didn't set r12 to zero before.
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
+ #mul_add_c(a[4],b[0],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[0],b[4],c2,c3,c1);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
+ #mul_add_c(a[0],b[5],c3,c1,c2);
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[4],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[4],b[1],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[5],b[0],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
+ #mul_add_c(a[6],b[0],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[5],b[1],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[4],b[2],c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[3],b[3],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[2],b[4],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[1],b[5],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[0],b[6],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
+ #mul_add_c(a[0],b[7],c2,c3,c1);
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[1],b[6],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[2],b[5],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[3],b[4],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[4],b[3],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[5],b[2],c2,c3,c1);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[6],b[1],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[7],b[0],c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
+ #mul_add_c(a[7],b[1],c3,c1,c2);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[6],b[2],c3,c1,c2);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[5],b[3],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[4],b[4],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[3],b[5],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[2],b[6],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[1],b[7],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
+ #mul_add_c(a[2],b[7],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[3],b[6],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[4],b[5],c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[5],b[4],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[6],b[3],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[7],b[2],c1,c2,c3);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
+ #mul_add_c(a[7],b[3],c2,c3,c1);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[6],b[4],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[5],b[5],c2,c3,c1);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[4],b[6],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[3],b[7],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
+ #mul_add_c(a[4],b[7],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[5],b[6],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[6],b[5],c3,c1,c2);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[7],b[4],c3,c1,c2);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
+ #mul_add_c(a[7],b[5],c1,c2,c3);
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[6],b[6],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[5],b[7],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
+ #mul_add_c(a[6],b[7],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[7],b[6],c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
+ #mul_add_c(a[7],b[7],c3,c1,c2);
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
+ $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sub_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+#
+.align 4
+.bn_sub_words:
+#
+# Handcoded version of bn_sub_words
+#
+#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = b
+# r6 = n
+#
+# Note: No loop unrolling done since this is not a
+# performance-critical loop.
+
+ xor r0,r0,r0 #set r0 = 0
+#
+# check for r6 = 0 AND set carry bit.
+#
+ subfc. r7,r0,r6 # If r6 is 0 then result is 0.
+ # if r6 > 0 then result !=0
+ # In either case carry bit is set.
+ bc BO_IF,CR0_EQ,Lppcasm_sub_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ addi r5,r5,-$BNSZ
+ mtctr r6
+Lppcasm_sub_mainloop:
+ $LDU r7,$BNSZ(r4)
+ $LDU r8,$BNSZ(r5)
+ subfe r6,r8,r7 # r6 = r7 + carry bit + ones-complement(r8);
+ # if carry = 1 this is r7-r8, else it
+ # is r7-r8-1, as we need.
+ $STU r6,$BNSZ(r3)
+ bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+Lppcasm_sub_adios:
+ subfze r3,r0 # if carry bit is set then r3 = 0 else -1
+ andi. r3,r3,1 # keep only last bit.
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+
+#
+# NOTE: The following label name should be changed to
+# "bn_add_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_add_words:
+#
+# Handcoded version of bn_add_words
+#
+#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = b
+# r6 = n
+#
+# Note: No loop unrolling done since this is not a
+# performance-critical loop.
+
+ xor r0,r0,r0
+#
+# check for r6 = 0. Is this needed?
+#
+ addic. r6,r6,0 #test r6 and clear carry bit.
+ bc BO_IF,CR0_EQ,Lppcasm_add_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ addi r5,r5,-$BNSZ
+ mtctr r6
+Lppcasm_add_mainloop:
+ $LDU r7,$BNSZ(r4)
+ $LDU r8,$BNSZ(r5)
+ adde r8,r7,r8
+ $STU r8,$BNSZ(r3)
+ bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+Lppcasm_add_adios:
+ addze r3,r0 #return carry bit.
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_div_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_div_words:
+#
+# This is a cleaned-up version of code generated by
+# the AIX compiler. The only optimization is to use
+# the PPC instruction to count leading zeros instead
+# of a call to num_bits_word. Since this was compiled
+# only at level -O2, we could possibly squeeze it more.
+#
+# r3 = h
+# r4 = l
+# r5 = d
+
+ $UCMPI 0,r5,0 # compare r5 and 0
+ bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0
+ li r3,-1 # d=0 return -1
+ bclr BO_ALWAYS,CR0_LT
+Lppcasm_div1:
+ xor r0,r0,r0 #r0=0
+ $COUNTZ r7,r5 #r7 = num leading 0s in d.
+ subfic r8,r7,$BITS #r8 = BN_num_bits_word(d)
+ cmpi 0,0,r8,$BITS #
+ bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if (r8==$BITS)
+ li r9,1 # r9=1
+ $SHL r10,r9,r8 # r10 = 1<<r8
+ $UCMP 0,r3,r10 #
+ bc BO_IF,CR0_GT,Lppcasm_div2 #or if (h > (1<<r8))
+ $UDIV r3,r3,r0 #if not assert(0) divide by 0!
+ #that's how we signal overflow
+ bclr BO_ALWAYS,CR0_LT #return. NEVER REACHED.
+Lppcasm_div2:
+ $UCMP 0,r3,r5 #h>=d?
+ bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not
+ subf r3,r5,r3 #h-=d ;
+Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
+ cmpi 0,0,r7,0 # is (i == 0)?
+ bc BO_IF,CR0_EQ,Lppcasm_div4
+ $SHL r3,r3,r7 # h = (h<< i)
+ $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
+ $SHL r5,r5,r7 # d<<=i
+ or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
+ $SHL r4,r4,r7 # l <<=i
+Lppcasm_div4:
+ $SHRI r9,r5,`$BITS/2` # r9 = dh
+ # dl will be computed when needed
+ # as it saves registers.
+ li r6,2 #r6=2
+ mtctr r6 #counter will be in count.
+Lppcasm_divouterloop:
+ $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
+ $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
+ # compute here for innerloop.
+ $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
+ bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not
+
+ li r8,-1
+ $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
+ b Lppcasm_div6
+Lppcasm_div5:
+ $UDIV r8,r3,r9 #q = h/dh
+Lppcasm_div6:
+ $UMULL r12,r9,r8 #th = q*dh
+ $CLRU r10,r5,`$BITS/2` #r10=dl
+ $UMULL r6,r8,r10 #tl = q*dl
+
+Lppcasm_divinnerloop:
+ subf r10,r12,r3 #t = h -th
+ $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
+ addic. r7,r7,0 #test if r7 == 0. used below.
+ # now want to compute
+ # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
+ # the following 2 instructions do that
+ $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
+ or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
+ $UCMP 1,r6,r7 # compare (tl <= r7)
+ bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
+ bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+ addi r8,r8,-1 #q--
+ subf r12,r9,r12 #th -=dh
+ $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
+ subf r6,r10,r6 #tl -=dl
+ b Lppcasm_divinnerloop
+Lppcasm_divinnerexit:
+ $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
+ $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
+ $UCMP 1,r4,r11 # compare l and tl
+ add r12,r12,r10 # th+=t
+ bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
+ addi r12,r12,1 # th++
+Lppcasm_div7:
+ subf r11,r11,r4 #r11=l-tl
+ $UCMP 1,r3,r12 #compare h and th
+ bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
+ addi r8,r8,-1 # q--
+ add r3,r5,r3 # h+=d
+Lppcasm_div8:
+ subf r12,r12,r3 #r12 = h-th
+ $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
+ # want to compute
+ # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
+ # the following 2 instructions will do this.
+ $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
+ $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
+ bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
+ $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
+ b Lppcasm_divouterloop
+Lppcasm_div9:
+ or r3,r8,r0
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+.align 4
+.bn_sqr_words:
+#
+# Optimized version of bn_sqr_words
+#
+# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = n
+#
+# r6 = a[i].
+# r7,r8 = product.
+#
+# No unrolling done here. Not performance critical.
+
+ addic. r5,r5,0 #test r5.
+ bc BO_IF,CR0_EQ,Lppcasm_sqr_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ mtctr r5
+Lppcasm_sqr_mainloop:
+ #sqr(r[0],r[1],a[0]);
+ $LDU r6,$BNSZ(r4)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ $STU r7,$BNSZ(r3)
+ $STU r8,$BNSZ(r3)
+ bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+Lppcasm_sqr_adios:
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_words:
+#
+# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+ xor r0,r0,r0
+ xor r12,r12,r12 # used for carry
+ rlwinm. r7,r5,30,2,31 # num >> 2
+ bc BO_IF,CR0_EQ,Lppcasm_mw_REM
+ mtctr r7
+Lppcasm_mw_LOOP:
+ #mul(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ #addze r10,r10 #carry is NOT ignored.
+ #will be taken care of
+ #in second spin below
+ #using adde.
+ $ST r9,`0*$BNSZ`(r3)
+ #mul(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ #addze r12,r12
+ $ST r11,`1*$BNSZ`(r3)
+ #mul(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ adde r9,r9,r12
+ #addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+ #mul_add(rp[3],ap[3],w,c1);
+ $LD r8,`3*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ addze r12,r12 #this spin we collect carry into
+ #r12
+ $ST r11,`3*$BNSZ`(r3)
+
+ addi r3,r3,`4*$BNSZ`
+ addi r4,r4,`4*$BNSZ`
+ bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+
+Lppcasm_mw_REM:
+ andi. r5,r5,0x3
+ bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ #mul(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`0*$BNSZ`(r3)
+ addi r12,r10,0
+
+ addi r5,r5,-1
+ cmpli 0,0,r5,0
+ bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+
+
+ #mul(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`1*$BNSZ`(r3)
+ addi r12,r10,0
+
+ addi r5,r5,-1
+ cmpli 0,0,r5,0
+ bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+
+ #mul_add(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+ addi r12,r10,0
+
+Lppcasm_mw_OVER:
+ addi r3,r12,0
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_add_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_add_words:
+#
+# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+#
+# empirical evidence suggests that the unrolled version performs best.
+#
+ xor r0,r0,r0 #r0 = 0
+ xor r12,r12,r12 #r12 = 0 . used for carry
+ rlwinm. r7,r5,30,2,31 # num >> 2
+ bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
+ mtctr r7
+Lppcasm_maw_mainloop:
+ #mul_add(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $LD r11,`0*$BNSZ`(r3)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12 #r12 is carry.
+ addze r10,r10
+ addc r9,r9,r11
+ #addze r10,r10
+ #the above instruction addze
+ #is NOT needed. Carry will NOT
+ #be ignored. It's not affected
+ #by multiply and will be collected
+ #in the next spin
+ $ST r9,`0*$BNSZ`(r3)
+
+ #mul_add(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $LD r9,`1*$BNSZ`(r3)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10 #r10 is carry.
+ addze r12,r12
+ addc r11,r11,r9
+ #addze r12,r12
+ $ST r11,`1*$BNSZ`(r3)
+
+ #mul_add(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $LD r11,`2*$BNSZ`(r3)
+ $UMULH r10,r6,r8
+ adde r9,r9,r12
+ addze r10,r10
+ addc r9,r9,r11
+ #addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+
+ #mul_add(rp[3],ap[3],w,c1);
+ $LD r8,`3*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $LD r9,`3*$BNSZ`(r3)
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ addze r12,r12
+ addc r11,r11,r9
+ addze r12,r12
+ $ST r11,`3*$BNSZ`(r3)
+ addi r3,r3,`4*$BNSZ`
+ addi r4,r4,`4*$BNSZ`
+ bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+
+Lppcasm_maw_leftover:
+ andi. r5,r5,0x3
+ bc BO_IF,CR0_EQ,Lppcasm_maw_adios
+ addi r3,r3,-$BNSZ
+ addi r4,r4,-$BNSZ
+ #mul_add(rp[0],ap[0],w,c1);
+ mtctr r5
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+ bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ #mul_add(rp[1],ap[1],w,c1);
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+ bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ #mul_add(rp[2],ap[2],w,c1);
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+Lppcasm_maw_adios:
+ addi r3,r12,0
+ bclr BO_ALWAYS,CR0_LT
+ .long 0x00000000
+ .align 4
+EOF
+ $data =~ s/\`([^\`]*)\`/eval $1/gem;
+
+ # if some assembler chokes on some simplified mnemonic,
+ # this is the spot to fix it up, e.g.:
+ # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
+ $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
+ # assembler X doesn't accept li, load immediate value
+ #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
+ return($data);
+}
diff --git a/crypto/evp/e_old.c b/crypto/evp/e_old.c
new file mode 100644
index 0000000000..66f3bf4276
--- /dev/null
+++ b/crypto/evp/e_old.c
@@ -0,0 +1,114 @@
+/* crypto/evp/e_old.c -*- mode:C; c-file-style: "eay" -*- */
+/* Written by Richard Levitte (richard@levitte.org) for the OpenSSL
+ * project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#ifdef OPENSSL_NO_DEPRECATED
+static void *dummy = &dummy;
+#else
+
+#include <openssl/evp.h>
+
+/* Define some deprecated functions, so older programs
+ don't crash and burn too quickly. On Windows and VMS,
+ these will never be used, since functions and variables
+ in shared libraries are selected by entry point location,
+ not by name. */
+
+#ifndef OPENSSL_NO_BF
+#undef EVP_bf_cfb
+const EVP_CIPHER *EVP_bf_cfb(void) { return EVP_bf_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_DES
+#undef EVP_des_cfb
+const EVP_CIPHER *EVP_des_cfb(void) { return EVP_des_cfb64(); }
+#undef EVP_des_ede3_cfb
+const EVP_CIPHER *EVP_des_ede3_cfb(void) { return EVP_des_ede3_cfb64(); }
+#undef EVP_des_ede_cfb
+const EVP_CIPHER *EVP_des_ede_cfb(void) { return EVP_des_ede_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_IDEA
+#undef EVP_idea_cfb
+const EVP_CIPHER *EVP_idea_cfb(void) { return EVP_idea_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_RC2
+#undef EVP_rc2_cfb
+const EVP_CIPHER *EVP_rc2_cfb(void) { return EVP_rc2_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_CAST5
+#undef EVP_cast5_cfb
+const EVP_CIPHER *EVP_cast5_cfb(void) { return EVP_cast5_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_RC5
+#undef EVP_rc5_32_12_16_cfb
+const EVP_CIPHER *EVP_rc5_32_12_16_cfb(void) { return EVP_rc5_32_12_16_cfb64(); }
+#endif
+
+#ifndef OPENSSL_NO_AES
+#undef EVP_aes_128_cfb
+const EVP_CIPHER *EVP_aes_128_cfb(void) { return EVP_aes_128_cfb128(); }
+#undef EVP_aes_192_cfb
+const EVP_CIPHER *EVP_aes_192_cfb(void) { return EVP_aes_192_cfb128(); }
+#undef EVP_aes_256_cfb
+const EVP_CIPHER *EVP_aes_256_cfb(void) { return EVP_aes_256_cfb128(); }
+#endif
+
+#endif
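+
+/*
+ * Usage sketch (hypothetical application fragment, not part of this
+ * file): legacy code written against the old un-suffixed names keeps
+ * compiling and linking thanks to the shims above. Key/iv setup is
+ * omitted for brevity; EVP_des_cfb() now resolves to EVP_des_cfb64().
+ *
+ *	EVP_CIPHER_CTX ctx;
+ *	EVP_CIPHER_CTX_init(&ctx);
+ *	EVP_EncryptInit(&ctx, EVP_des_cfb(), key, iv);
+ */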
diff --git a/crypto/sha/asm/sha512-sse2.pl b/crypto/sha/asm/sha512-sse2.pl
new file mode 100644
index 0000000000..797aedacd7
--- /dev/null
+++ b/crypto/sha/asm/sha512-sse2.pl
@@ -0,0 +1,391 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# SHA512_Transform_SSE2.
+#
+# As the name suggests, this is an IA-32 SSE2 implementation of
+# SHA512_Transform. The motivating factor for this effort was that
+# SHA512 was observed to *consistently* perform *significantly* worse
+# than SHA256 [a slowdown of 2x or more is common] on 32-bit platforms.
+# On 64-bit platforms, on the other hand, SHA512 tends to outperform
+# SHA256 [~50% seems to be the common improvement factor]. All this is
+# perfectly natural, as SHA512 is a 64-bit algorithm. But isn't IA-32
+# SSE2 essentially a 64-bit instruction set? Is it rich enough to
+# implement SHA512? If the answer were "no," you wouldn't be reading
+# this...
+#
+# Throughput performance in MBps (larger is better):
+#
+# 2.4GHz P4 1.4GHz AMD32 1.4GHz AMD64(*)
+# SHA256/gcc(*) 54 43 59
+# SHA512/gcc 17 23 92
+# SHA512/sse2 54(**) 55(**)
+# SHA512/icc 26 28
+# SHA256/icc(*) 65 54
+#
+# (*) AMD64 and SHA256 numbers are presented mostly for amusement or
+# reference purposes.
+# (**)	I.e. it gives a ~2-3x speed-up compared with compiler-generated
+#	code. One can argue that a hand-coded *non*-SSE2 implementation
+#	would perform better than a compiler-generated one as well, and
+#	that the comparison is therefore not exactly fair. Well, as SHA512
+#	puts enormous pressure on the IA-32 GP register bank, I reckon a
+#	hand-coded version wouldn't perform significantly better than
+#	one compiled with icc, ~20% perhaps... So this code would still
+#	outperform it by a distinguishing margin. But feel free to prove
+#	me wrong:-)
+# <appro@fy.chalmers.se>
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"sha512-sse2.pl",$ARGV[$#ARGV] eq "386");
+
+$K512="esi"; # K512[80] table, found at the end...
+#$W512="esp"; # $W512 is not just W512[16]: it comprises *two* copies
+ # of W512[16] and a copy of A-H variables...
+$W512_SZ=8*(16+16+8); # see above...
+#$Kidx="ebx"; # index in K512 table, advances from 0 to 80...
+$Widx="edx"; # index in W512, wraps around at 16...
+$data="edi"; # 16 qwords of input data...
+$A="mm0"; # B-D and
+$E="mm1"; # F-H are allocated dynamically...
+$Aoff=256+0; # A-H offsets relative to $W512...
+$Boff=256+8;
+$Coff=256+16;
+$Doff=256+24;
+$Eoff=256+32;
+$Foff=256+40;
+$Goff=256+48;
+$Hoff=256+56;
+
+sub SHA2_ROUND()
+{ local ($kidx,$widx)=@_;
+
+ # One can argue that one could reorder instructions for better
+ # performance. Well, I tried and it doesn't seem to make any
+ # noticeable difference. Modern out-of-order execution cores
+	# reorder instructions to their liking in either case, and they
+	# apparently do a decent job. So we can keep the code more
+ # readable/regular/comprehensible:-)
+
+ # I adhere to 64-bit %mmX registers in order to avoid/not care
+ # about #GP exceptions on misaligned 128-bit access, most
+ # notably in paddq with memory operand. Not to mention that
+	# SSE2 instructions operating on %mmX can be scheduled every
+ # cycle [and not every second one if operating on %xmmN].
+
+ &movq ("mm4",&QWP($Foff,$W512)); # load f
+ &movq ("mm5",&QWP($Goff,$W512)); # load g
+ &movq ("mm6",&QWP($Hoff,$W512)); # load h
+ &movq (&QWP($Foff,$W512),$E); # f = e
+ &movq (&QWP($Goff,$W512),"mm4"); # g = f
+ &movq (&QWP($Hoff,$W512),"mm5"); # h = g
+
+ &movq ("mm2",$E); # %mm2 is sliding right
+ &movq ("mm3",$E); # %mm3 is sliding left
+ &psrlq ("mm2",14);
+ &psllq ("mm3",23);
+ &movq ("mm7","mm2"); # %mm7 is T1
+ &pxor ("mm7","mm3");
+ &psrlq ("mm2",4);
+ &psllq ("mm3",23);
+ &pxor ("mm7","mm2");
+ &pxor ("mm7","mm3");
+ &psrlq ("mm2",23);
+ &psllq ("mm3",4);
+ &pxor ("mm7","mm2");
+ &pxor ("mm7","mm3"); # T1=Sigma1_512(e)
+
+ &pxor ("mm4","mm5"); # f^=g
+ &pand ("mm4",$E); # f&=e
+ &pxor ("mm4","mm5"); # f^=g
+ &paddq ("mm7","mm4"); # T1+=Ch(e,f,g)
+
+ &movq ("mm2",&QWP($Boff,$W512)); # load b
+ &movq ("mm3",&QWP($Coff,$W512)); # load c
+ &movq ($E,&QWP($Doff,$W512)); # e = d
+ &movq (&QWP($Boff,$W512),$A); # b = a
+ &movq (&QWP($Coff,$W512),"mm2"); # c = b
+ &movq (&QWP($Doff,$W512),"mm3"); # d = c
+
+ &paddq ("mm7","mm6"); # T1+=h
+ &paddq ("mm7",&QWP(0,$K512,$kidx,8)); # T1+=K512[i]
+ &paddq ("mm7",&QWP(0,$W512,$widx,8)); # T1+=W512[i]
+ &paddq ($E,"mm7"); # e += T1
+
+ &movq ("mm4",$A); # %mm4 is sliding right
+ &movq ("mm5",$A); # %mm5 is sliding left
+ &psrlq ("mm4",28);
+ &psllq ("mm5",25);
+ &movq ("mm6","mm4"); # %mm6 is T2
+ &pxor ("mm6","mm5");
+ &psrlq ("mm4",6);
+ &psllq ("mm5",5);
+ &pxor ("mm6","mm4");
+ &pxor ("mm6","mm5");
+ &psrlq ("mm4",5);
+ &psllq ("mm5",6);
+ &pxor ("mm6","mm4");
+ &pxor ("mm6","mm5"); # T2=Sigma0_512(a)
+
+ &movq ("mm4","mm2"); # %mm4=b
+ &pand ("mm2",$A); # b&=a
+ &pand ("mm4","mm3"); # %mm4&=c
+ &pand ("mm3",$A); # c&=a
+ &pxor ("mm4","mm2"); # %mm4^=b&a
+ &pxor ("mm4","mm3"); # %mm4^=c&a
+ &paddq ("mm6","mm4"); # T2+=Maj(a,b,c)
+
+ &movq ($A,"mm7"); # a=T1
+ &paddq ($A,"mm6"); # a+=T2
+}
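+
+# For reference, the shift pairs above implement 64-bit right-rotations
+# cumulatively; a hedged C sketch of the equivalent computation (ROTR64
+# is a name introduced here purely for illustration):
+#
+#	#define ROTR64(x,n) (((x)>>(n)) | ((x)<<(64-(n))))
+#	/* Sigma1_512(e) = ROTR64(e,14) ^ ROTR64(e,18) ^ ROTR64(e,41) */
+#	/* Sigma0_512(a) = ROTR64(a,28) ^ ROTR64(a,34) ^ ROTR64(a,39) */
+#
+# The right-shift counts accumulate as 14, 14+4=18, 18+23=41, and the
+# left-shift counts as 23=64-41, 23+23=46=64-18, 46+4=50=64-14, so each
+# psrlq/psllq pair contributes one rotation term.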
+
+$func="sha512_block_sse2";
+
+&function_begin_B($func);
+ if (0) {# Caller is expected to check if it's appropriate to
+		# call this routine. The 3 lines below are retained
+		# for debugging purposes...
+ &picmeup("eax","OPENSSL_ia32cap");
+ &bt (&DWP(0,"eax"),26);
+ &jnc ("SHA512_Transform");
+ }
+
+ &push ("ebp");
+ &mov ("ebp","esp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+
+ &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
+ &mov ($data,&DWP(12,"ebp")); # input data, 2nd arg
+ &call (&label("pic_point")); # make it PIC!
+&set_label("pic_point");
+ &blindpop($K512);
+ &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
+
+ $W512 = "esp"; # start using %esp as W512
+ &sub ($W512,$W512_SZ);
+ &and ($W512,-16); # ensure 128-bit alignment
+
+ # make private copy of A-H
+ # v assume the worst and stick to unaligned load
+ &movdqu ("xmm0",&QWP(0,$Widx));
+ &movdqu ("xmm1",&QWP(16,$Widx));
+ &movdqu ("xmm2",&QWP(32,$Widx));
+ &movdqu ("xmm3",&QWP(48,$Widx));
+
+&align(8);
+&set_label("_chunk_loop");
+
+ &movdqa (&QWP($Aoff,$W512),"xmm0"); # a,b
+ &movdqa (&QWP($Coff,$W512),"xmm1"); # c,d
+ &movdqa (&QWP($Eoff,$W512),"xmm2"); # e,f
+ &movdqa (&QWP($Goff,$W512),"xmm3"); # g,h
+
+ &xor ($Widx,$Widx);
+
+ &movdq2q($A,"xmm0"); # load a
+ &movdq2q($E,"xmm2"); # load e
+
+	# Why aren't loops unrolled? It makes sense to unroll if
+	# execution time for the loop body is comparable with branch
+	# penalties and/or if the whole data-set resides in the register
+	# bank. Neither is the case here... Well, it would be possible
+	# to eliminate a few store operations, but it would hardly affect
+	# the stop-watch performance, so to say, as there are a lot of
+	# available memory slots to fill. It would only relieve some
+	# pressure off the memory bus...
+
+&align(8);
+&set_label("_1st_loop"); # 0-15
+ # flip input stream byte order...
+ &mov ("eax",&DWP(0,$data,$Widx,8));
+ &mov ("ebx",&DWP(4,$data,$Widx,8));
+ &bswap ("eax");
+ &bswap ("ebx");
+ &mov (&DWP(0,$W512,$Widx,8),"ebx"); # W512[i]
+ &mov (&DWP(4,$W512,$Widx,8),"eax");
+ &mov (&DWP(128+0,$W512,$Widx,8),"ebx"); # copy of W512[i]
+ &mov (&DWP(128+4,$W512,$Widx,8),"eax");
+
+ &SHA2_ROUND($Widx,$Widx); &inc($Widx);
+
+&cmp	($Widx,16);
+&jl (&label("_1st_loop"));
+
+ $Kidx = "ebx"; # start using %ebx as Kidx
+ &mov ($Kidx,$Widx);
+
+&align(8);
+&set_label("_2nd_loop"); # 16-79
+ &and($Widx,0xf);
+
+ # 128-bit fragment! I update W512[i] and W512[i+1] in
+ # parallel:-) Note that I refer to W512[(i&0xf)+N] and not to
+ # W512[(i+N)&0xf]! This is exactly what I maintain the second
+ # copy of W512[16] for...
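+	#
+	# Illustration: with idx = i&0xf, the loads at W512[idx+1],
+	# W512[idx+9] and W512[idx+14] each fetch two consecutive
+	# qwords, so the highest element touched is idx+15 <= 30, which
+	# the duplicate copy at W512[16..31] always covers.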
+ &movdqu ("xmm0",&QWP(8*1,$W512,$Widx,8)); # s0=W512[i+1]
+ &movdqa ("xmm2","xmm0"); # %xmm2 is sliding right
+ &movdqa ("xmm3","xmm0"); # %xmm3 is sliding left
+ &psrlq ("xmm2",1);
+ &psllq ("xmm3",56);
+ &movdqa ("xmm0","xmm2");
+ &pxor ("xmm0","xmm3");
+ &psrlq ("xmm2",6);
+ &psllq ("xmm3",7);
+ &pxor ("xmm0","xmm2");
+ &pxor ("xmm0","xmm3");
+ &psrlq ("xmm2",1);
+ &pxor ("xmm0","xmm2"); # s0 = sigma0_512(s0);
+
+ &movdqa ("xmm1",&QWP(8*14,$W512,$Widx,8)); # s1=W512[i+14]
+ &movdqa ("xmm4","xmm1"); # %xmm4 is sliding right
+ &movdqa ("xmm5","xmm1"); # %xmm5 is sliding left
+ &psrlq ("xmm4",6);
+ &psllq ("xmm5",3);
+ &movdqa ("xmm1","xmm4");
+ &pxor ("xmm1","xmm5");
+ &psrlq ("xmm4",13);
+ &psllq ("xmm5",42);
+ &pxor ("xmm1","xmm4");
+ &pxor ("xmm1","xmm5");
+ &psrlq ("xmm4",42);
+ &pxor ("xmm1","xmm4"); # s1 = sigma1_512(s1);
+
+	# + have to explicitly load W512[i+9] as it's not 128-bit
+ # v aligned and paddq would throw an exception...
+ &movdqu ("xmm6",&QWP(8*9,$W512,$Widx,8));
+ &paddq ("xmm0","xmm1"); # s0 += s1
+ &paddq ("xmm0","xmm6"); # s0 += W512[i+9]
+ &paddq ("xmm0",&QWP(0,$W512,$Widx,8)); # s0 += W512[i]
+
+ &movdqa (&QWP(0,$W512,$Widx,8),"xmm0"); # W512[i] = s0
+ &movdqa (&QWP(16*8,$W512,$Widx,8),"xmm0"); # copy of W512[i]
+
+ # as the above fragment was 128-bit, we "owe" 2 rounds...
+ &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
+ &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
+
+&cmp ($Kidx,80);
+&jl (&label("_2nd_loop"));
+
+ # update A-H state
+ &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
+ &movq (&QWP($Aoff,$W512),$A); # write out a
+ &movq (&QWP($Eoff,$W512),$E); # write out e
+ &movdqu ("xmm0",&QWP(0,$Widx));
+ &movdqu ("xmm1",&QWP(16,$Widx));
+ &movdqu ("xmm2",&QWP(32,$Widx));
+ &movdqu ("xmm3",&QWP(48,$Widx));
+ &paddq ("xmm0",&QWP($Aoff,$W512)); # 128-bit additions...
+ &paddq ("xmm1",&QWP($Coff,$W512));
+ &paddq ("xmm2",&QWP($Eoff,$W512));
+ &paddq ("xmm3",&QWP($Goff,$W512));
+ &movdqu (&QWP(0,$Widx),"xmm0");
+ &movdqu (&QWP(16,$Widx),"xmm1");
+ &movdqu (&QWP(32,$Widx),"xmm2");
+ &movdqu (&QWP(48,$Widx),"xmm3");
+
+&add ($data,16*8); # advance input data pointer
+&dec (&DWP(16,"ebp")); # decrement 3rd arg
+&jnz (&label("_chunk_loop"));
+
+ # epilogue
+ &emms (); # required for at least ELF and Win32 ABIs
+ &mov ("edi",&DWP(-12,"ebp"));
+ &mov ("esi",&DWP(-8,"ebp"));
+ &mov ("ebx",&DWP(-4,"ebp"));
+ &leave ();
+&ret ();
+
+&align(16);
+&set_label("K512"); # Yes! I keep it in the code segment!
+ &data_word(0xd728ae22,0x428a2f98); # u64
+ &data_word(0x23ef65cd,0x71374491); # u64
+ &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
+ &data_word(0x8189dbbc,0xe9b5dba5); # u64
+ &data_word(0xf348b538,0x3956c25b); # u64
+ &data_word(0xb605d019,0x59f111f1); # u64
+ &data_word(0xaf194f9b,0x923f82a4); # u64
+ &data_word(0xda6d8118,0xab1c5ed5); # u64
+ &data_word(0xa3030242,0xd807aa98); # u64
+ &data_word(0x45706fbe,0x12835b01); # u64
+ &data_word(0x4ee4b28c,0x243185be); # u64
+ &data_word(0xd5ffb4e2,0x550c7dc3); # u64
+ &data_word(0xf27b896f,0x72be5d74); # u64
+ &data_word(0x3b1696b1,0x80deb1fe); # u64
+ &data_word(0x25c71235,0x9bdc06a7); # u64
+ &data_word(0xcf692694,0xc19bf174); # u64
+ &data_word(0x9ef14ad2,0xe49b69c1); # u64
+ &data_word(0x384f25e3,0xefbe4786); # u64
+ &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
+ &data_word(0x77ac9c65,0x240ca1cc); # u64
+ &data_word(0x592b0275,0x2de92c6f); # u64
+ &data_word(0x6ea6e483,0x4a7484aa); # u64
+ &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
+ &data_word(0x831153b5,0x76f988da); # u64
+ &data_word(0xee66dfab,0x983e5152); # u64
+ &data_word(0x2db43210,0xa831c66d); # u64
+ &data_word(0x98fb213f,0xb00327c8); # u64
+ &data_word(0xbeef0ee4,0xbf597fc7); # u64
+ &data_word(0x3da88fc2,0xc6e00bf3); # u64
+ &data_word(0x930aa725,0xd5a79147); # u64
+ &data_word(0xe003826f,0x06ca6351); # u64
+ &data_word(0x0a0e6e70,0x14292967); # u64
+ &data_word(0x46d22ffc,0x27b70a85); # u64
+ &data_word(0x5c26c926,0x2e1b2138); # u64
+ &data_word(0x5ac42aed,0x4d2c6dfc); # u64
+ &data_word(0x9d95b3df,0x53380d13); # u64
+ &data_word(0x8baf63de,0x650a7354); # u64
+ &data_word(0x3c77b2a8,0x766a0abb); # u64
+ &data_word(0x47edaee6,0x81c2c92e); # u64
+ &data_word(0x1482353b,0x92722c85); # u64
+ &data_word(0x4cf10364,0xa2bfe8a1); # u64
+ &data_word(0xbc423001,0xa81a664b); # u64
+ &data_word(0xd0f89791,0xc24b8b70); # u64
+ &data_word(0x0654be30,0xc76c51a3); # u64
+ &data_word(0xd6ef5218,0xd192e819); # u64
+ &data_word(0x5565a910,0xd6990624); # u64
+ &data_word(0x5771202a,0xf40e3585); # u64
+ &data_word(0x32bbd1b8,0x106aa070); # u64
+ &data_word(0xb8d2d0c8,0x19a4c116); # u64
+ &data_word(0x5141ab53,0x1e376c08); # u64
+ &data_word(0xdf8eeb99,0x2748774c); # u64
+ &data_word(0xe19b48a8,0x34b0bcb5); # u64
+ &data_word(0xc5c95a63,0x391c0cb3); # u64
+ &data_word(0xe3418acb,0x4ed8aa4a); # u64
+ &data_word(0x7763e373,0x5b9cca4f); # u64
+ &data_word(0xd6b2b8a3,0x682e6ff3); # u64
+ &data_word(0x5defb2fc,0x748f82ee); # u64
+ &data_word(0x43172f60,0x78a5636f); # u64
+ &data_word(0xa1f0ab72,0x84c87814); # u64
+ &data_word(0x1a6439ec,0x8cc70208); # u64
+ &data_word(0x23631e28,0x90befffa); # u64
+ &data_word(0xde82bde9,0xa4506ceb); # u64
+ &data_word(0xb2c67915,0xbef9a3f7); # u64
+ &data_word(0xe372532b,0xc67178f2); # u64
+ &data_word(0xea26619c,0xca273ece); # u64
+ &data_word(0x21c0c207,0xd186b8c7); # u64
+ &data_word(0xcde0eb1e,0xeada7dd6); # u64
+ &data_word(0xee6ed178,0xf57d4f7f); # u64
+ &data_word(0x72176fba,0x06f067aa); # u64
+ &data_word(0xa2c898a6,0x0a637dc5); # u64
+ &data_word(0xbef90dae,0x113f9804); # u64
+ &data_word(0x131c471b,0x1b710b35); # u64
+ &data_word(0x23047d84,0x28db77f5); # u64
+ &data_word(0x40c72493,0x32caab7b); # u64
+ &data_word(0x15c9bebc,0x3c9ebe0a); # u64
+ &data_word(0x9c100d4c,0x431d67c4); # u64
+ &data_word(0xcb3e42b6,0x4cc5d4be); # u64
+ &data_word(0xfc657e2a,0x597f299c); # u64
+ &data_word(0x3ad6faec,0x5fcb6fab); # u64
+ &data_word(0x4a475817,0x6c44198c); # u64
+
+&function_end_B($func);
+
+&asm_finish();
diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c
new file mode 100644
index 0000000000..2f4078f444
--- /dev/null
+++ b/crypto/sha/sha256.c
@@ -0,0 +1,309 @@
+/* crypto/sha/sha256.c */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved
+ * according to the OpenSSL license [found in ../../LICENSE].
+ * ====================================================================
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include <openssl/opensslconf.h>
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/opensslv.h>
+
+const char *SHA256_version="SHA-256" OPENSSL_VERSION_PTEXT;
+
+int SHA224_Init (SHA256_CTX *c)
+ {
+ c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
+ c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
+ c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
+ c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
+ c->Nl=0; c->Nh=0;
+ c->num=0; c->md_len=SHA224_DIGEST_LENGTH;
+ return 1;
+ }
+
+int SHA256_Init (SHA256_CTX *c)
+ {
+ c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
+ c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
+ c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
+ c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
+ c->Nl=0; c->Nh=0;
+ c->num=0; c->md_len=SHA256_DIGEST_LENGTH;
+ return 1;
+ }
+
+unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
+ {
+ SHA256_CTX c;
+ static unsigned char m[SHA224_DIGEST_LENGTH];
+
+ if (md == NULL) md=m;
+ SHA224_Init(&c);
+ SHA256_Update(&c,d,n);
+ SHA256_Final(md,&c);
+ OPENSSL_cleanse(&c,sizeof(c));
+ return(md);
+ }
+
+unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
+ {
+ SHA256_CTX c;
+ static unsigned char m[SHA256_DIGEST_LENGTH];
+
+ if (md == NULL) md=m;
+ SHA256_Init(&c);
+ SHA256_Update(&c,d,n);
+ SHA256_Final(md,&c);
+ OPENSSL_cleanse(&c,sizeof(c));
+ return(md);
+ }
+
+int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
+{ return SHA256_Update (c,data,len); }
+int SHA224_Final (unsigned char *md, SHA256_CTX *c)
+{ return SHA256_Final (md,c); }
+
+#ifndef SHA_LONG_LOG2
+#define SHA_LONG_LOG2 2 /* default to 32 bits */
+#endif
+
+#define DATA_ORDER_IS_BIG_ENDIAN
+
+#define HASH_LONG SHA_LONG
+#define HASH_LONG_LOG2 SHA_LONG_LOG2
+#define HASH_CTX SHA256_CTX
+#define HASH_CBLOCK SHA_CBLOCK
+#define HASH_LBLOCK SHA_LBLOCK
+/*
+ * Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
+ * The default: case below covers for it. It's not clear, however, whether
+ * it's permitted to truncate to an amount of bytes not divisible by 4.
+ * I bet not, but if it is, then the default: case shall be extended. For
+ * reference. The idea behind separate cases for the pre-defined lengths
+ * is to let the compiler decide if it's appropriate to unroll small loops.
+ */
+#define HASH_MAKE_STRING(c,s) do { \
+ unsigned long ll; \
+ unsigned int n; \
+ switch ((c)->md_len) \
+ { case SHA224_DIGEST_LENGTH: \
+ for (n=0;n<SHA224_DIGEST_LENGTH/4;n++) \
+ { ll=(c)->h[n]; HOST_l2c(ll,(s)); } \
+ break; \
+ case SHA256_DIGEST_LENGTH: \
+ for (n=0;n<SHA256_DIGEST_LENGTH/4;n++) \
+ { ll=(c)->h[n]; HOST_l2c(ll,(s)); } \
+ break; \
+ default: \
+ if ((c)->md_len > SHA256_DIGEST_LENGTH) \
+ return 0; \
+ for (n=0;n<(c)->md_len/4;n++) \
+ { ll=(c)->h[n]; HOST_l2c(ll,(s)); } \
+ break; \
+ } \
+ } while (0)
+
+#define HASH_UPDATE SHA256_Update
+#define HASH_TRANSFORM SHA256_Transform
+#define HASH_FINAL SHA256_Final
+#define HASH_BLOCK_HOST_ORDER sha256_block_host_order
+#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
+void sha256_block_host_order (SHA256_CTX *ctx, const void *in, size_t num);
+void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
+
+#include "md32_common.h"
+
+static const SHA_LONG K256[64] = {
+ 0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
+ 0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
+ 0xd807aa98UL,0x12835b01UL,0x243185beUL,0x550c7dc3UL,
+ 0x72be5d74UL,0x80deb1feUL,0x9bdc06a7UL,0xc19bf174UL,
+ 0xe49b69c1UL,0xefbe4786UL,0x0fc19dc6UL,0x240ca1ccUL,
+ 0x2de92c6fUL,0x4a7484aaUL,0x5cb0a9dcUL,0x76f988daUL,
+ 0x983e5152UL,0xa831c66dUL,0xb00327c8UL,0xbf597fc7UL,
+ 0xc6e00bf3UL,0xd5a79147UL,0x06ca6351UL,0x14292967UL,
+ 0x27b70a85UL,0x2e1b2138UL,0x4d2c6dfcUL,0x53380d13UL,
+ 0x650a7354UL,0x766a0abbUL,0x81c2c92eUL,0x92722c85UL,
+ 0xa2bfe8a1UL,0xa81a664bUL,0xc24b8b70UL,0xc76c51a3UL,
+ 0xd192e819UL,0xd6990624UL,0xf40e3585UL,0x106aa070UL,
+ 0x19a4c116UL,0x1e376c08UL,0x2748774cUL,0x34b0bcb5UL,
+ 0x391c0cb3UL,0x4ed8aa4aUL,0x5b9cca4fUL,0x682e6ff3UL,
+ 0x748f82eeUL,0x78a5636fUL,0x84c87814UL,0x8cc70208UL,
+ 0x90befffaUL,0xa4506cebUL,0xbef9a3f7UL,0xc67178f2UL };
+
+/*
+ * FIPS specification refers to right rotations, while our ROTATE macro
+ * is left one. This is why you might notice that rotation coefficients
+ * differ from those observed in FIPS document by 32-N...
+ */
+#define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
+#define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
+#define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
+#define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
+
+#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
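+
+/*
+ * Worked instance of that mapping (illustrative): FIPS 180-2 defines
+ * Sigma0(x) = ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22), and for 32-bit words
+ * ROTR(x,N) == ROTATE(x,32-N), which yields the 30/19/10 coefficients
+ * above; the other rotation terms follow the same 32-N translation,
+ * while the plain shifts ((x)>>3, (x)>>10) are unaffected.
+ */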
+
+#ifdef OPENSSL_SMALL_FOOTPRINT
+
+static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
+ {
+ unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
+ SHA_LONG X[16];
+ int i;
+ const unsigned char *data=in;
+
+ while (num--) {
+
+ a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
+ e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
+
+ if (host)
+ {
+ const SHA_LONG *W=(const SHA_LONG *)data;
+
+ for (i=0;i<16;i++)
+ {
+ T1 = X[i] = W[i];
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
+ T2 = Sigma0(a) + Maj(a,b,c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+ }
+ else
+ {
+ SHA_LONG l;
+
+ for (i=0;i<16;i++)
+ {
+ HOST_c2l(data,l); T1 = X[i] = l;
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
+ T2 = Sigma0(a) + Maj(a,b,c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+ }
+
+ for (;i<64;i++)
+ {
+ s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
+ s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
+
+ T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
+ T2 = Sigma0(a) + Maj(a,b,c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
+ ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
+
+ data += SHA256_CBLOCK;
+ }
+}
+
+#else
+
+#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
+ h = Sigma0(a) + Maj(a,b,c); \
+ d += T1; h += T1; } while (0)
+
+#define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
+ s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
+ s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
+ T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
+ ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
+
+static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
+ {
+ unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
+ SHA_LONG X[16];
+ int i;
+ const unsigned char *data=in;
+
+ while (num--) {
+
+ a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
+ e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
+
+ if (host)
+ {
+ const SHA_LONG *W=(const SHA_LONG *)data;
+
+ T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
+ T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
+ T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
+ T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
+ T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
+ T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
+ T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
+ T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
+ T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
+ T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
+ T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
+ T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
+ T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
+ T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
+ T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
+ T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
+ }
+ else
+ {
+ SHA_LONG l;
+
+ HOST_c2l(data,l); T1 = X[0] = l; ROUND_00_15(0,a,b,c,d,e,f,g,h);
+ HOST_c2l(data,l); T1 = X[1] = l; ROUND_00_15(1,h,a,b,c,d,e,f,g);
+ HOST_c2l(data,l); T1 = X[2] = l; ROUND_00_15(2,g,h,a,b,c,d,e,f);
+ HOST_c2l(data,l); T1 = X[3] = l; ROUND_00_15(3,f,g,h,a,b,c,d,e);
+ HOST_c2l(data,l); T1 = X[4] = l; ROUND_00_15(4,e,f,g,h,a,b,c,d);
+ HOST_c2l(data,l); T1 = X[5] = l; ROUND_00_15(5,d,e,f,g,h,a,b,c);
+ HOST_c2l(data,l); T1 = X[6] = l; ROUND_00_15(6,c,d,e,f,g,h,a,b);
+ HOST_c2l(data,l); T1 = X[7] = l; ROUND_00_15(7,b,c,d,e,f,g,h,a);
+ HOST_c2l(data,l); T1 = X[8] = l; ROUND_00_15(8,a,b,c,d,e,f,g,h);
+ HOST_c2l(data,l); T1 = X[9] = l; ROUND_00_15(9,h,a,b,c,d,e,f,g);
+ HOST_c2l(data,l); T1 = X[10] = l; ROUND_00_15(10,g,h,a,b,c,d,e,f);
+ HOST_c2l(data,l); T1 = X[11] = l; ROUND_00_15(11,f,g,h,a,b,c,d,e);
+ HOST_c2l(data,l); T1 = X[12] = l; ROUND_00_15(12,e,f,g,h,a,b,c,d);
+ HOST_c2l(data,l); T1 = X[13] = l; ROUND_00_15(13,d,e,f,g,h,a,b,c);
+ HOST_c2l(data,l); T1 = X[14] = l; ROUND_00_15(14,c,d,e,f,g,h,a,b);
+ HOST_c2l(data,l); T1 = X[15] = l; ROUND_00_15(15,b,c,d,e,f,g,h,a);
+ }
+
+ for (i=16;i<64;i+=8)
+ {
+ ROUND_16_63(i+0,a,b,c,d,e,f,g,h,X);
+ ROUND_16_63(i+1,h,a,b,c,d,e,f,g,X);
+ ROUND_16_63(i+2,g,h,a,b,c,d,e,f,X);
+ ROUND_16_63(i+3,f,g,h,a,b,c,d,e,X);
+ ROUND_16_63(i+4,e,f,g,h,a,b,c,d,X);
+ ROUND_16_63(i+5,d,e,f,g,h,a,b,c,X);
+ ROUND_16_63(i+6,c,d,e,f,g,h,a,b,X);
+ ROUND_16_63(i+7,b,c,d,e,f,g,h,a,X);
+ }
+
+ ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
+ ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
+
+ data += SHA256_CBLOCK;
+ }
+ }
+
+#endif
+
+/*
+ * The idea is to trade a couple of cycles for some space. On IA-32 we
+ * save about 4K in the "big footprint" case. In the "small footprint"
+ * case any gain is appreciated:-)
+ */
+void HASH_BLOCK_HOST_ORDER (SHA256_CTX *ctx, const void *in, size_t num)
+{ sha256_block (ctx,in,num,1); }
+
+void HASH_BLOCK_DATA_ORDER (SHA256_CTX *ctx, const void *in, size_t num)
+{ sha256_block (ctx,in,num,0); }
diff --git a/crypto/sha/sha256t.c b/crypto/sha/sha256t.c
new file mode 100644
index 0000000000..e211c9c24f
--- /dev/null
+++ b/crypto/sha/sha256t.c
@@ -0,0 +1,130 @@
+/* crypto/sha/sha256t.c */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ * ====================================================================
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <openssl/sha.h>
+
+unsigned char app_b1[SHA256_DIGEST_LENGTH] = {
+ 0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,
+ 0x41,0x41,0x40,0xde,0x5d,0xae,0x22,0x23,
+ 0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,
+ 0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad };
+
+unsigned char app_b2[SHA256_DIGEST_LENGTH] = {
+ 0x24,0x8d,0x6a,0x61,0xd2,0x06,0x38,0xb8,
+ 0xe5,0xc0,0x26,0x93,0x0c,0x3e,0x60,0x39,
+ 0xa3,0x3c,0xe4,0x59,0x64,0xff,0x21,0x67,
+ 0xf6,0xec,0xed,0xd4,0x19,0xdb,0x06,0xc1 };
+
+unsigned char app_b3[SHA256_DIGEST_LENGTH] = {
+ 0xcd,0xc7,0x6e,0x5c,0x99,0x14,0xfb,0x92,
+ 0x81,0xa1,0xc7,0xe2,0x84,0xd7,0x3e,0x67,
+ 0xf1,0x80,0x9a,0x48,0xa4,0x97,0x20,0x0e,
+ 0x04,0x6d,0x39,0xcc,0xc7,0x11,0x2c,0xd0 };
+
+unsigned char addenum_1[SHA224_DIGEST_LENGTH] = {
+ 0x23,0x09,0x7d,0x22,0x34,0x05,0xd8,0x22,
+ 0x86,0x42,0xa4,0x77,0xbd,0xa2,0x55,0xb3,
+ 0x2a,0xad,0xbc,0xe4,0xbd,0xa0,0xb3,0xf7,
+ 0xe3,0x6c,0x9d,0xa7 };
+
+unsigned char addenum_2[SHA224_DIGEST_LENGTH] = {
+ 0x75,0x38,0x8b,0x16,0x51,0x27,0x76,0xcc,
+ 0x5d,0xba,0x5d,0xa1,0xfd,0x89,0x01,0x50,
+ 0xb0,0xc6,0x45,0x5c,0xb4,0xf5,0x8b,0x19,
+ 0x52,0x52,0x25,0x25 };
+
+unsigned char addenum_3[SHA224_DIGEST_LENGTH] = {
+ 0x20,0x79,0x46,0x55,0x98,0x0c,0x91,0xd8,
+ 0xbb,0xb4,0xc1,0xea,0x97,0x61,0x8a,0x4b,
+ 0xf0,0x3f,0x42,0x58,0x19,0x48,0xb2,0xee,
+ 0x4e,0xe7,0xad,0x67 };
+
+int main ()
+{ unsigned char md[SHA256_DIGEST_LENGTH];
+ int i;
+ SHA256_CTX ctx;
+
+ fprintf(stdout,"Testing SHA-256 ");
+
+ SHA256((unsigned char *)"abc",3,md);
+ if (memcmp(md,app_b1,sizeof(app_b1)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 1 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA256((unsigned char *)"abcdbcde""cdefdefg""efghfghi""ghijhijk"
+ "ijkljklm""klmnlmno""mnopnopq",56,md);
+ if (memcmp(md,app_b2,sizeof(app_b2)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 2 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA256_Init(&ctx);
+ for (i=0;i<1000000;i+=64)
+ SHA256_Update(&ctx, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
+ (1000000-i)<64?1000000-i:64);
+ SHA256_Final(md,&ctx);
+
+ if (memcmp(md,app_b3,sizeof(app_b3)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 3 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ fprintf(stdout," passed.\n"); fflush(stdout);
+
+ fprintf(stdout,"Testing SHA-224 ");
+
+ SHA224((unsigned char *)"abc",3,md);
+ if (memcmp(md,addenum_1,sizeof(addenum_1)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 1 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA224((unsigned char *)"abcdbcde""cdefdefg""efghfghi""ghijhijk"
+ "ijkljklm""klmnlmno""mnopnopq",56,md);
+ if (memcmp(md,addenum_2,sizeof(addenum_2)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 2 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA224_Init(&ctx);
+ for (i=0;i<1000000;i+=64)
+ SHA256_Update(&ctx, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
+ (1000000-i)<64?1000000-i:64);
+ SHA256_Final(md,&ctx);
+
+ if (memcmp(md,addenum_3,sizeof(addenum_3)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 3 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ fprintf(stdout," passed.\n"); fflush(stdout);
+
+ return 0;
+}
diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c
new file mode 100644
index 0000000000..dc1047d59e
--- /dev/null
+++ b/crypto/sha/sha512.c
@@ -0,0 +1,478 @@
+/* crypto/sha/sha512.c */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved
+ * according to the OpenSSL license [found in ../../LICENSE].
+ * ====================================================================
+ */
+/*
+ * IMPLEMENTATION NOTES.
+ *
+ * As you might have noticed, the 32-bit hash algorithms:
+ *
+ * - permit SHA_LONG to be wider than 32-bit (the case on CRAY);
+ * - have optimized versions implementing two transform functions: one
+ *   operating on [aligned] data in host byte order and one on data in
+ *   input stream byte order;
+ * - share common byte-order-neutral collector and padding function
+ *   implementations, ../md32_common.h;
+ *
+ * None of the above applies to this SHA-512 implementation. The reasons
+ * [in reverse order] are:
+ *
+ * - it's the only 64-bit hash algorithm at the moment of this writing,
+ *   so there is no need for a common collector/padding implementation
+ *   [yet];
+ * - by supporting only one transform function [which operates on
+ *   *aligned* data in input stream byte order, big-endian in this case]
+ *   we minimize the maintenance burden in two ways: a) the
+ *   collector/padding function is simpler; b) there is only one
+ *   transform function to stare at;
+ * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
+ *   apply a number of optimizations mitigating the potential performance
+ *   penalties caused by the previous design decision;
+ *
+ * Caveat lector.
+ *
+ * The implementation relies on the fact that "long long" is 64-bit on
+ * both 32- and 64-bit platforms. If some compiler vendor comes up with
+ * a 128-bit long long, an adjustment to sha.h would be required.
+ * As this implementation relies on a 64-bit integer type, it's totally
+ * inappropriate for platforms which don't support one, most notably
+ * 16-bit platforms.
+ * <appro@fy.chalmers.se>
+ */
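+
+/*
+ * A minimal compile-time check of that 64-bit assumption could look like
+ * this (a sketch, not part of the original source; the typedef name is
+ * arbitrary):
+ *
+ *	typedef char SHA512_u64_check[sizeof(SHA_LONG64)==8 ? 1 : -1];
+ */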
+#include <stdlib.h>
+#include <string.h>
+
+#include <openssl/opensslconf.h>
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/opensslv.h>
+
+const char *SHA512_version="SHA-512" OPENSSL_VERSION_PTEXT;
+
+#if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386) || defined(__x86_64)
+#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#endif
+
+int SHA384_Init (SHA512_CTX *c)
+ {
+ c->h[0]=U64(0xcbbb9d5dc1059ed8);
+ c->h[1]=U64(0x629a292a367cd507);
+ c->h[2]=U64(0x9159015a3070dd17);
+ c->h[3]=U64(0x152fecd8f70e5939);
+ c->h[4]=U64(0x67332667ffc00b31);
+ c->h[5]=U64(0x8eb44a8768581511);
+ c->h[6]=U64(0xdb0c2e0d64f98fa7);
+ c->h[7]=U64(0x47b5481dbefa4fa4);
+ c->Nl=0; c->Nh=0;
+ c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
+ return 1;
+ }
+
+int SHA512_Init (SHA512_CTX *c)
+ {
+ c->h[0]=U64(0x6a09e667f3bcc908);
+ c->h[1]=U64(0xbb67ae8584caa73b);
+ c->h[2]=U64(0x3c6ef372fe94f82b);
+ c->h[3]=U64(0xa54ff53a5f1d36f1);
+ c->h[4]=U64(0x510e527fade682d1);
+ c->h[5]=U64(0x9b05688c2b3e6c1f);
+ c->h[6]=U64(0x1f83d9abfb41bd6b);
+ c->h[7]=U64(0x5be0cd19137e2179);
+ c->Nl=0; c->Nh=0;
+ c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
+ return 1;
+ }
+
+static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num);
+
+int SHA512_Final (unsigned char *md, SHA512_CTX *c)
+ {
+ unsigned char *p=(unsigned char *)c->u.p;
+ size_t n=c->num;
+
+	p[n]=0x80;	/* There is always room for one */
+ n++;
+ if (n > (sizeof(c->u)-16))
+ memset (p+n,0,sizeof(c->u)-n), n=0,
+ sha512_block (c,p,1);
+
+ memset (p+n,0,sizeof(c->u)-16-n);
+#ifdef B_ENDIAN
+ c->u.d[SHA_LBLOCK-2] = c->Nh;
+ c->u.d[SHA_LBLOCK-1] = c->Nl;
+#else
+ p[sizeof(c->u)-1] = (c->Nl)&0xFF;
+ p[sizeof(c->u)-2] = (c->Nl>>8)&0xFF;
+ p[sizeof(c->u)-3] = (c->Nl>>16)&0xFF;
+ p[sizeof(c->u)-4] = (c->Nl>>24)&0xFF;
+ p[sizeof(c->u)-5] = (c->Nl>>32)&0xFF;
+ p[sizeof(c->u)-6] = (c->Nl>>40)&0xFF;
+ p[sizeof(c->u)-7] = (c->Nl>>48)&0xFF;
+ p[sizeof(c->u)-8] = (c->Nl>>56)&0xFF;
+ p[sizeof(c->u)-9] = (c->Nh)&0xFF;
+ p[sizeof(c->u)-10] = (c->Nh>>8)&0xFF;
+ p[sizeof(c->u)-11] = (c->Nh>>16)&0xFF;
+ p[sizeof(c->u)-12] = (c->Nh>>24)&0xFF;
+ p[sizeof(c->u)-13] = (c->Nh>>32)&0xFF;
+ p[sizeof(c->u)-14] = (c->Nh>>40)&0xFF;
+ p[sizeof(c->u)-15] = (c->Nh>>48)&0xFF;
+ p[sizeof(c->u)-16] = (c->Nh>>56)&0xFF;
+#endif
+
+ sha512_block (c,p,1);
+
+ if (md==0) return 0;
+
+ switch (c->md_len)
+ {
+ /* Let compiler decide if it's appropriate to unroll... */
+ case SHA384_DIGEST_LENGTH:
+ for (n=0;n<SHA384_DIGEST_LENGTH/8;n++)
+ {
+ SHA_LONG64 t = c->h[n];
+
+ *(md++) = (t>>56)&0xFF; *(md++) = (t>>48)&0xFF;
+ *(md++) = (t>>40)&0xFF; *(md++) = (t>>32)&0xFF;
+ *(md++) = (t>>24)&0xFF; *(md++) = (t>>16)&0xFF;
+ *(md++) = (t>>8)&0xFF; *(md++) = (t)&0xFF;
+ }
+ break;
+ case SHA512_DIGEST_LENGTH:
+ for (n=0;n<SHA512_DIGEST_LENGTH/8;n++)
+ {
+ SHA_LONG64 t = c->h[n];
+
+ *(md++) = (t>>56)&0xFF; *(md++) = (t>>48)&0xFF;
+ *(md++) = (t>>40)&0xFF; *(md++) = (t>>32)&0xFF;
+ *(md++) = (t>>24)&0xFF; *(md++) = (t>>16)&0xFF;
+ *(md++) = (t>>8)&0xFF; *(md++) = (t)&0xFF;
+ }
+ break;
+ /* ... as well as make sure md_len is not abused. */
+ default: return 0;
+ }
+
+ return 1;
+ }
+
+int SHA384_Final (unsigned char *md,SHA512_CTX *c)
+{ return SHA512_Final (md,c); }
+
+int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
+ {
+ SHA_LONG64 l;
+ unsigned char *p=c->u.p;
+ const unsigned char *data=(const unsigned char *)_data;
+
+ if (len==0) return 1;
+
+ l = (c->Nl+(((SHA_LONG64)len)<<3))&U64(0xffffffffffffffff);
+ if (l < c->Nl) c->Nh++;
+ if (sizeof(len)>=8) c->Nh+=(((SHA_LONG64)len)>>61);
+ c->Nl=l;
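+	/*
+	 * Nh:Nl is the 128-bit message bit-count: len<<3 converts bytes
+	 * to bits, the wrap-around of Nl is detected by the comparison
+	 * above, and len>>61 recovers the bits shifted out of len<<3
+	 * when sizeof(size_t) is 8.
+	 */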
+
+ if (c->num != 0)
+ {
+ size_t n = sizeof(c->u) - c->num;
+
+ if (len < n)
+ {
+ memcpy (p+c->num,data,len), c->num += len;
+ return 1;
+ }
+ else {
+ memcpy (p+c->num,data,n), c->num = 0;
+ len-=n, data+=n;
+ sha512_block (c,p,1);
+ }
+ }
+
+ if (len >= sizeof(c->u))
+ {
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+ if ((size_t)data%sizeof(c->u.d[0]) != 0)
+ while (len >= sizeof(c->u))
+ memcpy (p,data,sizeof(c->u)),
+ sha512_block (c,p,1),
+ len -= sizeof(c->u),
+ data += sizeof(c->u);
+ else
+#endif
+ sha512_block (c,data,len/sizeof(c->u)),
+ data += len,
+ len %= sizeof(c->u),
+ data -= len;
+ }
+
+ if (len != 0) memcpy (p,data,len), c->num = (int)len;
+
+ return 1;
+ }
+
+int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
+{ return SHA512_Update (c,data,len); }
+
+void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
+{ sha512_block (c,data,1); }
+
+unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
+ {
+ SHA512_CTX c;
+ static unsigned char m[SHA384_DIGEST_LENGTH];
+
+ if (md == NULL) md=m;
+ SHA384_Init(&c);
+ SHA512_Update(&c,d,n);
+ SHA512_Final(md,&c);
+ OPENSSL_cleanse(&c,sizeof(c));
+ return(md);
+ }
+
+unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
+ {
+ SHA512_CTX c;
+ static unsigned char m[SHA512_DIGEST_LENGTH];
+
+ if (md == NULL) md=m;
+ SHA512_Init(&c);
+ SHA512_Update(&c,d,n);
+ SHA512_Final(md,&c);
+ OPENSSL_cleanse(&c,sizeof(c));
+ return(md);
+ }
+
+static const SHA_LONG64 K512[80] = {
+ U64(0x428a2f98d728ae22),U64(0x7137449123ef65cd),
+ U64(0xb5c0fbcfec4d3b2f),U64(0xe9b5dba58189dbbc),
+ U64(0x3956c25bf348b538),U64(0x59f111f1b605d019),
+ U64(0x923f82a4af194f9b),U64(0xab1c5ed5da6d8118),
+ U64(0xd807aa98a3030242),U64(0x12835b0145706fbe),
+ U64(0x243185be4ee4b28c),U64(0x550c7dc3d5ffb4e2),
+ U64(0x72be5d74f27b896f),U64(0x80deb1fe3b1696b1),
+ U64(0x9bdc06a725c71235),U64(0xc19bf174cf692694),
+ U64(0xe49b69c19ef14ad2),U64(0xefbe4786384f25e3),
+ U64(0x0fc19dc68b8cd5b5),U64(0x240ca1cc77ac9c65),
+ U64(0x2de92c6f592b0275),U64(0x4a7484aa6ea6e483),
+ U64(0x5cb0a9dcbd41fbd4),U64(0x76f988da831153b5),
+ U64(0x983e5152ee66dfab),U64(0xa831c66d2db43210),
+ U64(0xb00327c898fb213f),U64(0xbf597fc7beef0ee4),
+ U64(0xc6e00bf33da88fc2),U64(0xd5a79147930aa725),
+ U64(0x06ca6351e003826f),U64(0x142929670a0e6e70),
+ U64(0x27b70a8546d22ffc),U64(0x2e1b21385c26c926),
+ U64(0x4d2c6dfc5ac42aed),U64(0x53380d139d95b3df),
+ U64(0x650a73548baf63de),U64(0x766a0abb3c77b2a8),
+ U64(0x81c2c92e47edaee6),U64(0x92722c851482353b),
+ U64(0xa2bfe8a14cf10364),U64(0xa81a664bbc423001),
+ U64(0xc24b8b70d0f89791),U64(0xc76c51a30654be30),
+ U64(0xd192e819d6ef5218),U64(0xd69906245565a910),
+ U64(0xf40e35855771202a),U64(0x106aa07032bbd1b8),
+ U64(0x19a4c116b8d2d0c8),U64(0x1e376c085141ab53),
+ U64(0x2748774cdf8eeb99),U64(0x34b0bcb5e19b48a8),
+ U64(0x391c0cb3c5c95a63),U64(0x4ed8aa4ae3418acb),
+ U64(0x5b9cca4f7763e373),U64(0x682e6ff3d6b2b8a3),
+ U64(0x748f82ee5defb2fc),U64(0x78a5636f43172f60),
+ U64(0x84c87814a1f0ab72),U64(0x8cc702081a6439ec),
+ U64(0x90befffa23631e28),U64(0xa4506cebde82bde9),
+ U64(0xbef9a3f7b2c67915),U64(0xc67178f2e372532b),
+ U64(0xca273eceea26619c),U64(0xd186b8c721c0c207),
+ U64(0xeada7dd6cde0eb1e),U64(0xf57d4f7fee6ed178),
+ U64(0x06f067aa72176fba),U64(0x0a637dc5a2c898a6),
+ U64(0x113f9804bef90dae),U64(0x1b710b35131c471b),
+ U64(0x28db77f523047d84),U64(0x32caab7b40c72493),
+ U64(0x3c9ebe0a15c9bebc),U64(0x431d67c49c100d4c),
+ U64(0x4cc5d4becb3e42b6),U64(0x597f299cfc657e2a),
+ U64(0x5fcb6fab3ad6faec),U64(0x6c44198c4a475817) };
+
+#ifndef PEDANTIC
+# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(__x86_64) || defined(__x86_64__)
+# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
+ asm ("bswapq %0" \
+ : "=r"(ret) \
+ : "0"(ret)); ret; })
+# endif
+# endif
+#endif
+
+#ifndef PULL64
+#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
+#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
+#endif
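+
+/* For example, B(x,0) places the first byte of the big-endian input at
+ * bits 63..56, so the fallback PULL64 assembles a host-order 64-bit word
+ * one byte at a time, independent of alignment and host endianness. */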
+
+#ifndef PEDANTIC
+# if defined(_MSC_VER)
+# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
+# define ROTR(a,n) _rotr64((a),n)
+# endif
+# elif defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(__x86_64) || defined(__x86_64__)
+# define ROTR(a,n) ({ unsigned long ret; \
+ asm ("rorq %1,%0" \
+ : "=r"(ret) \
+ : "J"(n),"0"(a) \
+ : "cc"); ret; })
+# elif defined(_ARCH_PPC) && defined(__64BIT__)
+# define ROTR(a,n) ({ unsigned long ret; \
+ asm ("rotrdi %0,%1,%2" \
+ : "=r"(ret) \
+ : "r"(a),"K"(n)); ret; })
+# endif
+# endif
+#endif
+
+#ifndef ROTR
+#define ROTR(x,s)	(((x)>>(s)) | ((x)<<(64-(s))))
+#endif
+
+#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+
+#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+#if defined(OPENSSL_IA32_SSE2) && !defined(OPENSSL_NO_ASM)
+#define GO_FOR_SSE2(ctx,in,num) do { \
+ extern int OPENSSL_ia32cap; \
+ void sha512_block_sse2(void *,const void *,size_t); \
+ if (!(OPENSSL_ia32cap & (1<<26))) break; \
+ sha512_block_sse2(ctx->h,in,num); return; \
+ } while (0)
+#endif
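+
+/* Bit 26 of OPENSSL_ia32cap mirrors the CPUID EDX bit 26 "SSE2" feature
+ * flag, so sha512_block_sse2 is entered only on SSE2-capable processors. */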
+
+#ifdef OPENSSL_SMALL_FOOTPRINT
+
+static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num)
+ {
+ const SHA_LONG64 *W=in;
+ SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2;
+ SHA_LONG64 X[16];
+ int i;
+
+#ifdef GO_FOR_SSE2
+ GO_FOR_SSE2(ctx,in,num);
+#endif
+
+ while (num--) {
+
+ a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
+ e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
+
+ for (i=0;i<16;i++)
+ {
+#ifdef B_ENDIAN
+ T1 = X[i] = W[i];
+#else
+ T1 = X[i] = PULL64(W[i]);
+#endif
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
+ T2 = Sigma0(a) + Maj(a,b,c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ for (;i<80;i++)
+ {
+ s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
+ s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
+
+ T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
+ T2 = Sigma0(a) + Maj(a,b,c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
+ ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
+
+ W+=SHA_LBLOCK;
+ }
+ }
+
+#else
+
+#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
+ T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
+ h = Sigma0(a) + Maj(a,b,c); \
+ d += T1; h += T1; } while (0)
+
+#define ROUND_16_80(i,a,b,c,d,e,f,g,h,X) do { \
+ s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
+ s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
+ T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
+ ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
+
+static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num)
+ {
+ const SHA_LONG64 *W=in;
+ SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1;
+ SHA_LONG64 X[16];
+ int i;
+
+#ifdef GO_FOR_SSE2
+ GO_FOR_SSE2(ctx,in,num);
+#endif
+
+ while (num--) {
+
+ a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
+ e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
+
+#ifdef B_ENDIAN
+ T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
+ T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
+ T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
+ T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
+ T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
+ T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
+ T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
+ T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
+ T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
+ T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
+ T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
+ T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
+ T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
+ T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
+ T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
+ T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
+#else
+ T1 = X[0] = PULL64(W[0]); ROUND_00_15(0,a,b,c,d,e,f,g,h);
+ T1 = X[1] = PULL64(W[1]); ROUND_00_15(1,h,a,b,c,d,e,f,g);
+ T1 = X[2] = PULL64(W[2]); ROUND_00_15(2,g,h,a,b,c,d,e,f);
+ T1 = X[3] = PULL64(W[3]); ROUND_00_15(3,f,g,h,a,b,c,d,e);
+ T1 = X[4] = PULL64(W[4]); ROUND_00_15(4,e,f,g,h,a,b,c,d);
+ T1 = X[5] = PULL64(W[5]); ROUND_00_15(5,d,e,f,g,h,a,b,c);
+ T1 = X[6] = PULL64(W[6]); ROUND_00_15(6,c,d,e,f,g,h,a,b);
+ T1 = X[7] = PULL64(W[7]); ROUND_00_15(7,b,c,d,e,f,g,h,a);
+ T1 = X[8] = PULL64(W[8]); ROUND_00_15(8,a,b,c,d,e,f,g,h);
+ T1 = X[9] = PULL64(W[9]); ROUND_00_15(9,h,a,b,c,d,e,f,g);
+ T1 = X[10] = PULL64(W[10]); ROUND_00_15(10,g,h,a,b,c,d,e,f);
+ T1 = X[11] = PULL64(W[11]); ROUND_00_15(11,f,g,h,a,b,c,d,e);
+ T1 = X[12] = PULL64(W[12]); ROUND_00_15(12,e,f,g,h,a,b,c,d);
+ T1 = X[13] = PULL64(W[13]); ROUND_00_15(13,d,e,f,g,h,a,b,c);
+ T1 = X[14] = PULL64(W[14]); ROUND_00_15(14,c,d,e,f,g,h,a,b);
+ T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
+#endif
+
+ for (i=16;i<80;i+=8)
+ {
+ ROUND_16_80(i+0,a,b,c,d,e,f,g,h,X);
+ ROUND_16_80(i+1,h,a,b,c,d,e,f,g,X);
+ ROUND_16_80(i+2,g,h,a,b,c,d,e,f,X);
+ ROUND_16_80(i+3,f,g,h,a,b,c,d,e,X);
+ ROUND_16_80(i+4,e,f,g,h,a,b,c,d,X);
+ ROUND_16_80(i+5,d,e,f,g,h,a,b,c,X);
+ ROUND_16_80(i+6,c,d,e,f,g,h,a,b,X);
+ ROUND_16_80(i+7,b,c,d,e,f,g,h,a,X);
+ }
+
+ ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
+ ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
+
+ W+=SHA_LBLOCK;
+ }
+ }
+
+#endif
diff --git a/crypto/sha/sha512t.c b/crypto/sha/sha512t.c
new file mode 100644
index 0000000000..bb93070e11
--- /dev/null
+++ b/crypto/sha/sha512t.c
@@ -0,0 +1,168 @@
+/* crypto/sha/sha512t.c */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ * ====================================================================
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <openssl/sha.h>
+
+unsigned char app_c1[SHA512_DIGEST_LENGTH] = {
+ 0xdd,0xaf,0x35,0xa1,0x93,0x61,0x7a,0xba,
+ 0xcc,0x41,0x73,0x49,0xae,0x20,0x41,0x31,
+ 0x12,0xe6,0xfa,0x4e,0x89,0xa9,0x7e,0xa2,
+ 0x0a,0x9e,0xee,0xe6,0x4b,0x55,0xd3,0x9a,
+ 0x21,0x92,0x99,0x2a,0x27,0x4f,0xc1,0xa8,
+ 0x36,0xba,0x3c,0x23,0xa3,0xfe,0xeb,0xbd,
+ 0x45,0x4d,0x44,0x23,0x64,0x3c,0xe8,0x0e,
+ 0x2a,0x9a,0xc9,0x4f,0xa5,0x4c,0xa4,0x9f };
+
+unsigned char app_c2[SHA512_DIGEST_LENGTH] = {
+ 0x8e,0x95,0x9b,0x75,0xda,0xe3,0x13,0xda,
+ 0x8c,0xf4,0xf7,0x28,0x14,0xfc,0x14,0x3f,
+ 0x8f,0x77,0x79,0xc6,0xeb,0x9f,0x7f,0xa1,
+ 0x72,0x99,0xae,0xad,0xb6,0x88,0x90,0x18,
+ 0x50,0x1d,0x28,0x9e,0x49,0x00,0xf7,0xe4,
+ 0x33,0x1b,0x99,0xde,0xc4,0xb5,0x43,0x3a,
+ 0xc7,0xd3,0x29,0xee,0xb6,0xdd,0x26,0x54,
+ 0x5e,0x96,0xe5,0x5b,0x87,0x4b,0xe9,0x09 };
+
+unsigned char app_c3[SHA512_DIGEST_LENGTH] = {
+ 0xe7,0x18,0x48,0x3d,0x0c,0xe7,0x69,0x64,
+ 0x4e,0x2e,0x42,0xc7,0xbc,0x15,0xb4,0x63,
+ 0x8e,0x1f,0x98,0xb1,0x3b,0x20,0x44,0x28,
+ 0x56,0x32,0xa8,0x03,0xaf,0xa9,0x73,0xeb,
+ 0xde,0x0f,0xf2,0x44,0x87,0x7e,0xa6,0x0a,
+ 0x4c,0xb0,0x43,0x2c,0xe5,0x77,0xc3,0x1b,
+ 0xeb,0x00,0x9c,0x5c,0x2c,0x49,0xaa,0x2e,
+ 0x4e,0xad,0xb2,0x17,0xad,0x8c,0xc0,0x9b };
+
+unsigned char app_d1[SHA384_DIGEST_LENGTH] = {
+ 0xcb,0x00,0x75,0x3f,0x45,0xa3,0x5e,0x8b,
+ 0xb5,0xa0,0x3d,0x69,0x9a,0xc6,0x50,0x07,
+ 0x27,0x2c,0x32,0xab,0x0e,0xde,0xd1,0x63,
+ 0x1a,0x8b,0x60,0x5a,0x43,0xff,0x5b,0xed,
+ 0x80,0x86,0x07,0x2b,0xa1,0xe7,0xcc,0x23,
+ 0x58,0xba,0xec,0xa1,0x34,0xc8,0x25,0xa7 };
+
+unsigned char app_d2[SHA384_DIGEST_LENGTH] = {
+ 0x09,0x33,0x0c,0x33,0xf7,0x11,0x47,0xe8,
+ 0x3d,0x19,0x2f,0xc7,0x82,0xcd,0x1b,0x47,
+ 0x53,0x11,0x1b,0x17,0x3b,0x3b,0x05,0xd2,
+ 0x2f,0xa0,0x80,0x86,0xe3,0xb0,0xf7,0x12,
+ 0xfc,0xc7,0xc7,0x1a,0x55,0x7e,0x2d,0xb9,
+ 0x66,0xc3,0xe9,0xfa,0x91,0x74,0x60,0x39 };
+
+unsigned char app_d3[SHA384_DIGEST_LENGTH] = {
+ 0x9d,0x0e,0x18,0x09,0x71,0x64,0x74,0xcb,
+ 0x08,0x6e,0x83,0x4e,0x31,0x0a,0x4a,0x1c,
+ 0xed,0x14,0x9e,0x9c,0x00,0xf2,0x48,0x52,
+ 0x79,0x72,0xce,0xc5,0x70,0x4c,0x2a,0x5b,
+ 0x07,0xb8,0xb3,0xdc,0x38,0xec,0xc4,0xeb,
+ 0xae,0x97,0xdd,0xd8,0x7f,0x3d,0x89,0x85 };
+
+int main ()
+{ unsigned char md[SHA512_DIGEST_LENGTH];
+ int i;
+ SHA512_CTX ctx;
+
+#ifdef OPENSSL_IA32_SSE2
+ { extern int OPENSSL_ia32cap;
+ char *env;
+
+	if ((env=getenv("OPENSSL_ia32cap")) != NULL)
+ OPENSSL_ia32cap = strtol (env,NULL,0);
+ }
+#endif
+
+ fprintf(stdout,"Testing SHA-512 ");
+
+ SHA512((unsigned char *)"abc",3,md);
+ if (memcmp(md,app_c1,sizeof(app_c1)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 1 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA512((unsigned char *)"abcdefgh""bcdefghi""cdefghij""defghijk"
+ "efghijkl""fghijklm""ghijklmn""hijklmno"
+ "ijklmnop""jklmnopq""klmnopqr""lmnopqrs"
+ "mnopqrst""nopqrstu",112,md);
+ if (memcmp(md,app_c2,sizeof(app_c2)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 2 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA512_Init(&ctx);
+ for (i=0;i<1000000;i+=288)
+ SHA512_Update(&ctx, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
+ (1000000-i)<288?1000000-i:288);
+ SHA512_Final(md,&ctx);
+
+ if (memcmp(md,app_c3,sizeof(app_c3)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 3 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ fprintf(stdout," passed.\n"); fflush(stdout);
+
+ fprintf(stdout,"Testing SHA-384 ");
+
+ SHA384((unsigned char *)"abc",3,md);
+ if (memcmp(md,app_d1,sizeof(app_d1)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 1 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA384((unsigned char *)"abcdefgh""bcdefghi""cdefghij""defghijk"
+ "efghijkl""fghijklm""ghijklmn""hijklmno"
+ "ijklmnop""jklmnopq""klmnopqr""lmnopqrs"
+ "mnopqrst""nopqrstu",112,md);
+ if (memcmp(md,app_d2,sizeof(app_d2)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 2 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ SHA384_Init(&ctx);
+ for (i=0;i<1000000;i+=64)
+ SHA512_Update(&ctx, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
+ "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
+ (1000000-i)<64?1000000-i:64);
+ SHA384_Final(md,&ctx);
+
+ if (memcmp(md,app_d3,sizeof(app_d3)))
+ { fflush(stdout);
+ fprintf(stderr,"\nTEST 3 of 3 failed.\n");
+ return 1;
+ }
+ else
+ fprintf(stdout,"."); fflush(stdout);
+
+ fprintf(stdout," passed.\n"); fflush(stdout);
+
+ return 0;
+}
diff --git a/doc/crypto/OPENSSL_Applink.pod b/doc/crypto/OPENSSL_Applink.pod
new file mode 100644
index 0000000000..f15fb695eb
--- /dev/null
+++ b/doc/crypto/OPENSSL_Applink.pod
@@ -0,0 +1,21 @@
+=pod
+
+=head1 NAME
+
+OPENSSL_Applink
+
+=head1 SYNOPSIS
+
+ __declspec(dllexport) void **OPENSSL_Applink();
+
+=head1 DESCRIPTION
+
+OPENSSL_Applink is an application-side interface which provides glue
+between the OpenSSL BIO layer and the Win32 compiler run-time
+environment. Even though it appears on the application side, it is
+essentially an OpenSSL-private interface. For this reason application
+developers are not expected to implement it, but rather to compile the
+provided module with the compiler of their choice and link it into the
+target application. The module in question is available as
+<openssl>/ms/applink.c.
+
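+For illustration only, a hypothetical build of an application app.c
+might look as follows (paths, file names and compiler flags are
+examples, not requirements):
+
+ cl /MD app.c \path\to\openssl\ms\applink.c /link libeay32.lib
+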
+=cut
diff --git a/ms/applink.c b/ms/applink.c
new file mode 100644
index 0000000000..4333d2639d
--- /dev/null
+++ b/ms/applink.c
@@ -0,0 +1,45 @@
+#define APPLINK_STDIN 1
+#define APPLINK_STDOUT 2
+#define APPLINK_STDERR 3
+#define APPLINK_FPRINTF 4
+#define APPLINK_FGETS 5
+#define APPLINK_FREAD 6
+#define APPLINK_FWRITE 7
+#define APPLINK_FSETMOD 8
+#define APPLINK_FEOF 9
+#define APPLINK_FCLOSE 10 /* should not be used */
+#define APPLINK_MAX 10 /* always same as last macro */
+
+#ifndef APPMACROS_ONLY
+#include <stdio.h>
+#include <io.h>
+#include <fcntl.h>
+
+static void *app_stdin() { return stdin; }
+static void *app_stdout() { return stdout; }
+static void *app_stderr() { return stderr; }
+static int app_feof(FILE *fp) { return feof(fp); }
+static int app_fsetmod(FILE *fp,char mod)
+{ return _setmode (_fileno(fp),mod=='b'?_O_BINARY:_O_TEXT); }
+
+__declspec(dllexport) void **OPENSSL_Applink()
+{ static int once=1;
+ static void *OPENSSL_ApplinkTable[APPLINK_MAX+1]={(void *)APPLINK_MAX};
+
+ if (once)
+ { OPENSSL_ApplinkTable[APPLINK_STDIN] = app_stdin;
+ OPENSSL_ApplinkTable[APPLINK_STDOUT] = app_stdout;
+ OPENSSL_ApplinkTable[APPLINK_STDERR] = app_stderr;
+ OPENSSL_ApplinkTable[APPLINK_FPRINTF] = fprintf;
+ OPENSSL_ApplinkTable[APPLINK_FGETS] = fgets;
+ OPENSSL_ApplinkTable[APPLINK_FREAD] = fread;
+ OPENSSL_ApplinkTable[APPLINK_FWRITE] = fwrite;
+ OPENSSL_ApplinkTable[APPLINK_FSETMOD] = app_fsetmod;
+ OPENSSL_ApplinkTable[APPLINK_FEOF] = app_feof;
+ OPENSSL_ApplinkTable[APPLINK_FCLOSE] = fclose;
+ once = 0;
+ }
+
+ return OPENSSL_ApplinkTable;
+}
+#endif
diff --git a/ms/uplink.c b/ms/uplink.c
new file mode 100644
index 0000000000..c839f9b087
--- /dev/null
+++ b/ms/uplink.c
@@ -0,0 +1,168 @@
+#if defined(_WIN64) && !defined(UNICODE)
+#define UNICODE
+#endif
+#if defined(UNICODE) && !defined(_UNICODE)
+#define _UNICODE
+#endif
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+#if defined(_MSC_VER) && !defined(_WIN32_WINNT)
+#define _WIN32_WINNT 0x0333 /* 3.51 */
+#endif
+
+#include <windows.h>
+#include <tchar.h>
+#include <stdio.h>
+#include <malloc.h>
+#include "uplink.h"
+
+#ifdef _MSC_VER
+#pragma comment(lib,"delayimp")
+/*
+ * The CL command line should also be complemented with the following:
+ *
+ * /link /delayload:advapi32.dll /delayload:user32.dll
+ *
+ * This is required if we want to support Win9x.  With the DLLs in
+ * question delay-loaded, all we have to do is make sure NT-specific
+ * functions are not actually called under Win9x.
+ */
+#endif
+
+#if defined(_WIN32_WINNT) && _WIN32_WINNT>=0x0333
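+/* Returns 1 if the process appears to be running as a service, i.e.
+ * in a window station whose name looks like "Service-0x...", 0 if it
+ * appears interactive, or -1 on error. */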
+int IsService()
+{ HWINSTA h;
+ DWORD len;
+ WCHAR *name;
+
+ GetDesktopWindow(); /* return value is ignored */
+
+ h = GetProcessWindowStation();
+ if (h==NULL) return -1;
+
+ if (GetUserObjectInformationW (h,UOI_NAME,NULL,0,&len) ||
+ GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+ return -1;
+
+ if (len>512) return -1; /* paranoia */
+ len++,len&=~1; /* paranoia */
+#ifdef _MSC_VER
+ name=(WCHAR *)_alloca(len+sizeof(WCHAR));
+#else
+ name=(WCHAR *)alloca(len+sizeof(WCHAR));
+#endif
+ if (!GetUserObjectInformationW (h,UOI_NAME,name,len,&len))
+ return -1;
+
+ len++,len&=~1; /* paranoia */
+ name[len/sizeof(WCHAR)]=L'\0'; /* paranoia */
+#if 1
+ /* This doesn't cover "interactive" services [working with real
+ * WinSta0's] nor programs started non-interactively by Task
+ * Scheduler [those are working with SAWinSta]. */
+ if (wcsstr(name,L"Service-0x")) return 1;
+#else
+ /* This covers all non-interactive programs such as services. */
+ if (!wcsstr(name,L"WinSta0")) return 1;
+#endif
+ else return 0;
+}
+#endif
+
+static TCHAR msg[128];
+
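+/* Reports the fatal uplink error prepared in 'msg': to the NT event
+ * log when running as a service, otherwise in a message box; then
+ * terminates the process. */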
+static void unimplemented ()
+{
+#if defined(_WIN32_WINNT) && _WIN32_WINNT>=0x0333
+ /* this -------------v--- guards NT-specific calls */
+ if (GetVersion() < 0x80000000 && IsService())
+ { HANDLE h = RegisterEventSource(0,_T("OPENSSL"));
+ TCHAR *pmsg=msg;
+ ReportEvent(h,EVENTLOG_ERROR_TYPE,0,0,0,1,0,&pmsg,0);
+ DeregisterEventSource(h);
+ }
+ else
+#endif
+ { MSGBOXPARAMS m;
+
+ m.cbSize = sizeof(m);
+ m.hwndOwner = NULL;
+ m.lpszCaption = _T("OpenSSL: FATAL");
+ m.dwStyle = MB_OK;
+ m.hInstance = NULL;
+ m.lpszIcon = IDI_ERROR;
+ m.dwContextHelpId = 0;
+ m.lpfnMsgBoxCallback = NULL;
+ m.dwLanguageId = MAKELANGID(LANG_ENGLISH,SUBLANG_ENGLISH_US);
+ m.lpszText = msg;
+
+ MessageBoxIndirect (&m);
+ }
+ ExitProcess (1);
+}
+
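+/*
+ * Called by a lazy stub the first time uplink slot 'index' is
+ * exercised: locate OPENSSL_Applink in the host application, fetch
+ * its table and patch table[index] with the application-side pointer.
+ * On any failure the slot is left pointing at 'unimplemented', which
+ * reports the error and terminates the process.
+ */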
+void OPENSSL_Uplink (void **table, int index)
+{ static HMODULE app=NULL;
+ static void **applinktable=NULL;
+ int len;
+
+ len = _stprintf (msg,_T("OPENSSL_Uplink(%p,%02X): "),table,index);
+ _tcscpy (msg+len,_T("unimplemented function"));
+ table [index] = unimplemented;
+
+ if (app==NULL && (app=GetModuleHandle(NULL))==NULL)
+ { app=(HMODULE)-1; _tcscpy (msg+len,_T("no host application"));
+ return;
+ }
+ else if (app==(HMODULE)-1) { return; }
+
+ if (applinktable==NULL)
+ { void**(*applink)();
+
+ applink=(void**(*)())GetProcAddress(app,"OPENSSL_Applink");
+ if (applink==NULL)
+ { app=(HMODULE)-1; _tcscpy (msg+len,_T("no OPENSSL_Applink"));
+ return;
+ }
+ applinktable = (*applink)();
+ if (applinktable==NULL)
+ { app=(HMODULE)-1; _tcscpy (msg+len,_T("no ApplinkTable"));
+ return;
+ }
+ }
+
+ if (index > (int)(size_t)applinktable[0]) { return; }
+
+ if (applinktable[index]) table[index] = applinktable[index];
+}
+
+#if defined(_MSC_VER) && defined(_M_IX86)
+#define LAZY(i) \
+__declspec(naked) static void lazy##i () { \
+ _asm push i \
+ _asm push OFFSET OPENSSL_UplinkTable \
+ _asm call OPENSSL_Uplink \
+ _asm add esp,8 \
+ _asm jmp OPENSSL_UplinkTable+4*i }
+
+#if APPLINK_MAX>20
+#error "Add more stubs..."
+#endif
+/* make some in advance... */
+LAZY(1) LAZY(2) LAZY(3) LAZY(4) LAZY(5)
+LAZY(6) LAZY(7) LAZY(8) LAZY(9) LAZY(10)
+LAZY(11) LAZY(12) LAZY(13) LAZY(14) LAZY(15)
+LAZY(16) LAZY(17) LAZY(18) LAZY(19) LAZY(20)
+void *OPENSSL_UplinkTable[] = {
+ (void *)APPLINK_MAX,
+ lazy1, lazy2, lazy3, lazy4, lazy5,
+ lazy6, lazy7, lazy8, lazy9, lazy10,
+ lazy11,lazy12,lazy13,lazy14,lazy15,
+ lazy16,lazy17,lazy18,lazy19,lazy20,
+};
+#endif
+
+#ifdef SELFTEST
+int main() { UP_fprintf(UP_stdout,"hello, world!\n"); return 0; }
+#endif
diff --git a/ms/uplink.h b/ms/uplink.h
new file mode 100644
index 0000000000..3e9911ab93
--- /dev/null
+++ b/ms/uplink.h
@@ -0,0 +1,14 @@
+#define APPMACROS_ONLY
+#include "applink.c"
+
+extern void *OPENSSL_UplinkTable[];
+#define UP_stdin (*(void *(*)())OPENSSL_UplinkTable[APPLINK_STDIN])()
+#define UP_stdout (*(void *(*)())OPENSSL_UplinkTable[APPLINK_STDOUT])()
+#define UP_stderr (*(void *(*)())OPENSSL_UplinkTable[APPLINK_STDERR])()
+#define UP_fprintf (*(int (*)(void *,const char *,...))OPENSSL_UplinkTable[APPLINK_FPRINTF])
+#define UP_fgets (*(char *(*)(char *,int,void *))OPENSSL_UplinkTable[APPLINK_FGETS])
+#define UP_fread (*(size_t (*)(void *,size_t,size_t,void *))OPENSSL_UplinkTable[APPLINK_FREAD])
+#define UP_fwrite (*(size_t (*)(void *,size_t,size_t,void *))OPENSSL_UplinkTable[APPLINK_FWRITE])
+#define UP_fsetmod (*(int (*)(void *,char))OPENSSL_UplinkTable[APPLINK_FSETMOD])
+#define UP_feof (*(int (*)(void *))OPENSSL_UplinkTable[APPLINK_FEOF])
+#define UP_fclose (*(int (*)(void *))OPENSSL_UplinkTable[APPLINK_FCLOSE])
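+
+/*
+ * Usage sketch (illustration only, not part of this header): inside
+ * OpenSSL a call such as
+ *
+ *	UP_fprintf(UP_stdout,"error %d\n",err);
+ *
+ * resolves both stdout and fprintf through OPENSSL_UplinkTable, i.e.
+ * through the hosting application's run-time rather than the DLL's.
+ */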
diff --git a/ms/uplink.pl b/ms/uplink.pl
new file mode 100755
index 0000000000..801f6e01fa
--- /dev/null
+++ b/ms/uplink.pl
@@ -0,0 +1,177 @@
+#!/usr/bin/env perl
+#
+# For Microsoft CL this is implemented as inline assembler.  So even
+# though this script can generate Win32 code as well, we'll be using
+# it primarily to generate Win64 modules.  Both IA-64 and AMD64 are
+# supported...
+
+# pull APPLINK_MAX value from applink.c...
+$applink_c=$0;
+$applink_c=~s|[^/\\]+$||g;
+$applink_c.="applink.c";
+open(INPUT,$applink_c) || die "can't open $applink_c: $!";
+@max=grep {/APPLINK_MAX\s+(\d+)/} <INPUT>;
+close(INPUT);
+($#max==0) or die "can't find APPLINK_MAX in $applink_c";
+
+$max[0]=~/APPLINK_MAX\s+(\d+)/;
+$N=$1; # number of entries in OPENSSL_UplinkTable not including
+ # OPENSSL_UplinkTable[0], which contains this value...
+
+# The idea is to fill OPENSSL_UplinkTable with pointers to stubs
+# which invoke 'void OPENSSL_Uplink (void **table,int index)' and
+# then dereference themselves.  The latter would result in an endless
+# loop *unless* OPENSSL_Uplink replaces 'table[index]' with something
+# else, e.g. 'table[index]=unimplemented;'...
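+#
+# In C-like pseudo-code each generated stub amounts to the following
+# (a sketch only; the real stubs are assembler so that the caller's
+# original arguments survive the tail jump):
+#
+#	lazyN:	OPENSSL_Uplink(OPENSSL_UplinkTable,N);
+#		goto *OPENSSL_UplinkTable[N];	/* with caller's args */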
+
+$arg = shift;
+#( defined shift || open STDOUT,">$arg" ) || die "can't open $arg: $!";
+
+if ($arg =~ /win32n/) { ia32nasm(); }
+elsif ($arg =~ /win32/) { ia32masm(); }
+elsif ($arg =~ /ia64/) { ia64ias(); }
+elsif ($arg =~ /amd64/) { amd64masm(); }
+else { die "nonsense $arg"; }
+
+sub ia32masm() {
+print <<___;
+.386P
+.model FLAT
+
+_DATA SEGMENT
+PUBLIC _OPENSSL_UplinkTable
+_OPENSSL_UplinkTable DD $N ; amount of following entries
+___
+for ($i=1;$i<=$N;$i++) { print " DD FLAT:\$lazy$i\n"; }
+print <<___;
+_DATA ENDS
+
+_TEXT SEGMENT
+EXTRN _OPENSSL_Uplink:NEAR
+___
+for ($i=1;$i<=$N;$i++) {
+print <<___;
+ALIGN 4
+\$lazy$i PROC NEAR
+ push $i
+ push OFFSET FLAT:_OPENSSL_UplinkTable
+ call _OPENSSL_Uplink
+ add esp,8
+ jmp DWORD PTR _OPENSSL_UplinkTable+4*$i
+\$lazy$i ENDP
+___
+}
+print <<___;
+ALIGN 4
+_TEXT ENDS
+END
+___
+}
+
+sub ia32nasm() {
+print <<___;
+SEGMENT .data
+GLOBAL _OPENSSL_UplinkTable
+_OPENSSL_UplinkTable DD $N ; amount of following entries
+___
+for ($i=1;$i<=$N;$i++) { print " DD \$lazy$i\n"; }
+print <<___;
+
+SEGMENT .text
+EXTERN _OPENSSL_Uplink
+___
+for ($i=1;$i<=$N;$i++) {
+print <<___;
+ALIGN 4
+\$lazy$i:
+ push $i
+ push _OPENSSL_UplinkTable
+ call _OPENSSL_Uplink
+ add esp,8
+ jmp [_OPENSSL_UplinkTable+4*$i]
+___
+}
+print <<___;
+ALIGN 4
+END
+___
+}
+
+sub ia64ias () {
+local $V=8; # max number of args uplink functions may accept...
+print <<___;
+.data
+.global OPENSSL_UplinkTable#
+OPENSSL_UplinkTable: data8 $N // amount of following entries
+___
+for ($i=1;$i<=$N;$i++) { print " data8 \@fptr(lazy$i#)\n"; }
+print <<___;
+.size OPENSSL_UplinkTable,.-OPENSSL_UplinkTable#
+
+.text
+.global OPENSSL_Uplink#
+.type OPENSSL_Uplink#,\@function
+___
+for ($i=1;$i<=$N;$i++) {
+print <<___;
+.proc lazy$i
+lazy$i:
+{ .mii; alloc loc0=ar.pfs,$V,3,2,0
+ mov loc1=b0
+ addl loc2=\@ltoff(OPENSSL_UplinkTable#),gp };;
+{ .mmi; ld8 out0=[loc2]
+ mov out1=$i };;
+{ .mib; adds loc2=8*$i,out0
+ br.call.sptk.many b0=OPENSSL_Uplink# };;
+{ .mmi; ld8 r31=[loc2];;
+ ld8 r30=[r31],8 };;
+{ .mii; ld8 gp=[r31]
+ mov b6=r30
+ mov b0=loc1 };;
+{ .mib; mov ar.pfs=loc0
+ br.many b6 };;
+.endp lazy$i#
+___
+}
+}
+
+sub amd64masm() {
+print <<___;
+_DATA SEGMENT
+PUBLIC OPENSSL_UplinkTable
+OPENSSL_UplinkTable DQ $N
+___
+for ($i=1;$i<=$N;$i++) { print " DQ FLAT:\$lazy$i\n"; }
+print <<___;
+_DATA ENDS
+
+_TEXT SEGMENT
+EXTERN OPENSSL_Uplink:NEAR
+___
+for ($i=1;$i<=$N;$i++) {
+print <<___;
+ALIGN 4
+\$lazy$i PROC NEAR
+ push r9
+ push r8
+ push rdx
+ push rcx
+ sub rsp,40
+ mov rcx,OFFSET FLAT:OPENSSL_UplinkTable
+ mov rdx,$i
+ call OPENSSL_Uplink
+ add rsp,40
+ pop rcx
+ pop rdx
+ pop r8
+ pop r9
+ jmp QWORD PTR OPENSSL_UplinkTable+8*$i
+\$lazy$i ENDP
+___
+}
+print <<___;
+_TEXT ENDS
+END
+___
+}
+