author     Jeffrey Walton <noloader@gmail.com>  2020-06-29 05:17:59 -0400
committer  Jeffrey Walton <noloader@gmail.com>  2020-06-29 05:17:59 -0400
commit     827f2ebcadd02b3474bdbda23a4d0c733ace8f3b (patch)
tree       dff0bd3c121841e8ec373a224f551c6517a2eda2 /blake2s_simd.cpp
parent     a3aefbb1dcd35260718e0f78aafd3ac7d68b6aeb (diff)
download   cryptopp-git-827f2ebcadd02b3474bdbda23a4d0c733ace8f3b.tar.gz
Speedup BLAKE2s message loading on PowerPC
Diffstat (limited to 'blake2s_simd.cpp')
-rw-r--r--  blake2s_simd.cpp | 60
1 file changed, 56 insertions(+), 4 deletions(-)
diff --git a/blake2s_simd.cpp b/blake2s_simd.cpp
index a2c641b6..2f67f5c8 100644
--- a/blake2s_simd.cpp
+++ b/blake2s_simd.cpp
@@ -992,12 +992,64 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
BLAKE2S_G2(row1,row2,row3,row4,buf4); \
BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);
+ /* Possibly unaligned user messages */
+ uint32x4_p m0, m4, m8, m12;
+ /* Endian conversion mask */
const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
- const uint32x4_p m0 = VecLoad32LE(input + 0, le_mask);
- const uint32x4_p m4 = VecLoad32LE(input + 16, le_mask);
- const uint32x4_p m8 = VecLoad32LE(input + 32, le_mask);
- const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
+#if defined(_ARCH_PWR9)
+ /* POWER9 provides loads for chars and shorts */
+ m0 = (uint32x4_p) vec_xl( 0, CONST_V8_CAST( input ));
+ m4 = (uint32x4_p) vec_xl( 16, CONST_V8_CAST( input ));
+ m8 = (uint32x4_p) vec_xl( 32, CONST_V8_CAST( input ));
+ m12 = (uint32x4_p) vec_xl( 48, CONST_V8_CAST( input ));
+
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ m0 = vec_perm(m0, m0, le_mask);
+ m4 = vec_perm(m4, m4, le_mask);
+ m8 = vec_perm(m8, m8, le_mask);
+ m12 = vec_perm(m12, m12, le_mask);
+# endif
+#else
+ /* Altivec only provides 16-byte aligned loads */
+ /* http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf, Section 3.16 */
+ m0 = (uint32x4_p) vec_ld( 0, CONST_V8_CAST( input ));
+ m4 = (uint32x4_p) vec_ld( 16, CONST_V8_CAST( input ));
+ m8 = (uint32x4_p) vec_ld( 32, CONST_V8_CAST( input ));
+ m12 = (uint32x4_p) vec_ld( 48, CONST_V8_CAST( input ));
+
+ /* Alignment check for load of the message buffer */
+ const uintptr_t addr = (uintptr_t)input;
+ if (addr%16 == 0)
+ {
+ /* Already aligned. Perform a little-endian swap as required */
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ m0 = vec_perm(m0, m0, le_mask);
+ m4 = vec_perm(m4, m4, le_mask);
+ m8 = vec_perm(m8, m8, le_mask);
+ m12 = vec_perm(m12, m12, le_mask);
+# endif
+ }
+ else
+ {
+ /* Not aligned. Fix vectors and perform a little-endian swap as required */
+ // http://mirror.informatimago.com/next/developer.apple.com/
+ // hardwaredrivers/ve/code_optimization.html
+ uint32x4_p ex; uint8x16_p perm;
+ ex = (uint32x4_p) vec_ld(48+15, CONST_V8_CAST( input ));
+ perm = vec_lvsl(0, CONST_V8_CAST( addr ));
+
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ /* Combine the vector permute with the little-endian swap */
+ perm = vec_perm(perm, perm, le_mask);
+# endif
+
+ m0 = vec_perm(m0, m4, perm);
+ m4 = vec_perm(m4, m8, perm);
+ m8 = vec_perm(m8, m12, perm);
+ m12 = vec_perm(m12, ex, perm);
+ }
+#endif
uint32x4_p row1, row2, row3, row4;
uint32x4_p buf1, buf2, buf3, buf4;
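
The added hunk replaces the per-block VecLoad32LE calls with bulk loads of the 64-byte message block: an unaligned vec_xl on POWER9, and aligned vec_ld plus a vec_lvsl/vec_perm fixup on older AltiVec hardware. Below is a minimal standalone sketch of one 16-byte load using that technique with <altivec.h> directly, rather than Crypto++'s ppc_simd.h wrappers. The helper name load_block_le and the endian test via __BYTE_ORDER__ (standing in for CRYPTOPP_BIG_ENDIAN) are assumptions of this sketch, not part of the patch.

/* Sketch: load 16 bytes of a possibly unaligned, little-endian message
   block on PowerPC. Mirrors the two paths in the patch above. */
#include <altivec.h>
#include <stdint.h>

typedef __vector unsigned int  uint32x4_p;
typedef __vector unsigned char uint8x16_p;

static inline uint32x4_p load_block_le(const unsigned char* input)
{
    /* Byte shuffle that swaps each 32-bit word (needed on big-endian only) */
    const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};

#if defined(_ARCH_PWR9)
    /* POWER9: vec_xl tolerates unaligned addresses directly */
    uint32x4_p m = (uint32x4_p)vec_xl(0, input);
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    m = vec_perm(m, m, le_mask);
# endif
    return m;
#else
    /* AltiVec vec_ld silently rounds the address down to a 16-byte
       boundary, so an unaligned block straddles two aligned loads. */
    const uintptr_t addr = (uintptr_t)input;
    uint32x4_p lo = (uint32x4_p)vec_ld(0, input);

    if (addr % 16 == 0)
    {
        /* Already aligned: the single load is the whole block */
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        lo = vec_perm(lo, lo, le_mask);
# endif
        return lo;
    }

    /* vec_lvsl builds the permute mask that selects the 16 wanted bytes
       from the pair of aligned vectors; folding le_mask into that mask
       gives the endian swap for free, as the patch does. */
    uint32x4_p hi   = (uint32x4_p)vec_ld(15, input);
    uint8x16_p perm = vec_lvsl(0, input);
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    perm = vec_perm(perm, perm, le_mask);
# endif
    return vec_perm(lo, hi, perm);
#endif
}

The runtime alignment check preserves a single-load fast path whenever the caller's buffer happens to be 16-byte aligned; only misaligned input pays for the extra load and permute.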