1 files changed, 672 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/checksum/power8/vec_crc32.c b/src/third_party/wiredtiger/src/checksum/power8/vec_crc32.c
new file mode 100644
index 00000000000..4356d505007
--- /dev/null
+++ b/src/third_party/wiredtiger/src/checksum/power8/vec_crc32.c
@@ -0,0 +1,672 @@
+#include <wiredtiger_config.h>
+#if defined(__powerpc64__) && !defined(HAVE_NO_CRC32_HARDWARE)
+/*
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ *
+ * Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+
+#include <altivec.h>
+
+#define POWER8_INTRINSICS
+#define CRC_TABLE
+
+#include "crc32_constants.h"
+
+#define VMX_ALIGN 16UL
+#define VMX_ALIGN_MASK (VMX_ALIGN - 1)
+
+#ifdef REFLECT
+/*
+ * crc32_align --
+ *     Align helper for CRC32 functions.
+ */
+static unsigned int
+crc32_align(unsigned int crc, const unsigned char *p, unsigned long len)
+{
+    while (len--)
+        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+    return crc;
+}
+#else
+/*
+ * crc32_align --
+ *     Align helper for CRC32 functions.
+ */
+static unsigned int
+crc32_align(unsigned int crc, const unsigned char *p, unsigned long len)
+{
+    while (len--)
+        crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8);
+    return crc;
+}
+#endif
+
+static unsigned int __attribute__((aligned(32)))
+__crc32_vpmsum(unsigned int crc, const void *p, unsigned long len);
+
+/* -Werror=missing-prototypes */
+unsigned int crc32_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
+
+/*
+ * crc32_vpmsum --
+ *     VPM sum helper for CRC32 functions.
+ */
+unsigned int
+crc32_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len)
+{
+    unsigned int prealign;
+    unsigned int tail;
+
+#ifdef CRC_XOR
+    crc ^= 0xffffffff;
+#endif
+
+    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+        crc = crc32_align(crc, p, len);
+        goto out;
+    }
+
+    if ((unsigned long)p & VMX_ALIGN_MASK) {
+        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+        crc = crc32_align(crc, p, prealign);
+        len -= prealign;
+        p += prealign;
+    }
+
+    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+
+    tail = len & VMX_ALIGN_MASK;
+    if (tail) {
+        p += len & ~VMX_ALIGN_MASK;
+        crc = crc32_align(crc, p, tail);
+    }
+
+out:
+#ifdef CRC_XOR
+    crc ^= 0xffffffff;
+#endif
+
+    return crc;
+}
+
+#if defined(__clang__)
+#include "clang_workaround.h"
+#else
+#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128((a), (b))
+#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128((vector __int128_t)(a), 0)
+#ifndef REFLECT
+#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128((vector __int128_t)(a), 1)
+#endif
+#endif
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that foward is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char)vc)
+#if defined(__LITTLE_ENDIAN__)
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const
+  __attribute__((aligned(16))) = {0x08090A0B0C0D0E0FUL, 0x0001020304050607UL};
+#else
+static const __vector unsigned long long vperm_const
+  __attribute__((aligned(16))) = {0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL};
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc)
+#endif
+
+static unsigned int __attribute__((aligned(32)))
+__crc32_vpmsum(unsigned int crc, const void *p, unsigned long len)
+{
+
+    const __vector unsigned long long vzero = {0, 0};
+    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+#ifdef REFLECT
+    const __vector unsigned long long vmask_32bit = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+#endif
+
+    const __vector unsigned long long vmask_64bit = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+    __vector unsigned long long vcrc;
+
+    __vector unsigned long long vconst1, vconst2;
+
+    /* vdata0-vdata7 will contain our data (p). */
+    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+    /* v0-v7 will contain our checksums */
+    __vector unsigned long long v0 = {0, 0};
+    __vector unsigned long long v1 = {0, 0};
+    __vector unsigned long long v2 = {0, 0};
+    __vector unsigned long long v3 = {0, 0};
+    __vector unsigned long long v4 = {0, 0};
+    __vector unsigned long long v5 = {0, 0};
+    __vector unsigned long long v6 = {0, 0};
+    __vector unsigned long long v7 = {0, 0};
+
+    /* Vector auxiliary variables. */
+    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+    unsigned int result = 0;
+    unsigned int offset; /* Constant table offset. */
+
+    unsigned long i; /* Counter. */
+    unsigned long chunks;
+
+    unsigned long block_size;
+    int next_block = 0;
+
+    /* Align by 128 bits. The last 128 bit block will be processed at end. */
+    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+#ifdef REFLECT
+    __vector unsigned char vsht_splat;
+#endif
+
+#ifdef REFLECT
+    vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc);
+#else
+    vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL);
+
+    /* Shift into top 32 bits */
+    vcrc = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)vcrc, (__vector unsigned char)vzero, 4);
+#endif
+
+    /* Short version. */
+    if (len < 256) {
+        /* Calculate where in the constant table we need to start. */
+        offset = 256 - len;
+
+        vconst1 = vec_ld(offset, vcrc_short_const);
+        vdata0 = vec_ld(0, (__vector unsigned long long *)p);
+        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+        /* xor initial value*/
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        vdata0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+        v0 = vec_xor(v0, vdata0);
+
+        for (i = 16; i < len; i += 16) {
+            vconst1 = vec_ld(offset + i, vcrc_short_const);
+            vdata0 = vec_ld(i, (__vector unsigned long long *)p);
+            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+            vdata0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+              (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+            v0 = vec_xor(v0, vdata0);
+        }
+    } else {
+
+        /* Load initial values. */
+        vdata0 = vec_ld(0, (__vector unsigned long long *)p);
+        vdata1 = vec_ld(16, (__vector unsigned long long *)p);
+
+        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+        vdata2 = vec_ld(32, (__vector unsigned long long *)p);
+        vdata3 = vec_ld(48, (__vector unsigned long long *)p);
+
+        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+        vdata4 = vec_ld(64, (__vector unsigned long long *)p);
+        vdata5 = vec_ld(80, (__vector unsigned long long *)p);
+
+        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+        vdata6 = vec_ld(96, (__vector unsigned long long *)p);
+        vdata7 = vec_ld(112, (__vector unsigned long long *)p);
+
+        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+        /* xor in initial value */
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        p = (char *)p + 128;
+
+        do {
+            /* Checksum in blocks of MAX_SIZE. */
+            block_size = length;
+            if (block_size > MAX_SIZE) {
+                block_size = MAX_SIZE;
+            }
+
+            length = length - block_size;
+
+            /*
+             * Work out the offset into the constants table to start at. Each constant is 16 bytes,
+             * and it is used against 128 bytes of input data - 128 / 16 = 8
+             */
+            offset = (MAX_SIZE / 8) - (block_size / 8);
+            /* We reduce our final 128 bytes in a separate step */
+            chunks = (block_size / 128) - 1;
+
+            vconst1 = vec_ld(offset, vcrc_const);
+
+            va0 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata0, (__vector unsigned long long)vconst1);
+            va1 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata1, (__vector unsigned long long)vconst1);
+            va2 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata2, (__vector unsigned long long)vconst1);
+            va3 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata3, (__vector unsigned long long)vconst1);
+            va4 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1);
+            va5 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1);
+            va6 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1);
+            va7 = __builtin_crypto_vpmsumd(
+              (__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1);
+
+            if (chunks > 1) {
+                offset += 16;
+                vconst2 = vec_ld(offset, vcrc_const);
+                GROUP_ENDING_NOP;
+
+                vdata0 = vec_ld(0, (__vector unsigned long long *)p);
+                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+                vdata1 = vec_ld(16, (__vector unsigned long long *)p);
+                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+                vdata2 = vec_ld(32, (__vector unsigned long long *)p);
+                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+                vdata3 = vec_ld(48, (__vector unsigned long long *)p);
+                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                vdata4 = vec_ld(64, (__vector unsigned long long *)p);
+                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+                vdata5 = vec_ld(80, (__vector unsigned long long *)p);
+                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+                vdata6 = vec_ld(96, (__vector unsigned long long *)p);
+                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+                vdata7 = vec_ld(112, (__vector unsigned long long *)p);
+                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                p = (char *)p + 128;
+
+                /*
+                 * main loop. We modulo schedule it such that it takes three iterations to complete
+                 * - first iteration load, second iteration vpmsum, third iteration xor.
+                 */
+                for (i = 0; i < chunks - 2; i++) {
+                    vconst1 = vec_ld(offset, vcrc_const);
+                    offset += 16;
+                    GROUP_ENDING_NOP;
+
+                    v0 = vec_xor(v0, va0);
+                    va0 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata0, (__vector unsigned long long)vconst2);
+                    vdata0 = vec_ld(0, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v1 = vec_xor(v1, va1);
+                    va1 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata1, (__vector unsigned long long)vconst2);
+                    vdata1 = vec_ld(16, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v2 = vec_xor(v2, va2);
+                    va2 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata2, (__vector unsigned long long)vconst2);
+                    vdata2 = vec_ld(32, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v3 = vec_xor(v3, va3);
+                    va3 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata3, (__vector unsigned long long)vconst2);
+                    vdata3 = vec_ld(48, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                    vconst2 = vec_ld(offset, vcrc_const);
+                    GROUP_ENDING_NOP;
+
+                    v4 = vec_xor(v4, va4);
+                    va4 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1);
+                    vdata4 = vec_ld(64, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v5 = vec_xor(v5, va5);
+                    va5 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1);
+                    vdata5 = vec_ld(80, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v6 = vec_xor(v6, va6);
+                    va6 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1);
+                    vdata6 = vec_ld(96, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v7 = vec_xor(v7, va7);
+                    va7 = __builtin_crypto_vpmsumd(
+                      (__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1);
+                    vdata7 = vec_ld(112, (__vector unsigned long long *)p);
+                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                    p = (char *)p + 128;
+                }
+
+                /* First cool down*/
+                vconst1 = vec_ld(offset, vcrc_const);
+                offset += 16;
+
+                v0 = vec_xor(v0, va0);
+                va0 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata0, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v1 = vec_xor(v1, va1);
+                va1 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata1, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v2 = vec_xor(v2, va2);
+                va2 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata2, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v3 = vec_xor(v3, va3);
+                va3 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata3, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v4 = vec_xor(v4, va4);
+                va4 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v5 = vec_xor(v5, va5);
+                va5 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v6 = vec_xor(v6, va6);
+                va6 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v7 = vec_xor(v7, va7);
+                va7 = __builtin_crypto_vpmsumd(
+                  (__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1);
+            } /* else */
+
+            /* Second cool down. */
+            v0 = vec_xor(v0, va0);
+            v1 = vec_xor(v1, va1);
+            v2 = vec_xor(v2, va2);
+            v3 = vec_xor(v3, va3);
+            v4 = vec_xor(v4, va4);
+            v5 = vec_xor(v5, va5);
+            v6 = vec_xor(v6, va6);
+            v7 = vec_xor(v7, va7);
+
+#ifdef REFLECT
+            /*
+             * vpmsumd produces a 96 bit result in the least significant bits of the register. Since
+             * we are bit reflected we have to shift it left 32 bits so it occupies the least
+             * significant bits in the bit reflected domain.
+             */
+            v0 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v0, (__vector unsigned char)vzero, 4);
+            v1 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v1, (__vector unsigned char)vzero, 4);
+            v2 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v2, (__vector unsigned char)vzero, 4);
+            v3 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v3, (__vector unsigned char)vzero, 4);
+            v4 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v4, (__vector unsigned char)vzero, 4);
+            v5 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v5, (__vector unsigned char)vzero, 4);
+            v6 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v6, (__vector unsigned char)vzero, 4);
+            v7 = (__vector unsigned long long)vec_sld(
+              (__vector unsigned char)v7, (__vector unsigned char)vzero, 4);
+#endif
+
+            /* xor with the last 1024 bits. */
+            va0 = vec_ld(0, (__vector unsigned long long *)p);
+            VEC_PERM(va0, va0, va0, vperm_const);
+
+            va1 = vec_ld(16, (__vector unsigned long long *)p);
+            VEC_PERM(va1, va1, va1, vperm_const);
+
+            va2 = vec_ld(32, (__vector unsigned long long *)p);
+            VEC_PERM(va2, va2, va2, vperm_const);
+
+            va3 = vec_ld(48, (__vector unsigned long long *)p);
+            VEC_PERM(va3, va3, va3, vperm_const);
+
+            va4 = vec_ld(64, (__vector unsigned long long *)p);
+            VEC_PERM(va4, va4, va4, vperm_const);
+
+            va5 = vec_ld(80, (__vector unsigned long long *)p);
+            VEC_PERM(va5, va5, va5, vperm_const);
+
+            va6 = vec_ld(96, (__vector unsigned long long *)p);
+            VEC_PERM(va6, va6, va6, vperm_const);
+
+            va7 = vec_ld(112, (__vector unsigned long long *)p);
+            VEC_PERM(va7, va7, va7, vperm_const);
+
+            p = (char *)p + 128;
+
+            vdata0 = vec_xor(v0, va0);
+            vdata1 = vec_xor(v1, va1);
+            vdata2 = vec_xor(v2, va2);
+            vdata3 = vec_xor(v3, va3);
+            vdata4 = vec_xor(v4, va4);
+            vdata5 = vec_xor(v5, va5);
+            vdata6 = vec_xor(v6, va6);
+            vdata7 = vec_xor(v7, va7);
+
+            /* Check if we have more blocks to process */
+            next_block = 0;
+            if (length != 0) {
+                next_block = 1;
+
+                /* zero v0-v7 */
+                v0 = vec_xor(v0, v0);
+                v1 = vec_xor(v1, v1);
+                v2 = vec_xor(v2, v2);
+                v3 = vec_xor(v3, v3);
+                v4 = vec_xor(v4, v4);
+                v5 = vec_xor(v5, v5);
+                v6 = vec_xor(v6, v6);
+                v7 = vec_xor(v7, v7);
+            }
+            length = length + 128;
+
+        } while (next_block);
+
+        /* Calculate how many bytes we have left. */
+        length = (len & 127);
+
+        /* Calculate where in (short) constant table we need to start. */
+        offset = 128 - length;
+
+        v0 = vec_ld(offset, vcrc_short_const);
+        v1 = vec_ld(offset + 16, vcrc_short_const);
+        v2 = vec_ld(offset + 32, vcrc_short_const);
+        v3 = vec_ld(offset + 48, vcrc_short_const);
+        v4 = vec_ld(offset + 64, vcrc_short_const);
+        v5 = vec_ld(offset + 80, vcrc_short_const);
+        v6 = vec_ld(offset + 96, vcrc_short_const);
+        v7 = vec_ld(offset + 112, vcrc_short_const);
+
+        offset += 128;
+
+        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+          (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+        /* Now reduce the tail (0-112 bytes). */
+        for (i = 0; i < length; i += 16) {
+            vdata0 = vec_ld(i, (__vector unsigned long long *)p);
+            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+            va0 = vec_ld(offset + i, vcrc_short_const);
+            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+              (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+            v0 = vec_xor(v0, va0);
+        }
+
+        /* xor all parallel chunks together. */
+        v0 = vec_xor(v0, v1);
+        v2 = vec_xor(v2, v3);
+        v4 = vec_xor(v4, v5);
+        v6 = vec_xor(v6, v7);
+
+        v0 = vec_xor(v0, v2);
+        v4 = vec_xor(v4, v6);
+
+        v0 = vec_xor(v0, v4);
+    }
+
+    /* Barrett Reduction */
+    vconst1 = vec_ld(0, v_Barrett_const);
+    vconst2 = vec_ld(16, v_Barrett_const);
+
+    v1 = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)v0, (__vector unsigned char)v0, 8);
+    v0 = vec_xor(v1, v0);
+
+#ifdef REFLECT
+    /* shift left one bit */
+    vsht_splat = vec_splat_u8(1);
+    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+#endif
+
+    v0 = vec_and(v0, vmask_64bit);
+
+#ifndef REFLECT
+
+    /*
+     * Now for the actual algorithm. The idea is to calculate q, the multiple of our polynomial that
+     * we need to subtract. By doing the computation 2x bits higher (ie 64 bits) and shifting the
+     * result back down 2x bits, we round down to the nearest multiple.
+     */
+
+    /* ma */
+    v1 = __builtin_crypto_vpmsumd(
+      (__vector unsigned long long)v0, (__vector unsigned long long)vconst1);
+    /* q = floor(ma/(2^64)) */
+    v1 = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)vzero, (__vector unsigned char)v1, 8);
+    /* qn */
+    v1 = __builtin_crypto_vpmsumd(
+      (__vector unsigned long long)v1, (__vector unsigned long long)vconst2);
+    /* a - qn, subtraction is xor in GF(2) */
+    v0 = vec_xor(v0, v1);
+    /*
+     * Get the result into r3. We need to shift it left 8 bytes: V0 [ 0 1 2 X ] V0 [ 0 X 2 3 ]
+     */
+    result = __builtin_unpack_vector_1(v0);
+#else
+
+    /*
+     * The reflected version of Barrett reduction. Instead of bit reflecting our data (which is
+     * expensive to do), we bit reflect our constants and our algorithm, which means the
+     * intermediate data in our vector registers goes from 0-63 instead of 63-0. We can reflect the
+     * algorithm because we don't carry in mod 2 arithmetic.
+     */
+
+    /* bottom 32 bits of a */
+    v1 = vec_and(v0, vmask_32bit);
+
+    /* ma */
+    v1 = __builtin_crypto_vpmsumd(
+      (__vector unsigned long long)v1, (__vector unsigned long long)vconst1);
+
+    /* bottom 32bits of ma */
+    v1 = vec_and(v1, vmask_32bit);
+    /* qn */
+    v1 = __builtin_crypto_vpmsumd(
+      (__vector unsigned long long)v1, (__vector unsigned long long)vconst2);
+    /* a - qn, subtraction is xor in GF(2) */
+    v0 = vec_xor(v0, v1);
+
+    /*
+     * Since we are bit reflected, the result (ie the low 32 bits) is in the high 32 bits. We just
+     * need to shift it left 4 bytes V0 [ 0 1 X 3 ] V0 [ 0 X 2 3 ]
+     */
+
+    /* shift result into top 64 bits of */
+    v0 = (__vector unsigned long long)vec_sld(
+      (__vector unsigned char)v0, (__vector unsigned char)vzero, 4);
+
+    result = __builtin_unpack_vector_0(v0);
+#endif
+
+    return result;
+}
+#endif