Diffstat (limited to 'ext/hash/murmur/PMurHash128.c')
-rw-r--r--  ext/hash/murmur/PMurHash128.c  640
1 file changed, 640 insertions, 0 deletions
diff --git a/ext/hash/murmur/PMurHash128.c b/ext/hash/murmur/PMurHash128.c
new file mode 100644
index 0000000000..2856542190
--- /dev/null
+++ b/ext/hash/murmur/PMurHash128.c
@@ -0,0 +1,640 @@
+/*-----------------------------------------------------------------------------
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain.
+ *
+ * This is a C implementation of MurmurHash3_128 with support for progressive
+ * processing, based on the PMurHash implementation written by Shane Day.
+ */
+
+/*-----------------------------------------------------------------------------
+
+If you want to understand the MurmurHash algorithm you would be much better
+off reading the original source. Just point your browser at:
+http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+
+
+What this version provides:
+
+1. Progressive data feeding. Useful when the entire payload to be hashed
+does not fit in memory or when the data is streamed through the application.
+Also useful when hashing a number of strings with a common prefix: a partial
+hash of the prefix can be generated once and reused for each suffix string
+(an illustrative call sequence follows this header comment).
+
+How does it work?
+
+We can only process entire 128-bit chunks of input, except for the very end,
+which may be shorter. So along with the partial hash we need to give back to
+the caller a carry containing up to 15 bytes that we were unable to process.
+This carry also needs to record the number of bytes it holds. I use the low
+4 bits as a count (0..15), and the carry bytes are shifted in at the high
+byte in stream order.
+
+To handle endianness I simply use a macro that reads an unsigned int and
+define that macro to be a direct read on little-endian machines and a read
+and swap on big-endian machines.
+
+-----------------------------------------------------------------------------*/
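+
+/* Illustrative call sequence: one way a caller might stream data through the
+ * x64 variant defined below. read_chunk(), buf, n, seed and out are
+ * placeholders, not part of this file's API:
+ *
+ *   uint64_t h[2]     = {seed, seed};   // running hash state
+ *   uint64_t carry[2] = {0, 0};         // unprocessed tail bytes + count
+ *   uint32_t total    = 0;
+ *   while ((n = read_chunk(buf, sizeof buf)) > 0) {
+ *       PMurHash128x64_Process(h, carry, buf, (int)n);
+ *       total += (uint32_t)n;
+ *   }
+ *   PMurHash128x64_Result(h, carry, total, out);  // out[2] receives the hash
+ */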
+
+
+#include "PMurHash128.h"
+
+/*-----------------------------------------------------------------------------
+ * Endianness, misalignment capabilities and util macros
+ *
+ * The following macros are defined in this section. The other macros defined
+ * are only needed to help derive these.
+ *
+ * READ_UINT32(x,i) Read a little endian unsigned 32-bit int at index
+ * READ_UINT64(x,i) Read a little endian unsigned 64-bit int at index
+ * UNALIGNED_SAFE Defined if READ_UINTXX works on non-word boundaries
+ * ROTL32(x,r) Rotate x left by r bits
+ * ROTL64(x,r) Rotate x left by r bits
+ * BIG_CONSTANT(x)  Mark a 64-bit integer literal as unsigned long long
+ * FORCE_INLINE     Compiler-specific qualifier to force a function inline
+ */
+
+/* I386 or AMD64 */
+#if defined(_M_I86) || defined(_M_IX86) || defined(_X86_) || defined(__i386__) || defined(__i386) || defined(i386) \
+ || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64)
+ #define UNALIGNED_SAFE
+#endif
+
+/* Find best way to ROTL */
+#if defined(_MSC_VER)
+ #define FORCE_INLINE static __forceinline
+ #include <stdlib.h> /* Microsoft put _rotl declaration in here */
+ #define ROTL32(x,y) _rotl(x,y)
+ #define ROTL64(x,y) _rotl64(x,y)
+ #define BIG_CONSTANT(x) (x)
+#else
+ #define FORCE_INLINE static inline __attribute__((always_inline))
+ /* gcc recognises this code and generates a rotate instruction for CPUs with one */
+ #define ROTL32(x,r) (((uint32_t)x << r) | ((uint32_t)x >> (32 - r)))
+ #define ROTL64(x,r) (((uint64_t)x << r) | ((uint64_t)x >> (64 - r)))
+ #define BIG_CONSTANT(x) (x##LLU)
+#endif
+
+#include "endianness.h"
+
+#define READ_UINT64(ptr,i) getblock64((uint64_t *)ptr,i)
+#define READ_UINT32(ptr,i) getblock32((uint32_t *)ptr,i)
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32 ( uint32_t h )
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+{
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+/*-----------------------------------------------------------------------------*
+ PMurHash128x86
+ *-----------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------------
+ * Core murmurhash algorithm macros */
+
+static const uint32_t kC1 = 0x239b961b;
+static const uint32_t kC2 = 0xab0e9789;
+static const uint32_t kC3 = 0x38b34ae5;
+static const uint32_t kC4 = 0xa1e38b93;
+
+/* This is the main processing body of the algorithm. It operates
+ * on each full 128-bits of input. */
+#define doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4)\
+do {\
+ k1 *= kC1; k1 = ROTL32(k1,15); k1 *= kC2; h1 ^= k1;\
+\
+ h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;\
+\
+ k2 *= kC2; k2 = ROTL32(k2,16); k2 *= kC3; h2 ^= k2;\
+\
+ h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;\
+\
+ k3 *= kC3; k3 = ROTL32(k3,17); k3 *= kC4; h3 ^= k3;\
+\
+ h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;\
+\
+ k4 *= kC4; k4 = ROTL32(k4,18); k4 *= kC1; h4 ^= k4;\
+\
+ h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;\
+} while(0)
+
+/* Append unaligned bytes to carry, forcing hash churn if we have 16 bytes */
+/* cnt=bytes to process, h1-h4=hash k1-k4=carry, n=bytes in carry, ptr/len=payload */
+#define dobytes128x86(cnt, h1, h2, h3, h4, k1, k2, k3, k4, n, ptr, len)\
+do {\
+ unsigned __cnt = cnt;\
+ for(;__cnt--; len--) {\
+ switch(n) {\
+ case 0: case 1: case 2: case 3:\
+ k1 = k1>>8 | (uint32_t)*ptr++<<24;\
+ ++n; break;\
+\
+ case 4: case 5: case 6: case 7:\
+ k2 = k2>>8 | (uint32_t)*ptr++<<24;\
+ ++n; break;\
+\
+ case 8: case 9: case 10: case 11:\
+ k3 = k3>>8 | (uint32_t)*ptr++<<24;\
+ ++n; break;\
+\
+ case 12: case 13: case 14:\
+ k4 = k4>>8 | (uint32_t)*ptr++<<24;\
+ ++n; break;\
+\
+ case 15:\
+ k4 = k4>>8 | (uint32_t)*ptr++<<24;\
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);\
+ n = 0; break;\
+ }\
+ }\
+} while(0)
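+
+/* Worked illustration of the byte packing above: starting from an empty carry
+ * (n = 0) and appending the three bytes 0x01 0x02 0x03 gives
+ *
+ *   k1 = 0x03020100, n = 3
+ *
+ * because each byte enters at the high end (k1 = k1>>8 | byte<<24). At
+ * finalization k1 is shifted down by (4-n)*8 = 8 bits to 0x00030201, the same
+ * tail value the reference MurmurHash3_x86_128 builds as
+ * tail[2]<<16 | tail[1]<<8 | tail[0]. */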
+
+/* Finalize a hash. To match the original Murmur3_128x86 the total_length must be provided */
+void PMurHash128x86_Result(const uint32_t *ph, const uint32_t *pcarry, uint32_t total_length, uint32_t *out)
+{
+ uint32_t h1 = ph[0];
+ uint32_t h2 = ph[1];
+ uint32_t h3 = ph[2];
+ uint32_t h4 = ph[3];
+
+ uint32_t k1, k2, k3, k4 = pcarry[3];
+
+ int n = k4 & 15;
+ switch(n) {
+ case 1: case 2: case 3: case 4:
+ k1 = pcarry[0] >> (4-n)*8;
+ goto finrot_k1;
+
+ case 5: case 6: case 7: case 8:
+ k2 = pcarry[1] >> (8-n)*8;
+ goto finrot_k21;
+
+ case 9: case 10: case 11: case 12:
+ k3 = pcarry[2] >> (12-n)*8;
+ goto finrot_k321;
+
+ case 13: case 14: case 15:
+ k4 >>= (16-n)*8;
+ goto finrot_k4321;
+
+ default:
+ goto skiprot;
+ }
+finrot_k4321:
+ k4 *= kC4; k4 = ROTL32(k4,18); k4 *= kC1; h4 ^= k4;
+ k3 = pcarry[2];
+finrot_k321:
+ k3 *= kC3; k3 = ROTL32(k3,17); k3 *= kC4; h3 ^= k3;
+ k2 = pcarry[1];
+finrot_k21:
+ k2 *= kC2; k2 = ROTL32(k2,16); k2 *= kC3; h2 ^= k2;
+ k1 = pcarry[0];
+finrot_k1:
+ k1 *= kC1; k1 = ROTL32(k1,15); k1 *= kC2; h1 ^= k1;
+skiprot:
+
+ //----------
+ // finalization
+
+ h1 ^= total_length; h2 ^= total_length;
+ h3 ^= total_length; h4 ^= total_length;
+
+ h1 += h2; h1 += h3; h1 += h4;
+ h2 += h1; h3 += h1; h4 += h1;
+
+ h1 = fmix32(h1);
+ h2 = fmix32(h2);
+ h3 = fmix32(h3);
+ h4 = fmix32(h4);
+
+ h1 += h2; h1 += h3; h1 += h4;
+ h2 += h1; h3 += h1; h4 += h1;
+
+ out[0] = h1;
+ out[1] = h2;
+ out[2] = h3;
+ out[3] = h4;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Main hashing function. Initialise carry[4] to {0,0,0,0} and h[4] to an initial {seed,seed,seed,seed}
+ * if wanted. Both ph and pcarry are required arguments. */
+void PMurHash128x86_Process(uint32_t * const ph, uint32_t * const pcarry, const void * const key, int len)
+{
+ uint32_t h1 = ph[0];
+ uint32_t h2 = ph[1];
+ uint32_t h3 = ph[2];
+ uint32_t h4 = ph[3];
+
+ uint32_t k1 = pcarry[0];
+ uint32_t k2 = pcarry[1];
+ uint32_t k3 = pcarry[2];
+ uint32_t k4 = pcarry[3];
+
+ const uint8_t *ptr = (uint8_t*)key;
+ const uint8_t *end;
+
+ /* Extract carry count from low 4 bits of c value */
+ int n = k4 & 15;
+
+#if defined(UNALIGNED_SAFE)
+ /* This CPU handles unaligned word access */
+// #pragma message ( "UNALIGNED_SAFE" )
+ /* Consume any carry bytes */
+ int i = (16-n) & 15;
+ if(i && i <= len) {
+ dobytes128x86(i, h1, h2, h3, h4, k1, k2, k3, k4, n, ptr, len);
+ }
+
+ /* Process 128-bit chunks */
+ end = ptr + (len & ~15);
+ for( ; ptr < end ; ptr+=16) {
+ k1 = READ_UINT32(ptr, 0);
+ k2 = READ_UINT32(ptr, 1);
+ k3 = READ_UINT32(ptr, 2);
+ k4 = READ_UINT32(ptr, 3);
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);
+ }
+
+#else /*UNALIGNED_SAFE*/
+ /* This CPU does not handle unaligned word access */
+// #pragma message ( "ALIGNED" )
+ /* Consume enough so that the next data byte is word aligned */
+ int i = -(intptr_t)(void *)ptr & 3;
+ if(i && i <= len) {
+ dobytes128x86(i, h1, h2, h3, h4, k1, k2, k3, k4, n, ptr, len);
+ }
+ /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */
+ end = ptr + (len & ~15);
+
+ switch(n) { /* how many bytes in c */
+ case 0: /*
+ k1=[----] k2=[----] k3=[----] k4=[----] w=[3210 7654 ba98 fedc] b=[3210 7654 ba98 fedc] */
+ for( ; ptr < end ; ptr+=16) {
+ k1 = READ_UINT32(ptr, 0);
+ k2 = READ_UINT32(ptr, 1);
+ k3 = READ_UINT32(ptr, 2);
+ k4 = READ_UINT32(ptr, 3);
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);
+ }
+ break;
+ case 1: case 2: case 3: /*
+ k1=[10--] k2=[----] k3=[----] k4=[----] w=[5432 9876 dcba hgfe] b=[3210 7654 ba98 fedc] k1'=[hg--] */
+ {
+ const int lshift = n*8, rshift = 32-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint32_t c = k1>>rshift; // --10
+ k2 = READ_UINT32(ptr, 0); // 5432
+ c |= k2<<lshift; // 3210.
+ k1 = READ_UINT32(ptr, 1); // 9876
+ k2 = k1<<lshift | k2>>rshift; // 7654.
+ k4 = READ_UINT32(ptr, 2); // dcba
+ k3 = k4<<lshift | k1>>rshift; // ba98.
+ k1 = READ_UINT32(ptr, 3); // hgfe.
+ k4 = k1<<lshift | k4>>rshift; // fedc.
+ doblock128x86(h1, h2, h3, h4, c, k2, k3, k4);
+ }
+ }
+ break;
+ case 4: /*
+ k1=[3210] k2=[----] k3=[----] k4=[----] w=[7654 ba98 fedc jihg] b=[3210 7654 ba98 fedc] k1'=[jihg] */
+ for( ; ptr < end ; ptr+=16) {
+ k2 = READ_UINT32(ptr, 0);
+ k3 = READ_UINT32(ptr, 1);
+ k4 = READ_UINT32(ptr, 2);
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);
+ k1 = READ_UINT32(ptr, 3);
+ }
+ break;
+ case 5: case 6: case 7: /*
+ k1=[3210] k2=[54--] k3=[----] k4=[----] w=[9876 dcba hgfe lkji] b=[3210 7654 ba98 fedc] k1'=[jihg] k2'=[lk--] */
+ {
+ const int lshift = n*8-32, rshift = 32-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint32_t c = k2>>rshift; // --54
+ k3 = READ_UINT32(ptr, 0); // 9876
+ c |= k3<<lshift; // 7654.
+ k4 = READ_UINT32(ptr, 1); // dcba
+ k3 = k4<<lshift | k3>>rshift; // ba98.
+ k2 = READ_UINT32(ptr, 2); // hgfe
+ k4 = k2<<lshift | k4>>rshift; // fedc.
+ doblock128x86(h1, h2, h3, h4, k1, c, k3, k4);
+ k1 = k2>>rshift; // --hg
+ k2 = READ_UINT32(ptr, 3); // lkji.
+ k1 |= k2<<lshift; // jihg.
+ }
+ }
+ break;
+ case 8: /*
+ k1=[3210] k2=[7654] k3=[----] k4=[----] w=[ba98 fedc jihg nmlk] b=[3210 7654 ba98 fedc] k1'=[jihg] k2'=[nmlk] */
+ for( ; ptr < end ; ptr+=16) {
+ k3 = READ_UINT32(ptr, 0);
+ k4 = READ_UINT32(ptr, 1);
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);
+ k1 = READ_UINT32(ptr, 2);
+ k2 = READ_UINT32(ptr, 3);
+ }
+ break;
+ case 9: case 10: case 11: /*
+ k1=[3210] k2=[7654] k3=[98--] k4=[----] w=[dcba hgfe lkji ponm] b=[3210 7654 ba98 fedc] k1'=[jihg] k2'=[nmlk] k3'=[po--] */
+ {
+ const int lshift = n*8-64, rshift = 32-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint32_t c = k3>>rshift; // --98
+ k4 = READ_UINT32(ptr, 0); // dcba
+ c |= k4<<lshift; // ba98.
+ k3 = READ_UINT32(ptr, 1); // hgfe
+ k4 = k3<<lshift | k4>>rshift; // fedc.
+ doblock128x86(h1, h2, h3, h4, k1, k2, c, k4);
+ k2 = READ_UINT32(ptr, 2); // lkji
+ k1 = k2<<lshift | k3>>rshift; // jihg.
+ k3 = READ_UINT32(ptr, 3); // ponm.
+ k2 = k3<<lshift | k2>>rshift; // nmlk.
+ }
+ }
+ break;
+ case 12: /*
+ k1=[3210] k2=[7654] k3=[ba98] k4=[----] w=[fedc jihg nmlk rqpo] b=[3210 7654 ba98 fedc] k1'=[jihg] k2'=[nmlk] k3'=[rqpo] */
+ for( ; ptr < end ; ptr+=16) {
+ k4 = READ_UINT32(ptr, 0);
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, k4);
+ k1 = READ_UINT32(ptr, 1);
+ k2 = READ_UINT32(ptr, 2);
+ k3 = READ_UINT32(ptr, 3);
+ }
+ break;
+ default: /* 12 < n <= 15
+ k1=[3210] k2=[7654] k3=[ba98] k4=[dc--] w=[hgfe lkji ponm tsrq] b=[3210 7654 ba98 fedc] k1'=[jihg] k2'=[nmlk] k3'=[rqpo] k4'=[ts--] */
+ {
+ const int lshift = n*8-96, rshift = 32-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint32_t c = k4>>rshift; // --dc
+ k4 = READ_UINT32(ptr, 0); // hgfe
+ c |= k4<<lshift; // fedc.
+ doblock128x86(h1, h2, h3, h4, k1, k2, k3, c);
+ k3 = READ_UINT32(ptr, 1); // lkji
+ k1 = k3<<lshift | k4>>rshift; // jihg.
+ c = READ_UINT32(ptr, 2); // ponm
+ k2 = c<<lshift | k3>>rshift; // nmlk.
+ k4 = READ_UINT32(ptr, 3); // tsrq.
+ k3 = k4<<lshift | c>>rshift; // rqpo.
+ }
+ }
+ }
+#endif /*UNALIGNED_SAFE*/
+
+ /* Advance over whole 128-bit chunks, possibly leaving 1..15 bytes */
+ len -= len & ~15;
+
+ /* Append any remaining bytes into carry */
+ dobytes128x86(len, h1, h2, h3, h4, k1, k2, k3, k4, n, ptr, len);
+
+ /* Copy out new running hash and carry */
+ ph[0] = h1;
+ ph[1] = h2;
+ ph[2] = h3;
+ ph[3] = h4;
+ pcarry[0] = k1;
+ pcarry[1] = k2;
+ pcarry[2] = k3;
+ pcarry[3] = (k4 & ~0xff) | n;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* All in one go */
+
+/* MurmurHash3_x86_128 api */
+void PMurHash128x86(const void * key, const int len, uint32_t seed, void * out)
+{
+ uint32_t carry[4] = {0, 0, 0, 0};
+ uint32_t h[4] = {seed, seed, seed, seed};
+ PMurHash128x86_Process(h, carry, key, len);
+ PMurHash128x86_Result(h, carry, (uint32_t) len, (uint32_t *) out);
+}
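+
+/* Illustrative sketch of the common-prefix reuse mentioned in the header:
+ * snapshot the running state after the prefix, then finish each suffix from
+ * a copy of that snapshot. prefix, suffix[i] and the length variables are
+ * placeholders, not part of this file's API:
+ *
+ *   uint32_t h0[4] = {seed, seed, seed, seed}, c0[4] = {0, 0, 0, 0};
+ *   PMurHash128x86_Process(h0, c0, prefix, prefix_len);
+ *   for (i = 0; i < nsuffixes; i++) {
+ *       uint32_t h[4], c[4], out[4];
+ *       memcpy(h, h0, sizeof h); memcpy(c, c0, sizeof c);
+ *       PMurHash128x86_Process(h, c, suffix[i], suffix_len[i]);
+ *       PMurHash128x86_Result(h, c, prefix_len + suffix_len[i], out);
+ *   }
+ */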
+
+/*-----------------------------------------------------------------------------*
+ PMurHash128x64
+ *-----------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------------
+ * Core murmurhash algorithm macros */
+
+static const uint64_t kC1L = BIG_CONSTANT(0x87c37b91114253d5);
+static const uint64_t kC2L = BIG_CONSTANT(0x4cf5ad432745937f);
+
+/* This is the main processing body of the algorithm. It operates
+ * on each full 128-bits of input. */
+#define doblock128x64(h1, h2, k1, k2)\
+do {\
+ k1 *= kC1L; k1 = ROTL64(k1,31); k1 *= kC2L; h1 ^= k1;\
+\
+ h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;\
+\
+ k2 *= kC2L; k2 = ROTL64(k2,33); k2 *= kC1L; h2 ^= k2;\
+\
+ h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;\
+} while(0)
+
+/* Append unaligned bytes to carry, forcing hash churn if we have 16 bytes */
+/* cnt=bytes to process, h1,h2=hash k1,k2=carry, n=bytes in carry, ptr/len=payload */
+#define dobytes128x64(cnt, h1, h2, k1, k2, n, ptr, len) \
+do {\
+ unsigned __cnt = cnt;\
+ for(;__cnt--; len--) {\
+ switch(n) {\
+ case 0: case 1: case 2: case 3:\
+ case 4: case 5: case 6: case 7:\
+ k1 = k1>>8 | (uint64_t)*ptr++<<56;\
+ n++; break;\
+\
+ case 8: case 9: case 10: case 11:\
+ case 12: case 13: case 14:\
+ k2 = k2>>8 | (uint64_t)*ptr++<<56;\
+ n++; break;\
+\
+ case 15:\
+ k2 = k2>>8 | (uint64_t)*ptr++<<56;\
+ doblock128x64(h1, h2, k1, k2);\
+ n = 0; break;\
+ }\
+ }\
+} while(0)
+
+/* Finalize a hash. To match the original Murmur3_128x64 the total_length must be provided */
+void PMurHash128x64_Result(const uint64_t * const ph, const uint64_t * const pcarry,
+ const uint32_t total_length, uint64_t * const out)
+{
+ uint64_t h1 = ph[0];
+ uint64_t h2 = ph[1];
+
+ uint64_t k1;
+ uint64_t k2 = pcarry[1];
+
+ int n = k2 & 15;
+ if (n) {
+ k1 = pcarry[0];
+ if (n > 8) {
+ k2 >>= (16-n)*8;
+ k2 *= kC2L; k2 = ROTL64(k2,33); k2 *= kC1L; h2 ^= k2;
+ } else {
+ k1 >>= (8-n)*8;
+ }
+ k1 *= kC1L; k1 = ROTL64(k1,31); k1 *= kC2L; h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= total_length; h2 ^= total_length;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ out[0] = h1;
+ out[1] = h2;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Main hashing function. Initialise carry[2] to {0,0} and h[2] to an initial {seed,seed}
+ * if wanted. Both ph and pcarry are required arguments. */
+void PMurHash128x64_Process(uint64_t * const ph, uint64_t * const pcarry, const void * const key, int len)
+{
+ uint64_t h1 = ph[0];
+ uint64_t h2 = ph[1];
+
+ uint64_t k1 = pcarry[0];
+ uint64_t k2 = pcarry[1];
+
+ const uint8_t *ptr = (uint8_t*)key;
+ const uint8_t *end;
+
+ /* Extract carry count from low 4 bits of c value */
+ int n = k2 & 15;
+
+#if defined(UNALIGNED_SAFE)
+ /* This CPU handles unaligned word access */
+// #pragma message ( "UNALIGNED_SAFE" )
+ /* Consume any carry bytes */
+ int i = (16-n) & 15;
+ if(i && i <= len) {
+ dobytes128x64(i, h1, h2, k1, k2, n, ptr, len);
+ }
+
+ /* Process 128-bit chunks */
+ end = ptr + (len & ~15);
+ for( ; ptr < end ; ptr+=16) {
+ k1 = READ_UINT64(ptr, 0);
+ k2 = READ_UINT64(ptr, 1);
+ doblock128x64(h1, h2, k1, k2);
+ }
+
+#else /*UNALIGNED_SAFE*/
+ /* This CPU does not handle unaligned word access */
+// #pragma message ( "ALIGNED" )
+ /* Consume enough so that the next data byte is word aligned */
+ int i = -(intptr_t)(void *)ptr & 7;
+ if(i && i <= len) {
+ dobytes128x64(i, h1, h2, k1, k2, n, ptr, len);
+ }
+ /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */
+ end = ptr + (len & ~15);
+
+ switch(n) { /* how many bytes in c */
+ case 0: /*
+ k1=[--------] k2=[--------] w=[76543210 fedcba98] b=[76543210 fedcba98] */
+ for( ; ptr < end ; ptr+=16) {
+ k1 = READ_UINT64(ptr, 0);
+ k2 = READ_UINT64(ptr, 1);
+ doblock128x64(h1, h2, k1, k2);
+ }
+ break;
+ case 1: case 2: case 3: case 4: case 5: case 6: case 7: /*
+ k1=[10------] k2=[--------] w=[98765432 hgfedcba] b=[76543210 fedcba98] k1'=[hg------] */
+ {
+ const int lshift = n*8, rshift = 64-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint64_t c = k1>>rshift;
+ k2 = READ_UINT64(ptr, 0);
+ c |= k2<<lshift;
+ k1 = READ_UINT64(ptr, 1);
+ k2 = k2>>rshift | k1<<lshift;
+ doblock128x64(h1, h2, c, k2);
+ }
+ }
+ break;
+ case 8: /*
+ k1=[76543210] k2=[--------] w=[fedcba98 nmlkjihg] b=[76543210 fedcba98] k1'=[nmlkjihg] */
+ for( ; ptr < end ; ptr+=16) {
+ k2 = READ_UINT64(ptr, 0);
+ doblock128x64(h1, h2, k1, k2);
+ k1 = READ_UINT64(ptr, 1);
+ }
+ break;
+ default: /* 8 < n <= 15
+ k1=[76543210] k2=[98------] w=[hgfedcba ponmlkji] b=[76543210 fedcba98] k1'=[nmlkjihg] k2'=[po------] */
+ {
+ const int lshift = n*8-64, rshift = 64-lshift;
+ for( ; ptr < end ; ptr+=16) {
+ uint64_t c = k2 >> rshift;
+ k2 = READ_UINT64(ptr, 0);
+ c |= k2 << lshift;
+ doblock128x64(h1, h2, k1, c);
+ k1 = k2 >> rshift;
+ k2 = READ_UINT64(ptr, 1);
+ k1 |= k2 << lshift;
+ }
+ }
+ }
+#endif /*UNALIGNED_SAFE*/
+
+ /* Advance over whole 128-bit chunks, possibly leaving 1..15 bytes */
+ len -= len & ~15;
+
+ /* Append any remaining bytes into carry */
+ dobytes128x64(len, h1, h2, k1, k2, n, ptr, len);
+
+ /* Copy out new running hash and carry */
+ ph[0] = h1;
+ ph[1] = h2;
+ pcarry[0] = k1;
+ pcarry[1] = (k2 & ~0xff) | n;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* All in one go */
+
+/* MurmurHash3_x64_128 api */
+void PMurHash128x64(const void * key, const int len, uint32_t seed, void * out)
+{
+ uint64_t carry[2] = {0, 0};
+ uint64_t h[2] = {seed, seed};
+ PMurHash128x64_Process(h, carry, key, len);
+ PMurHash128x64_Result(h, carry, (uint32_t) len, (uint64_t *) out);
+}
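+
+/* Illustrative equivalence sketch: feeding the same buffer in chunks must
+ * give the same digest as the one-shot wrapper above. buf, len and seed are
+ * placeholders, and len >= 7 is assumed:
+ *
+ *   uint64_t one[2], two[2], h[2] = {seed, seed}, carry[2] = {0, 0};
+ *   PMurHash128x64(buf, len, seed, one);
+ *   PMurHash128x64_Process(h, carry, buf, 7);
+ *   PMurHash128x64_Process(h, carry, (const char *)buf + 7, len - 7);
+ *   PMurHash128x64_Result(h, carry, (uint32_t)len, two);
+ *   // now one[0] == two[0] && one[1] == two[1]
+ */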