From 965971d5554693ac7d67704bc69835cfc5bd6def Mon Sep 17 00:00:00 2001
From: coreyjjames <cjames29@myseneca.ca>
Date: Tue, 12 Nov 2019 14:20:08 -0500
Subject: Add Aarch64 Support

---
 configure.ac                      |  14 ++-
 src/libFLAC/Makefile.am           |   1 +
 src/libFLAC/cpu.c                 |   3 +
 src/libFLAC/include/private/cpu.h |   1 +
 src/libFLAC/include/private/lpc.h |   6 +
 src/libFLAC/lpc_intrin_neon.c     | 234 ++++++++++++++++++++++++++++++++++++++
 src/libFLAC/stream_encoder.c      |  12 ++
 7 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 src/libFLAC/lpc_intrin_neon.c

diff --git a/configure.ac b/configure.ac
index 5eb6b9af..ea08f698 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,7 +69,7 @@ AC_C_INLINE
 AC_C_VARARRAYS
 AC_C_TYPEOF
 
-AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h])
+AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
 
 XIPH_C_BSWAP32
 XIPH_C_BSWAP16
@@ -156,6 +156,11 @@ case "$host_cpu" in
 		AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
 		asm_optimisation=$asm_opt
 		;;
+	aarch64*)
+		cpu_aarch64=true
+		AC_DEFINE(FLAC__CPU_AARCH64)
+		AH_TEMPLATE(FLAC__CPU_AARCH64, [define if building for AARCH64])
+		;;
 	sparc)
 		cpu_sparc=true
 		AC_DEFINE(FLAC__CPU_SPARC)
@@ -167,6 +172,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
+AM_CONDITIONAL(FLAC__CPU_AARCH64, test "x$cpu_aarch64" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
 
 if test "x$ac_cv_header_x86intrin_h" = xyes; then
@@ -175,6 +181,12 @@ else
 AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
 fi
 
+if test "x$ac_cv_header_arm_neon_h" = xyes; then
+AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if <arm_neon.h> is available.])
+else
+AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
+fi
+
 if test x"$cpu_ppc64" = xtrue ; then
 
 AC_C_ATTRIBUTE([target("cpu=power8")],
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am
index 468939d5..30b40ac2 100644
--- a/src/libFLAC/Makefile.am
+++ b/src/libFLAC/Makefile.am
@@ -116,6 +116,7 @@ libFLAC_sources = \
 	lpc_intrin_sse41.c \
 	lpc_intrin_avx2.c \
 	lpc_intrin_vsx.c \
+	lpc_intrin_neon.c \
 	md5.c \
 	memory.c \
 	metadata_iterators.c \
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index c90b8999..9a2e3174 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -279,6 +279,8 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
 	info->type = FLAC__CPUINFO_TYPE_X86_64;
 #elif defined FLAC__CPU_PPC
 	info->type = FLAC__CPUINFO_TYPE_PPC;
+#elif defined FLAC__CPU_AARCH64
+	info->type = FLAC__CPUINFO_TYPE_AARCH64;
 #else
 	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
 #endif
@@ -291,6 +293,7 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
 	case FLAC__CPUINFO_TYPE_PPC:
 		ppc_cpu_info (info);
 		break;
+	case FLAC__CPUINFO_TYPE_AARCH64: /* fallthrough */
 	default:
 		info->use_asm = false;
 		break;
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index fc31350e..60f071fb 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -154,6 +154,7 @@ typedef enum {
 	FLAC__CPUINFO_TYPE_IA32,
 	FLAC__CPUINFO_TYPE_X86_64,
 	FLAC__CPUINFO_TYPE_PPC,
+	FLAC__CPUINFO_TYPE_AARCH64,
 	FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 64dfd1f8..e73efc3b 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -105,6 +105,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real
 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
 #endif
 #endif
+#ifdef FLAC__CPU_AARCH64
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+#endif
 #endif
 
 /*
diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c
new file mode 100644
index 00000000..80b962c8
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_neon.c
@@ -0,0 +1,234 @@
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if defined FLAC__CPU_AARCH64 && FLAC__HAS_NEONINTRIN
+#include "private/lpc.h"
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+#include <arm_neon.h>
+
+inline float32x4_t shufffleVector(float32x4_t vec)
+{
+    float32x2_t hi = vget_high_f32(vec);
+    float32x2_t lo = vget_low_f32(vec);
+    float32x2x2_t qr0_z = vzip_f32(hi, lo);
+    return vcombine_f32(qr0_z.val[0], qr0_z.val[1]);
+}
+
+inline float32x4_t shufffleVector_2103(float32x4_t vec)
+{
+    float32_t *tempPtr;
+    float32_t temp = vgetq_lane_f32(vec, 0);
+
+    vec = vcopyq_laneq_f32(vec, 0, vec, 2);
+    tempPtr = &temp;
+    vst1q_lane_f32(tempPtr, vec, 2);
+    return vec;
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+    int i;
+    int limit = data_len - 4;
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+
+    (void)lag;
+    FLAC__ASSERT(lag <= 4);
+    FLAC__ASSERT(lag <= data_len);
+
+    for (i = 0; i <= limit; i++)
+    {
+        float32x4_t d, d0;
+        d0 = vld1q_f32(data + i);
+        d = shufffleVector(d0);
+        sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+    }
+
+    {
+        float32x4_t d0 = vdupq_n_f32(0.0f);
+        limit++;
+        if (limit < 0)
+            limit = 0;
+
+        for (i = data_len - 1; i >= limit; i--)
+        {
+            float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+
+            d = shufffleVector(d);
+
+            d0 = shufffleVector_2103(d0);
+            d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+            sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+        }
+    }
+    vst1q_f32(autoc, sum0);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+    int i;
+    int limit = data_len - 8;
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+
+    (void)lag;
+    FLAC__ASSERT(lag <= 8);
+    FLAC__ASSERT(lag <= data_len);
+
+    for (i = 0; i <= limit; i++)
+    {
+        float32x4_t d, d0, d1;
+        d0 = vld1q_f32(data + i);
+        d1 = vld1q_f32(data + i + 4);
+        d = shufffleVector(d0);
+        sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+        sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+    }
+
+    {
+        float32x4_t d0 = vdupq_n_f32(0.0f);
+        float32x4_t d1 = vdupq_n_f32(0.0f);
+        limit++;
+        if (limit < 0)
+            limit = 0;
+
+        for (i = data_len - 1; i >= limit; i--)
+        {
+            float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+            d = shufffleVector(d);
+
+            d1 = shufffleVector_2103(d1);
+            d0 = shufffleVector_2103(d0);
+
+            d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+            d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+            sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+            sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+        }
+    }
+    vst1q_f32(autoc, sum0);
+    vst1q_f32(autoc + 4, sum1);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+    int i;
+    int limit = data_len - 12;
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+
+    (void)lag;
+    FLAC__ASSERT(lag <= 12);
+    FLAC__ASSERT(lag <= data_len);
+
+    for (i = 0; i <= limit; i++)
+    {
+        float32x4_t d, d0, d1, d2;
+        d0 = vld1q_f32(data + i);
+        d1 = vld1q_f32(data + i + 4);
+        d1 = vld1q_f32(data + i + 8);
+        d = shufffleVector(d0);
+        sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+        sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+        sum2 = vaddq_f32(sum2, vmulq_f32(d2, d));
+    }
+
+    {
+        float32x4_t d0 = vdupq_n_f32(0.0f);
+        float32x4_t d1 = vdupq_n_f32(0.0f);
+        float32x4_t d2 = vdupq_n_f32(0.0f);
+        limit++;
+        if (limit < 0)
+            limit = 0;
+
+        for (i = data_len - 1; i >= limit; i--)
+        {
+            float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+            d = shufffleVector(d);
+
+            d2 = shufffleVector_2103(d2);
+            d1 = shufffleVector_2103(d1);
+            d0 = shufffleVector_2103(d0);
+
+            d2 = vcopyq_laneq_f32(d2, 0, d1, 0);
+            d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+            d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+            sum2 = vaddq_f32(sum2, vmulq_f32(d, d2));
+            sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+            sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+        }
+    }
+    vst1q_f32(autoc, sum0);
+    vst1q_f32(autoc + 4, sum1);
+    vst1q_f32(autoc + 8, sum2);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+    int i;
+    int limit = data_len - 16;
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    (void)lag;
+    FLAC__ASSERT(lag <= 16);
+    FLAC__ASSERT(lag <= data_len);
+
+    for (i = 0; i <= limit; i++)
+    {
+        float32x4_t d, d0, d1, d2, d3;
+        d0 = vld1q_f32(data + i);
+        d1 = vld1q_f32(data + i + 4);
+        d1 = vld1q_f32(data + i + 8);
+        d1 = vld1q_f32(data + i + 12);
+        d = shufffleVector(d0);
+        sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+        sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+        sum2 = vaddq_f32(sum2, vmulq_f32(d2, d));
+        sum3 = vaddq_f32(sum3, vmulq_f32(d3, d));
+    }
+
+    {
+        float32x4_t d0 = vdupq_n_f32(0.0f);
+        float32x4_t d1 = vdupq_n_f32(0.0f);
+        float32x4_t d2 = vdupq_n_f32(0.0f);
+        float32x4_t d3 = vdupq_n_f32(0.0f);
+        limit++;
+        if (limit < 0)
+            limit = 0;
+
+        for (i = data_len - 1; i >= limit; i--)
+        {
+            float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+            d = shufffleVector(d);
+
+            d3 = shufffleVector_2103(d3);
+            d2 = shufffleVector_2103(d2);
+            d1 = shufffleVector_2103(d1);
+            d0 = shufffleVector_2103(d0);
+
+            d3 = vcopyq_laneq_f32(d3, 0, d2, 0);
+            d2 = vcopyq_laneq_f32(d2, 0, d1, 0);
+            d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+            d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+            sum3 = vaddq_f32(sum3, vmulq_f32(d, d3));
+            sum2 = vaddq_f32(sum2, vmulq_f32(d, d2));
+            sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+            sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+        }
+    }
+    vst1q_f32(autoc, sum0);
+    vst1q_f32(autoc + 4, sum1);
+    vst1q_f32(autoc + 8, sum2);
+    vst1q_f32(autoc + 12, sum3);
+}
+#endif /* FLAC__CPU_AARCH64 && FLAC__HAS_ARCH64INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index 74387ec3..cda606f4 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -914,6 +914,18 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
 	}
 #endif
+#endif
+#if defined FLAC__CPU_AARCH64
+	if(encoder->protected_->max_lpc_order < 4)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4;
+		else if(encoder->protected_->max_lpc_order < 8)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8;
+		else if(encoder->protected_->max_lpc_order < 12)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12;
+		else if(encoder->protected_->max_lpc_order < 16)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16;
+		else
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
 #endif
 	if(encoder->private_->cpuinfo.use_asm) {
 #  ifdef FLAC__CPU_IA32
-- 
cgit v1.2.1