summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcoreyjjames <cjames29@myseneca.ca>2019-11-12 14:20:08 -0500
committerErik de Castro Lopo <erikd@mega-nerd.com>2020-01-05 11:29:59 +1100
commit965971d5554693ac7d67704bc69835cfc5bd6def (patch)
tree9319f5b39fac2e9883c801bb98588ea21d384d67
parentcffe3890c71455834b750fa467d6667d2e9e4547 (diff)
downloadflac-965971d5554693ac7d67704bc69835cfc5bd6def.tar.gz
Add Aarch64 Support
-rw-r--r--configure.ac14
-rw-r--r--src/libFLAC/Makefile.am1
-rw-r--r--src/libFLAC/cpu.c3
-rw-r--r--src/libFLAC/include/private/cpu.h1
-rw-r--r--src/libFLAC/include/private/lpc.h6
-rw-r--r--src/libFLAC/lpc_intrin_neon.c234
-rw-r--r--src/libFLAC/stream_encoder.c12
7 files changed, 270 insertions, 1 deletions
diff --git a/configure.ac b/configure.ac
index 5eb6b9af..ea08f698 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,7 +69,7 @@ AC_C_INLINE
AC_C_VARARRAYS
AC_C_TYPEOF
-AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h])
+AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
XIPH_C_BSWAP32
XIPH_C_BSWAP16
@@ -156,6 +156,11 @@ case "$host_cpu" in
AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
asm_optimisation=$asm_opt
;;
+ aarch64*)
+ cpu_aarch64=true
+ AC_DEFINE(FLAC__CPU_AARCH64)
+ AH_TEMPLATE(FLAC__CPU_AARCH64, [define if building for AARCH64])
+ ;;
sparc)
cpu_sparc=true
AC_DEFINE(FLAC__CPU_SPARC)
@@ -167,6 +172,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
+AM_CONDITIONAL(FLAC__CPU_AARCH64, test "x$cpu_aarch64" = xtrue)
AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
if test "x$ac_cv_header_x86intrin_h" = xyes; then
@@ -175,6 +181,12 @@ else
AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
fi
+if test "x$ac_cv_header_arm_neon_h" = xyes; then
+AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if <arm_neon.h> is available.])
+else
+AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
+fi
+
if test x"$cpu_ppc64" = xtrue ; then
AC_C_ATTRIBUTE([target("cpu=power8")],
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am
index 468939d5..30b40ac2 100644
--- a/src/libFLAC/Makefile.am
+++ b/src/libFLAC/Makefile.am
@@ -116,6 +116,7 @@ libFLAC_sources = \
lpc_intrin_sse41.c \
lpc_intrin_avx2.c \
lpc_intrin_vsx.c \
+ lpc_intrin_neon.c \
md5.c \
memory.c \
metadata_iterators.c \
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index c90b8999..9a2e3174 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -279,6 +279,8 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
info->type = FLAC__CPUINFO_TYPE_X86_64;
#elif defined FLAC__CPU_PPC
info->type = FLAC__CPUINFO_TYPE_PPC;
+#elif defined FLAC__CPU_AARCH64
+ info->type = FLAC__CPUINFO_TYPE_AARCH64;
#else
info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
#endif
@@ -291,6 +293,7 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
case FLAC__CPUINFO_TYPE_PPC:
ppc_cpu_info (info);
break;
+ case FLAC__CPUINFO_TYPE_AARCH64: /* fallthrough */
default:
info->use_asm = false;
break;
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index fc31350e..60f071fb 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -154,6 +154,7 @@ typedef enum {
FLAC__CPUINFO_TYPE_IA32,
FLAC__CPUINFO_TYPE_X86_64,
FLAC__CPUINFO_TYPE_PPC,
+ FLAC__CPUINFO_TYPE_AARCH64,
FLAC__CPUINFO_TYPE_UNKNOWN
} FLAC__CPUInfo_Type;
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 64dfd1f8..e73efc3b 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -105,6 +105,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
#endif
#endif
+#ifdef FLAC__CPU_AARCH64
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]);
+#endif
#endif
/*
diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c
new file mode 100644
index 00000000..80b962c8
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_neon.c
@@ -0,0 +1,234 @@
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if defined FLAC__CPU_AARCH64 && FLAC__HAS_NEONINTRIN
+#include "private/lpc.h"
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+#include <arm_neon.h>
+
+inline float32x4_t shufffleVector(float32x4_t vec)
+{
+ float32x2_t hi = vget_high_f32(vec);
+ float32x2_t lo = vget_low_f32(vec);
+ float32x2x2_t qr0_z = vzip_f32(hi, lo);
+ return vcombine_f32(qr0_z.val[0], qr0_z.val[1]);
+}
+
+inline float32x4_t shufffleVector_2103(float32x4_t vec)
+{
+ float32_t *tempPtr;
+ float32_t temp = vgetq_lane_f32(vec, 0);
+
+ vec = vcopyq_laneq_f32(vec, 0, vec, 2);
+ tempPtr = &temp;
+ vst1q_lane_f32(tempPtr, vec, 2);
+ return vec;
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+ int i;
+ int limit = data_len - 4;
+ float32x4_t sum0 = vdupq_n_f32(0.0f);
+
+ (void)lag;
+ FLAC__ASSERT(lag <= 4);
+ FLAC__ASSERT(lag <= data_len);
+
+ for (i = 0; i <= limit; i++)
+ {
+ float32x4_t d, d0;
+ d0 = vld1q_f32(data + i);
+ d = shufffleVector(d0);
+ sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+ }
+
+ {
+ float32x4_t d0 = vdupq_n_f32(0.0f);
+ limit++;
+ if (limit < 0)
+ limit = 0;
+
+ for (i = data_len - 1; i >= limit; i--)
+ {
+ float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+
+ d = shufffleVector(d);
+
+ d0 = shufffleVector_2103(d0);
+ d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+ sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+ }
+ }
+ vst1q_f32(autoc, sum0);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+ int i;
+ int limit = data_len - 8;
+ float32x4_t sum0 = vdupq_n_f32(0.0f);
+ float32x4_t sum1 = vdupq_n_f32(0.0f);
+
+ (void)lag;
+ FLAC__ASSERT(lag <= 8);
+ FLAC__ASSERT(lag <= data_len);
+
+ for (i = 0; i <= limit; i++)
+ {
+ float32x4_t d, d0, d1;
+ d0 = vld1q_f32(data + i);
+ d1 = vld1q_f32(data + i + 4);
+ d = shufffleVector(d0);
+ sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+ sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+ }
+
+ {
+ float32x4_t d0 = vdupq_n_f32(0.0f);
+ float32x4_t d1 = vdupq_n_f32(0.0f);
+ limit++;
+ if (limit < 0)
+ limit = 0;
+
+ for (i = data_len - 1; i >= limit; i--)
+ {
+ float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+ d = shufffleVector(d);
+
+ d1 = shufffleVector_2103(d1);
+ d0 = shufffleVector_2103(d0);
+
+ d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+ d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+ sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+ sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+ }
+ }
+ vst1q_f32(autoc, sum0);
+ vst1q_f32(autoc + 4, sum1);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+ int i;
+ int limit = data_len - 12;
+ float32x4_t sum0 = vdupq_n_f32(0.0f);
+ float32x4_t sum1 = vdupq_n_f32(0.0f);
+ float32x4_t sum2 = vdupq_n_f32(0.0f);
+
+ (void)lag;
+ FLAC__ASSERT(lag <= 12);
+ FLAC__ASSERT(lag <= data_len);
+
+ for (i = 0; i <= limit; i++)
+ {
+ float32x4_t d, d0, d1, d2;
+ d0 = vld1q_f32(data + i);
+ d1 = vld1q_f32(data + i + 4);
+ d1 = vld1q_f32(data + i + 8);
+ d = shufffleVector(d0);
+ sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+ sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+ sum2 = vaddq_f32(sum2, vmulq_f32(d2, d));
+ }
+
+ {
+ float32x4_t d0 = vdupq_n_f32(0.0f);
+ float32x4_t d1 = vdupq_n_f32(0.0f);
+ float32x4_t d2 = vdupq_n_f32(0.0f);
+ limit++;
+ if (limit < 0)
+ limit = 0;
+
+ for (i = data_len - 1; i >= limit; i--)
+ {
+ float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+ d = shufffleVector(d);
+
+ d2 = shufffleVector_2103(d2);
+ d1 = shufffleVector_2103(d1);
+ d0 = shufffleVector_2103(d0);
+
+ d2 = vcopyq_laneq_f32(d2, 0, d1, 0);
+ d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+ d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+ sum2 = vaddq_f32(sum2, vmulq_f32(d, d2));
+ sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+ sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+ }
+ }
+ vst1q_f32(autoc, sum0);
+ vst1q_f32(autoc + 4, sum1);
+ vst1q_f32(autoc + 8, sum2);
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
+{
+ int i;
+ int limit = data_len - 16;
+ float32x4_t sum0 = vdupq_n_f32(0.0f);
+ float32x4_t sum1 = vdupq_n_f32(0.0f);
+ float32x4_t sum2 = vdupq_n_f32(0.0f);
+ float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+ (void)lag;
+ FLAC__ASSERT(lag <= 16);
+ FLAC__ASSERT(lag <= data_len);
+
+ for (i = 0; i <= limit; i++)
+ {
+ float32x4_t d, d0, d1, d2, d3;
+ d0 = vld1q_f32(data + i);
+ d1 = vld1q_f32(data + i + 4);
+ d1 = vld1q_f32(data + i + 8);
+ d1 = vld1q_f32(data + i + 12);
+ d = shufffleVector(d0);
+ sum0 = vaddq_f32(sum0, vmulq_f32(d0, d));
+ sum1 = vaddq_f32(sum1, vmulq_f32(d1, d));
+ sum2 = vaddq_f32(sum2, vmulq_f32(d2, d));
+ sum3 = vaddq_f32(sum3, vmulq_f32(d3, d));
+ }
+
+ {
+ float32x4_t d0 = vdupq_n_f32(0.0f);
+ float32x4_t d1 = vdupq_n_f32(0.0f);
+ float32x4_t d2 = vdupq_n_f32(0.0f);
+ float32x4_t d3 = vdupq_n_f32(0.0f);
+ limit++;
+ if (limit < 0)
+ limit = 0;
+
+ for (i = data_len - 1; i >= limit; i--)
+ {
+ float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0);
+ d = shufffleVector(d);
+
+ d3 = shufffleVector_2103(d3);
+ d2 = shufffleVector_2103(d2);
+ d1 = shufffleVector_2103(d1);
+ d0 = shufffleVector_2103(d0);
+
+ d3 = vcopyq_laneq_f32(d3, 0, d2, 0);
+ d2 = vcopyq_laneq_f32(d2, 0, d1, 0);
+ d1 = vcopyq_laneq_f32(d1, 0, d0, 0);
+ d0 = vcopyq_laneq_f32(d0, 0, d, 0);
+
+ sum3 = vaddq_f32(sum3, vmulq_f32(d, d3));
+ sum2 = vaddq_f32(sum2, vmulq_f32(d, d2));
+ sum1 = vaddq_f32(sum1, vmulq_f32(d, d1));
+ sum0 = vaddq_f32(sum0, vmulq_f32(d, d0));
+ }
+ }
+ vst1q_f32(autoc, sum0);
+ vst1q_f32(autoc + 4, sum1);
+ vst1q_f32(autoc + 8, sum2);
+ vst1q_f32(autoc + 12, sum3);
+}
+#endif /* FLAC__CPU_AARCH64 && FLAC__HAS_ARCH64INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index 74387ec3..cda606f4 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -915,6 +915,18 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
}
#endif
#endif
+#if defined FLAC__CPU_AARCH64
+ if(encoder->protected_->max_lpc_order < 4)
+ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4;
+ else if(encoder->protected_->max_lpc_order < 8)
+ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8;
+ else if(encoder->protected_->max_lpc_order < 12)
+ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12;
+ else if(encoder->protected_->max_lpc_order < 16)
+ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16;
+ else
+ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
+#endif
if(encoder->private_->cpuinfo.use_asm) {
# ifdef FLAC__CPU_IA32
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);