From 965971d5554693ac7d67704bc69835cfc5bd6def Mon Sep 17 00:00:00 2001 From: coreyjjames Date: Tue, 12 Nov 2019 14:20:08 -0500 Subject: Add Aarch64 Support --- configure.ac | 14 ++- src/libFLAC/Makefile.am | 1 + src/libFLAC/cpu.c | 3 + src/libFLAC/include/private/cpu.h | 1 + src/libFLAC/include/private/lpc.h | 6 + src/libFLAC/lpc_intrin_neon.c | 234 ++++++++++++++++++++++++++++++++++++++ src/libFLAC/stream_encoder.c | 12 ++ 7 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 src/libFLAC/lpc_intrin_neon.c diff --git a/configure.ac b/configure.ac index 5eb6b9af..ea08f698 100644 --- a/configure.ac +++ b/configure.ac @@ -69,7 +69,7 @@ AC_C_INLINE AC_C_VARARRAYS AC_C_TYPEOF -AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h]) +AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h]) XIPH_C_BSWAP32 XIPH_C_BSWAP16 @@ -156,6 +156,11 @@ case "$host_cpu" in AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC]) asm_optimisation=$asm_opt ;; + aarch64*) + cpu_aarch64=true + AC_DEFINE(FLAC__CPU_AARCH64) + AH_TEMPLATE(FLAC__CPU_AARCH64, [define if building for AARCH64]) + ;; sparc) cpu_sparc=true AC_DEFINE(FLAC__CPU_SPARC) @@ -167,6 +172,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue) AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue) AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue) AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue) +AM_CONDITIONAL(FLAC__CPU_AARCH64, test "x$cpu_aarch64" = xtrue) AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue) if test "x$ac_cv_header_x86intrin_h" = xyes; then @@ -175,6 +181,12 @@ else AC_DEFINE([FLAC__HAS_X86INTRIN], 0) fi +if test "x$ac_cv_header_arm_neon_h" = xyes; then +AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if is available.]) +else +AC_DEFINE([FLAC__HAS_NEONINTRIN], 0) +fi + if test x"$cpu_ppc64" = xtrue ; then AC_C_ATTRIBUTE([target("cpu=power8")], diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index 468939d5..30b40ac2 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -116,6 +116,7 @@ libFLAC_sources = \ lpc_intrin_sse41.c \ lpc_intrin_avx2.c \ lpc_intrin_vsx.c \ + lpc_intrin_neon.c \ md5.c \ memory.c \ metadata_iterators.c \ diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c index c90b8999..9a2e3174 100644 --- a/src/libFLAC/cpu.c +++ b/src/libFLAC/cpu.c @@ -279,6 +279,8 @@ void FLAC__cpu_info (FLAC__CPUInfo *info) info->type = FLAC__CPUINFO_TYPE_X86_64; #elif defined FLAC__CPU_PPC info->type = FLAC__CPUINFO_TYPE_PPC; +#elif defined FLAC__CPU_AARCH64 + info->type = FLAC__CPUINFO_TYPE_AARCH64; #else info->type = FLAC__CPUINFO_TYPE_UNKNOWN; #endif @@ -291,6 +293,7 @@ void FLAC__cpu_info (FLAC__CPUInfo *info) case FLAC__CPUINFO_TYPE_PPC: ppc_cpu_info (info); break; + case FLAC__CPUINFO_TYPE_AARCH64: /* fallthrough */ default: info->use_asm = false; break; diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index fc31350e..60f071fb 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -154,6 +154,7 @@ typedef enum { FLAC__CPUINFO_TYPE_IA32, FLAC__CPUINFO_TYPE_X86_64, FLAC__CPUINFO_TYPE_PPC, + FLAC__CPUINFO_TYPE_AARCH64, FLAC__CPUINFO_TYPE_UNKNOWN } FLAC__CPUInfo_Type; diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 64dfd1f8..e73efc3b 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -105,6 +105,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); #endif #endif +#ifdef FLAC__CPU_AARCH64 +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +#endif #endif /* diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c new file mode 100644 index 00000000..80b962c8 --- /dev/null +++ b/src/libFLAC/lpc_intrin_neon.c @@ -0,0 +1,234 @@ +#include "private/cpu.h" + +#ifndef FLAC__INTEGER_ONLY_LIBRARY +#ifndef FLAC__NO_ASM +#if defined FLAC__CPU_AARCH64 && FLAC__HAS_NEONINTRIN +#include "private/lpc.h" +#include "FLAC/assert.h" +#include "FLAC/format.h" +#include + +inline float32x4_t shufffleVector(float32x4_t vec) +{ + float32x2_t hi = vget_high_f32(vec); + float32x2_t lo = vget_low_f32(vec); + float32x2x2_t qr0_z = vzip_f32(hi, lo); + return vcombine_f32(qr0_z.val[0], qr0_z.val[1]); +} + +inline float32x4_t shufffleVector_2103(float32x4_t vec) +{ + float32_t *tempPtr; + float32_t temp = vgetq_lane_f32(vec, 0); + + vec = vcopyq_laneq_f32(vec, 0, vec, 2); + tempPtr = &temp; + vst1q_lane_f32(tempPtr, vec, 2); + return vec; +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + int i; + int limit = data_len - 4; + float32x4_t sum0 = vdupq_n_f32(0.0f); + + (void)lag; + FLAC__ASSERT(lag <= 4); + FLAC__ASSERT(lag <= data_len); + + for (i = 0; i <= limit; i++) + { + float32x4_t d, d0; + d0 = vld1q_f32(data + i); + d = shufffleVector(d0); + sum0 = vaddq_f32(sum0, vmulq_f32(d0, d)); + } + + { + float32x4_t d0 = vdupq_n_f32(0.0f); + limit++; + if (limit < 0) + limit = 0; + + for (i = data_len - 1; i >= limit; i--) + { + float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0); + + d = shufffleVector(d); + + d0 = shufffleVector_2103(d0); + d0 = vcopyq_laneq_f32(d0, 0, d, 0); + sum0 = vaddq_f32(sum0, vmulq_f32(d, d0)); + } + } + vst1q_f32(autoc, sum0); +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + int i; + int limit = data_len - 8; + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + + (void)lag; + FLAC__ASSERT(lag <= 8); + FLAC__ASSERT(lag <= data_len); + + for (i = 0; i <= limit; i++) + { + float32x4_t d, d0, d1; + d0 = vld1q_f32(data + i); + d1 = vld1q_f32(data + i + 4); + d = shufffleVector(d0); + sum0 = vaddq_f32(sum0, vmulq_f32(d0, d)); + sum1 = vaddq_f32(sum1, vmulq_f32(d1, d)); + } + + { + float32x4_t d0 = vdupq_n_f32(0.0f); + float32x4_t d1 = vdupq_n_f32(0.0f); + limit++; + if (limit < 0) + limit = 0; + + for (i = data_len - 1; i >= limit; i--) + { + float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0); + d = shufffleVector(d); + + d1 = shufffleVector_2103(d1); + d0 = shufffleVector_2103(d0); + + d1 = vcopyq_laneq_f32(d1, 0, d0, 0); + d0 = vcopyq_laneq_f32(d0, 0, d, 0); + + sum1 = vaddq_f32(sum1, vmulq_f32(d, d1)); + sum0 = vaddq_f32(sum0, vmulq_f32(d, d0)); + } + } + vst1q_f32(autoc, sum0); + vst1q_f32(autoc + 4, sum1); +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + int i; + int limit = data_len - 12; + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + + (void)lag; + FLAC__ASSERT(lag <= 12); + FLAC__ASSERT(lag <= data_len); + + for (i = 0; i <= limit; i++) + { + float32x4_t d, d0, d1, d2; + d0 = vld1q_f32(data + i); + d1 = vld1q_f32(data + i + 4); + d1 = vld1q_f32(data + i + 8); + d = shufffleVector(d0); + sum0 = vaddq_f32(sum0, vmulq_f32(d0, d)); + sum1 = vaddq_f32(sum1, vmulq_f32(d1, d)); + sum2 = vaddq_f32(sum2, vmulq_f32(d2, d)); + } + + { + float32x4_t d0 = vdupq_n_f32(0.0f); + float32x4_t d1 = vdupq_n_f32(0.0f); + float32x4_t d2 = vdupq_n_f32(0.0f); + limit++; + if (limit < 0) + limit = 0; + + for (i = data_len - 1; i >= limit; i--) + { + float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0); + d = shufffleVector(d); + + d2 = shufffleVector_2103(d2); + d1 = shufffleVector_2103(d1); + d0 = shufffleVector_2103(d0); + + d2 = vcopyq_laneq_f32(d2, 0, d1, 0); + d1 = vcopyq_laneq_f32(d1, 0, d0, 0); + d0 = vcopyq_laneq_f32(d0, 0, d, 0); + + sum2 = vaddq_f32(sum2, vmulq_f32(d, d2)); + sum1 = vaddq_f32(sum1, vmulq_f32(d, d1)); + sum0 = vaddq_f32(sum0, vmulq_f32(d, d0)); + } + } + vst1q_f32(autoc, sum0); + vst1q_f32(autoc + 4, sum1); + vst1q_f32(autoc + 8, sum2); +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + int i; + int limit = data_len - 16; + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + (void)lag; + FLAC__ASSERT(lag <= 16); + FLAC__ASSERT(lag <= data_len); + + for (i = 0; i <= limit; i++) + { + float32x4_t d, d0, d1, d2, d3; + d0 = vld1q_f32(data + i); + d1 = vld1q_f32(data + i + 4); + d1 = vld1q_f32(data + i + 8); + d1 = vld1q_f32(data + i + 12); + d = shufffleVector(d0); + sum0 = vaddq_f32(sum0, vmulq_f32(d0, d)); + sum1 = vaddq_f32(sum1, vmulq_f32(d1, d)); + sum2 = vaddq_f32(sum2, vmulq_f32(d2, d)); + sum3 = vaddq_f32(sum3, vmulq_f32(d3, d)); + } + + { + float32x4_t d0 = vdupq_n_f32(0.0f); + float32x4_t d1 = vdupq_n_f32(0.0f); + float32x4_t d2 = vdupq_n_f32(0.0f); + float32x4_t d3 = vdupq_n_f32(0.0f); + limit++; + if (limit < 0) + limit = 0; + + for (i = data_len - 1; i >= limit; i--) + { + float32x4_t d = vld1q_lane_f32(data + i, vdupq_n_f32(0.0f), 0); + d = shufffleVector(d); + + d3 = shufffleVector_2103(d3); + d2 = shufffleVector_2103(d2); + d1 = shufffleVector_2103(d1); + d0 = shufffleVector_2103(d0); + + d3 = vcopyq_laneq_f32(d3, 0, d2, 0); + d2 = vcopyq_laneq_f32(d2, 0, d1, 0); + d1 = vcopyq_laneq_f32(d1, 0, d0, 0); + d0 = vcopyq_laneq_f32(d0, 0, d, 0); + + sum3 = vaddq_f32(sum3, vmulq_f32(d, d3)); + sum2 = vaddq_f32(sum2, vmulq_f32(d, d2)); + sum1 = vaddq_f32(sum1, vmulq_f32(d, d1)); + sum0 = vaddq_f32(sum0, vmulq_f32(d, d0)); + } + } + vst1q_f32(autoc, sum0); + vst1q_f32(autoc + 4, sum1); + vst1q_f32(autoc + 8, sum2); + vst1q_f32(autoc + 12, sum3); +} +#endif /* FLAC__CPU_AARCH64 && FLAC__HAS_ARCH64INTRIN */ +#endif /* FLAC__NO_ASM */ +#endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 74387ec3..cda606f4 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -914,6 +914,18 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; } #endif +#endif +#if defined FLAC__CPU_AARCH64 + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_4; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_12; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_16; + else + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; #endif if(encoder->private_->cpuinfo.use_asm) { # ifdef FLAC__CPU_IA32 -- cgit v1.2.1