Add ARM64 NEON intrinsics lpc_compute_autocorrelation routines

author: Martijn van Beurden <mvanb1@gmail.com> 2022-04-29 16:26:41 +0200
committer: Martijn van Beurden <mvanb1@gmail.com> 2022-04-29 21:44:29 +0200
commit: ef4ad99231d3b67891c0f6d21b36859c221622a7 (patch)
tree: 07e71be203416e0c4f6c29063e722657cc5c61fe /src
parent: bfe5ff9455ca62d7827e14a22fbb1522a63c4e3e (diff)
download: flac-ef4ad99231d3b67891c0f6d21b36859c221622a7.tar.gz
4 files changed, 113 insertions, 3 deletions
diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c
new file mode 100644
index 00000000..4df3aee9
--- /dev/null
+++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c
@@ -0,0 +1,70 @@
+	int i;
+	float64x2_t sum0 = vdupq_n_f64(0.0f);
+	float64x2_t sum1 = vdupq_n_f64(0.0f);
+	float64x2_t sum2 = vdupq_n_f64(0.0f);
+	float64x2_t sum3 = vdupq_n_f64(0.0f);
+	float64x2_t d0 = vdupq_n_f64(0.0f);
+	float64x2_t d1 = vdupq_n_f64(0.0f);
+	float64x2_t d2 = vdupq_n_f64(0.0f);
+	float64x2_t d3 = vdupq_n_f64(0.0f);
+#if MAX_LAG > 8
+	float64x2_t sum4 = vdupq_n_f64(0.0f);
+	float64x2_t d4 = vdupq_n_f64(0.0f);
+#endif
+#if MAX_LAG > 10
+	float64x2_t sum5 = vdupq_n_f64(0.0f);
+	float64x2_t sum6 = vdupq_n_f64(0.0f);
+	float64x2_t d5 = vdupq_n_f64(0.0f);
+	float64x2_t d6 = vdupq_n_f64(0.0f);
+#endif
+	float64x2_t d;
+
+	(void)lag;
+	FLAC__ASSERT(lag <= MAX_LAG);
+
+	// Loop backwards through samples from data_len to 0
+	for (i = data_len - 1; i >= 0; i--)
+	{
+		d = vdupq_n_f64(data[i]); // Create vector with 2 entries data[i]
+
+		// The next 6 lines of code right-shift the elements through the 7 vectors d0..d6.
+		// The 7th line adds the newly loaded element to d0. This works like a stack, where
+		// data[i] is pushed onto the stack every time and the 9th element falls off
+#if MAX_LAG > 10
+		d6 = vextq_f64(d5,d6,1);
+		d5 = vextq_f64(d4,d5,1);
+#endif
+#if MAX_LAG > 8
+		d4 = vextq_f64(d3,d4,1);
+#endif
+		d3 = vextq_f64(d2,d3,1);
+		d2 = vextq_f64(d1,d2,1);
+		d1 = vextq_f64(d0,d1,1);
+		d0 = vextq_f64(d,d0,1);
+
+		// Fused multiply-add sum += d * d0..d6
+		sum0 = vfmaq_f64(sum0, d, d0);
+		sum1 = vfmaq_f64(sum1, d, d1);
+		sum2 = vfmaq_f64(sum2, d, d2);
+		sum3 = vfmaq_f64(sum3, d, d3);
+#if MAX_LAG > 8
+		sum4 = vfmaq_f64(sum4, d, d4);
+#endif
+#if MAX_LAG > 10
+		sum5 = vfmaq_f64(sum5, d, d5);
+		sum6 = vfmaq_f64(sum6, d, d6);
+#endif
+	}
+
+    // Store sum0..sum6 in autoc[0..14]
+    vst1q_f64(autoc, sum0);
+    vst1q_f64(autoc + 2, sum1);
+    vst1q_f64(autoc + 4, sum2);
+    vst1q_f64(autoc + 6, sum3);
+#if MAX_LAG > 8
+    vst1q_f64(autoc + 8, sum4);
+#endif
+#if MAX_LAG > 10
+    vst1q_f64(autoc + 10, sum5);
+    vst1q_f64(autoc + 12, sum6);
+#endif
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index a9076903..0e619c1d 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -89,7 +89,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real
 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
 #endif
 #endif
+#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
 #endif
+#endif /* FLAC__NO_ASM */
 
 /*
  *	FLAC__lpc_compute_lp_coefficients()
diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c
index ab8f71ea..eedc2f69 100644
--- a/src/libFLAC/lpc_intrin_neon.c
+++ b/src/libFLAC/lpc_intrin_neon.c
@@ -41,6 +41,30 @@
 #include "private/macros.h"
 #include <arm_neon.h>
 
+#ifdef FLAC__HAS_A64NEONINTRIN
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
+{
+#undef MAX_LAG
+#define MAX_LAG 14
+#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
+{
+#undef MAX_LAG
+#define MAX_LAG 10
+#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
+}
+
+void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
+{
+#undef MAX_LAG
+#define MAX_LAG 8
+#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
+}
+
+#endif /* ifdef FLAC__HAS_A64NEONINTRIN */
+
 
 #define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
                         summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
@@ -57,11 +81,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
 {
     int i;
     FLAC__int32 sum;
+    int32x4_t tmp_vec[20];
+
     FLAC__ASSERT(order > 0);
     FLAC__ASSERT(order <= 32);
 
-    int32x4_t tmp_vec[20];
-
     // Using prologue reads is valid as encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order,....)
     if(order <= 12) {
         if(order > 8) {
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index 38b19486..0aa98a21 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -907,6 +907,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 	}
 #endif
 #endif
+#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN
+	if(encoder->protected_->max_lpc_order < 8)
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8;
+	else if(encoder->protected_->max_lpc_order < 10)
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10;
+	else if(encoder->protected_->max_lpc_order < 14)
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14;
+	else
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
+#endif
 	if(encoder->private_->cpuinfo.use_asm) {
 #  ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
@@ -1004,7 +1014,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #   endif /* FLAC__HAS_X86INTRIN */
 #  endif /* FLAC__CPU_... */
 
-	#if defined FLAC__CPU_ARM64
+	#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN
+
     encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
     encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
     encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
author	Martijn van Beurden <mvanb1@gmail.com>	2022-04-29 16:26:41 +0200
committer	Martijn van Beurden <mvanb1@gmail.com>	2022-04-29 21:44:29 +0200
commit	ef4ad99231d3b67891c0f6d21b36859c221622a7 (patch)
tree	07e71be203416e0c4f6c29063e722657cc5c61fe /src
parent	bfe5ff9455ca62d7827e14a22fbb1522a63c4e3e (diff)
download	flac-ef4ad99231d3b67891c0f6d21b36859c221622a7.tar.gz