Diffstat (limited to 'vpx_dsp/arm/vpx_convolve8_neon.h')
-rw-r--r--  vpx_dsp/arm/vpx_convolve8_neon.h | 44
1 file changed, 27 insertions(+), 17 deletions(-)
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 07cf8242d..2f78583af 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -15,10 +15,20 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#if VPX_ARCH_AARCH64 && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h);
+#endif
#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
const int8x16_t samples_hi,
const int32x4_t correction,
const int8x8_t filters) {
@@ -29,11 +39,11 @@ static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
-static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
const int8x8_t filters,
const int32x4_t correction,
const uint8x16_t range_limit,
@@ -54,8 +64,8 @@ static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
@@ -78,7 +88,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
@@ -111,14 +121,14 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
const uint8x16_t samples_hi,
const int8x8_t filters) {
/* Sample permutation is performed by the caller. */
@@ -127,11 +137,11 @@ static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
-static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
const int8x8_t filters,
const uint8x16x2_t permute_tbl) {
uint8x16_t permuted_samples[2];
@@ -147,8 +157,8 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
- /* Narrowing and packing is performed by the caller. */
- return sum;
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
}
static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
@@ -169,7 +179,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
@@ -196,7 +206,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
/* Narrow and re-pack. */
sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
@@ -238,7 +248,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
- return vqrshrun_n_s16(sum, 7);
+ return vqrshrun_n_s16(sum, FILTER_BITS);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
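
For context (not part of the diff above): after this change the 4-wide dot-product helpers return int16x4_t values that have already been narrowed with vqmovn_s32, so a caller only has to pack pairs of results and apply the final saturating rounding shift by FILTER_BITS (defined as 7 in vpx_dsp/vpx_filter.h). A minimal caller-side sketch under that assumption follows; the wrapper function and the s0_lo/s0_hi/s1_lo/s1_hi variable names are hypothetical and only illustrate the intended usage pattern.

#include <arm_neon.h>
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"

/* Hypothetical caller fragment (requires __ARM_FEATURE_DOTPROD): s0_lo/s0_hi
 * and s1_lo/s1_hi are pre-permuted sample vectors for two output rows;
 * correction and filters are prepared as expected by the helpers above. */
static uint8x8_t example_pack_two_rows(int8x16_t s0_lo, int8x16_t s0_hi,
                                       int8x16_t s1_lo, int8x16_t s1_hi,
                                       int32x4_t correction,
                                       int8x8_t filters) {
  /* Each helper returns a 4-lane result already narrowed to 16 bits. */
  int16x4_t t0 = convolve8_4_sdot_partial(s0_lo, s0_hi, correction, filters);
  int16x4_t t1 = convolve8_4_sdot_partial(s1_lo, s1_hi, correction, filters);
  /* Final packing: saturating rounding narrow by FILTER_BITS (== 7). */
  return vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
}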