summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKonstantinos Margaritis <konstantinos@vectorcamp.gr>2022-10-26 22:09:32 +0000
committerKonstantinos Margaritis <konstantinos@vectorcamp.gr>2022-11-03 17:55:13 +0000
commit3f08aa0d0b2828b670073f808ae079acb35902a4 (patch)
treec5559ff1d0b40d35a36392b76ffef8247bc128d6
parentf02a1191004e6190cfbb6efc38363f9f166d0256 (diff)
downloadlibvpx-3f08aa0d0b2828b670073f808ae079acb35902a4.tar.gz
[NEON] Optimize highbd 32x32 DCT
For --best quality, resulting function vpx_highbd_fdct32x32_rd_neon takes 0.27% of cpu time in profiling, vs 6.27% for the sum of scalar functions: vpx_fdct32, vpx_fdct32.constprop.0, vpx_fdct32x32_rd_c for rd. For --rt quality, the function takes 0.19% vs 4.57% for the scalar version. Overall, this improves encoding time by ~6% compared for highbd for --best and ~9% for --rt. Change-Id: I1ce4bbef6e364bbadc76264056aa3f86b1a8edc5
-rw-r--r--vpx_dsp/arm/fdct32x32_neon.c185
-rw-r--r--vpx_dsp/arm/fdct32x32_neon.h1820
-rw-r--r--vpx_dsp/arm/fdct_neon.h9
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl4
4 files changed, 2013 insertions, 5 deletions
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
index e2bf16760..d6818d2ec 100644
--- a/vpx_dsp/arm/fdct32x32_neon.c
+++ b/vpx_dsp/arm/fdct32x32_neon.c
@@ -230,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
&temp5[29], &temp5[30], &temp5[31]);
store(output + 24 * 32, temp5);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h
index dd647918b..3b9e64c6d 100644
--- a/vpx_dsp/arm/fdct32x32_neon.h
+++ b/vpx_dsp/arm/fdct32x32_neon.h
@@ -143,7 +143,7 @@ static INLINE void scale_input(const int16x8_t *in /*32*/,
out[31] = vshlq_n_s16(in[31], 2);
}
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
@@ -494,7 +494,7 @@ static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
&b##_hi[sub_index]); \
} while (0)
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
int32x4_t c_lo[32];
@@ -800,7 +800,8 @@ static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
}
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+ int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
@@ -1102,4 +1103,1817 @@ static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store 32 32x4 vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+ const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+ const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+ const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+ int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+ left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+ left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+ left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+ left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+ left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+ left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+ left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+ left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+ left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+ left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+ left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+ left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+ left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+ left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+ left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+ left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+ right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+ right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+ right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+ right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+ right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+ right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+ right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+ right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+ right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+ right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+ right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+ right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+ right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+ right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+ right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+ right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+ int32x4_t *a_right /*[32]*/,
+ int32x4_t *b_left /*[32]*/,
+ int32x4_t *b_right /*[32]*/) {
+ // Stage 1. Done as part of the load for the first pass.
+ b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+ b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+ b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+ b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+ b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+ b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+ b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+ b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+ b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+ b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+ b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+ b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+ b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+ b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+ b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+ b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+ b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+ b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+ b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+ b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+ b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+ b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+ b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+ b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+ b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+ b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+ b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+ b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+ b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+ b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+ b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+ b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+ b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+ b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+ b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+ b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+ b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+ b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+ b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+ b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+ b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+ b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+ b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+ b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+ b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+ b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+ b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+ b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+ b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = add_round_shift_s32(left[0]);
+ left[1] = add_round_shift_s32(left[1]);
+ left[2] = add_round_shift_s32(left[2]);
+ left[3] = add_round_shift_s32(left[3]);
+ left[4] = add_round_shift_s32(left[4]);
+ left[5] = add_round_shift_s32(left[5]);
+ left[6] = add_round_shift_s32(left[6]);
+ left[7] = add_round_shift_s32(left[7]);
+ left[8] = add_round_shift_s32(left[8]);
+ left[9] = add_round_shift_s32(left[9]);
+ left[10] = add_round_shift_s32(left[10]);
+ left[11] = add_round_shift_s32(left[11]);
+ left[12] = add_round_shift_s32(left[12]);
+ left[13] = add_round_shift_s32(left[13]);
+ left[14] = add_round_shift_s32(left[14]);
+ left[15] = add_round_shift_s32(left[15]);
+ left[16] = add_round_shift_s32(left[16]);
+ left[17] = add_round_shift_s32(left[17]);
+ left[18] = add_round_shift_s32(left[18]);
+ left[19] = add_round_shift_s32(left[19]);
+ left[20] = add_round_shift_s32(left[20]);
+ left[21] = add_round_shift_s32(left[21]);
+ left[22] = add_round_shift_s32(left[22]);
+ left[23] = add_round_shift_s32(left[23]);
+ left[24] = add_round_shift_s32(left[24]);
+ left[25] = add_round_shift_s32(left[25]);
+ left[26] = add_round_shift_s32(left[26]);
+ left[27] = add_round_shift_s32(left[27]);
+ left[28] = add_round_shift_s32(left[28]);
+ left[29] = add_round_shift_s32(left[29]);
+ left[30] = add_round_shift_s32(left[30]);
+ left[31] = add_round_shift_s32(left[31]);
+
+ right[0] = add_round_shift_s32(right[0]);
+ right[1] = add_round_shift_s32(right[1]);
+ right[2] = add_round_shift_s32(right[2]);
+ right[3] = add_round_shift_s32(right[3]);
+ right[4] = add_round_shift_s32(right[4]);
+ right[5] = add_round_shift_s32(right[5]);
+ right[6] = add_round_shift_s32(right[6]);
+ right[7] = add_round_shift_s32(right[7]);
+ right[8] = add_round_shift_s32(right[8]);
+ right[9] = add_round_shift_s32(right[9]);
+ right[10] = add_round_shift_s32(right[10]);
+ right[11] = add_round_shift_s32(right[11]);
+ right[12] = add_round_shift_s32(right[12]);
+ right[13] = add_round_shift_s32(right[13]);
+ right[14] = add_round_shift_s32(right[14]);
+ right[15] = add_round_shift_s32(right[15]);
+ right[16] = add_round_shift_s32(right[16]);
+ right[17] = add_round_shift_s32(right[17]);
+ right[18] = add_round_shift_s32(right[18]);
+ right[19] = add_round_shift_s32(right[19]);
+ right[20] = add_round_shift_s32(right[20]);
+ right[21] = add_round_shift_s32(right[21]);
+ right[22] = add_round_shift_s32(right[22]);
+ right[23] = add_round_shift_s32(right[23]);
+ right[24] = add_round_shift_s32(right[24]);
+ right[25] = add_round_shift_s32(right[25]);
+ right[26] = add_round_shift_s32(right[26]);
+ right[27] = add_round_shift_s32(right[27]);
+ right[28] = add_round_shift_s32(right[28]);
+ right[29] = add_round_shift_s32(right[29]);
+ right[30] = add_round_shift_s32(right[30]);
+ right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = sub_round_shift_s32(left[0]);
+ left[1] = sub_round_shift_s32(left[1]);
+ left[2] = sub_round_shift_s32(left[2]);
+ left[3] = sub_round_shift_s32(left[3]);
+ left[4] = sub_round_shift_s32(left[4]);
+ left[5] = sub_round_shift_s32(left[5]);
+ left[6] = sub_round_shift_s32(left[6]);
+ left[7] = sub_round_shift_s32(left[7]);
+ left[8] = sub_round_shift_s32(left[8]);
+ left[9] = sub_round_shift_s32(left[9]);
+ left[10] = sub_round_shift_s32(left[10]);
+ left[11] = sub_round_shift_s32(left[11]);
+ left[12] = sub_round_shift_s32(left[12]);
+ left[13] = sub_round_shift_s32(left[13]);
+ left[14] = sub_round_shift_s32(left[14]);
+ left[15] = sub_round_shift_s32(left[15]);
+ left[16] = sub_round_shift_s32(left[16]);
+ left[17] = sub_round_shift_s32(left[17]);
+ left[18] = sub_round_shift_s32(left[18]);
+ left[19] = sub_round_shift_s32(left[19]);
+ left[20] = sub_round_shift_s32(left[20]);
+ left[21] = sub_round_shift_s32(left[21]);
+ left[22] = sub_round_shift_s32(left[22]);
+ left[23] = sub_round_shift_s32(left[23]);
+ left[24] = sub_round_shift_s32(left[24]);
+ left[25] = sub_round_shift_s32(left[25]);
+ left[26] = sub_round_shift_s32(left[26]);
+ left[27] = sub_round_shift_s32(left[27]);
+ left[28] = sub_round_shift_s32(left[28]);
+ left[29] = sub_round_shift_s32(left[29]);
+ left[30] = sub_round_shift_s32(left[30]);
+ left[31] = sub_round_shift_s32(left[31]);
+
+ right[0] = sub_round_shift_s32(right[0]);
+ right[1] = sub_round_shift_s32(right[1]);
+ right[2] = sub_round_shift_s32(right[2]);
+ right[3] = sub_round_shift_s32(right[3]);
+ right[4] = sub_round_shift_s32(right[4]);
+ right[5] = sub_round_shift_s32(right[5]);
+ right[6] = sub_round_shift_s32(right[6]);
+ right[7] = sub_round_shift_s32(right[7]);
+ right[8] = sub_round_shift_s32(right[8]);
+ right[9] = sub_round_shift_s32(right[9]);
+ right[10] = sub_round_shift_s32(right[10]);
+ right[11] = sub_round_shift_s32(right[11]);
+ right[12] = sub_round_shift_s32(right[12]);
+ right[13] = sub_round_shift_s32(right[13]);
+ right[14] = sub_round_shift_s32(right[14]);
+ right[15] = sub_round_shift_s32(right[15]);
+ right[16] = sub_round_shift_s32(right[16]);
+ right[17] = sub_round_shift_s32(right[17]);
+ right[18] = sub_round_shift_s32(right[18]);
+ right[19] = sub_round_shift_s32(right[19]);
+ right[20] = sub_round_shift_s32(right[20]);
+ right[21] = sub_round_shift_s32(right[21]);
+ right[22] = sub_round_shift_s32(right[22]);
+ right[23] = sub_round_shift_s32(right[23]);
+ right[24] = sub_round_shift_s32(right[24]);
+ right[25] = sub_round_shift_s32(right[25]);
+ right[26] = sub_round_shift_s32(right[26]);
+ right[27] = sub_round_shift_s32(right[27]);
+ right[28] = sub_round_shift_s32(right[28]);
+ right[29] = sub_round_shift_s32(right[29]);
+ right[30] = sub_round_shift_s32(right[30]);
+ right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+// Second pass of the high-bitdepth 32x32 DCT: applies stages 2-7 and the
+// final output stage to one 8x32 block kept as 32 rows of 32-bit lanes.
+// 'left' holds the low four lanes and 'right' the high four lanes of each
+// row (the /*32*/ annotations mark the expected array lengths). Results are
+// written back into left/right in the coefficient order of the final stage.
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+                                                   int32x4_t *right /*32*/) {
+ // Ping-pong temporaries: each stage reads one pair (al/ar or bl/br) and
+ // writes the other.
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ // Rows 16-19 and 28-31 pass through stage 2 unchanged; only 20-27 get the
+ // half-butterfly below.
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                              cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                              cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                              cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                              cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                              &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                              &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ // NOTE: left[16..19] and left[28..31] are read directly here; al[16..19]
+ // and al[28..31] are plain copies of them from stage 2, so this is
+ // equivalent to reading al[]/ar[].
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                              &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ // The s64 "narrow" butterflies keep full 64-bit precision internally for
+ // the non-rd pass.
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                    cospi_24_64, &al[29], &ar[29], &al[18],
+                                    &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                    cospi_24_64, &al[28], &ar[28], &al[19],
+                                    &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                    cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                    &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                    cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                    &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                              &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                    cospi_24_64, &bl[2], &br[2], &bl[3],
+                                    &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                    cospi_24_64, &bl[14], &br[14], &bl[9],
+                                    &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                    cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                    &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                    cospi_28_64, &al[4], &ar[4], &al[7],
+                                    &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                    cospi_12_64, &al[5], &ar[5], &al[6],
+                                    &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                    cospi_28_64, &al[30], &ar[30], &al[17],
+                                    &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                    cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                    &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                    cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                    &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                    cospi_12_64, -cospi_20_64, &al[25],
+                                    &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                    cospi_30_64, &bl[8], &br[8], &bl[15],
+                                    &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                    cospi_14_64, &bl[9], &br[9], &bl[14],
+                                    &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                    cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                    &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                    cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                    &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ // bl[0..15] are stored directly to the even output positions; the odd
+ // output positions come from the two-coefficient butterflies on bl[16..31]
+ // below.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                    cospi_31_64, &al[1], &ar[1], &al[31],
+                                    &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                    cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                    &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                    cospi_23_64, &al[9], &ar[9], &al[23],
+                                    &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                    cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                    &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                    cospi_27_64, &al[5], &ar[5], &al[27],
+                                    &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                    cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                    &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                    cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                    &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                    cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                    &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index 1ea948b3f..b33da427b 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -355,4 +355,13 @@ static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
}
// Add 2 if positive, 1 if negative, and shift by 2.
// Equivalently: subtract 1 from negative lanes, then do a rounding shift.
static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
  // Arithmetic shift right by 31 gives -1 for negative lanes and 0 otherwise,
  // so adding it subtracts the sign bit without any reinterpret casts.
  const int32x4_t sign_mask = vshrq_n_s32(a, 31);
  // vrshrq_n_s32 rounds by adding 2 before the shift (SRSHR semantics).
  return vrshrq_n_s32(vaddq_s32(a, sign_mask), 2);
}
+
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 68244ea5a..d55ab67ce 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -568,10 +568,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_fdct16x16_1 neon/;
add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32 sse2/;
+ specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+ specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct32x32_1 neon/;