diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S | 320 |
1 files changed, 252 insertions, 68 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S index 0a0c7768b13..eee3a9636de 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S @@ -124,6 +124,13 @@ endconst .endif .endm +.macro smin_4s r0, r1, r2 + smin \r0\().4s, \r1\().4s, \r2\().4s +.endm +.macro smax_4s r0, r1, r2 + smax \r0\().4s, \r1\().4s, \r2\().4s +.endm + .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 @@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4 \r0, \r2, \r4, \r6 + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + smin_4s \r, \r, v5 +.endr +.irp r, \r0, \r2, \r4, \r6 + smax_4s \r, \r, v4 +.endr + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a - mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a srshr \r1\().4s, v2.4s, #12 // t4a - srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r7\().4s, v3.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a srshr \r5\().4s, v7.4s, #12 // t6a @@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst sqadd v3.4s, \r7\().4s, \r5\().4s // t7 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a - mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 +.irp r, v2, \r1, v3, \r3 + smin_4s \r, \r, v5 +.endr +.irp r, v2, \r1, v3, \r3 + smax_4s \r, \r, v4 +.endr + + mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 - srshr v4.4s, v4.4s, #12 // t5 - srshr v5.4s, v6.4s, #12 // t6 + srshr v7.4s, v7.4s, #12 // t5 + srshr v6.4s, v6.4s, #12 // t6 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 - sqadd \r1\().4s, \r2\().4s, v5.4s // out1 - sqsub v6.4s, \r2\().4s, v5.4s // out6 - sqadd \r2\().4s, \r4\().4s, v4.4s // out2 - sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r1\().4s, \r2\().4s, v6.4s // out1 + sqsub v6.4s, \r2\().4s, v6.4s // out6 + sqadd \r2\().4s, \r4\().4s, v7.4s // out2 + sqsub \r5\().4s, \r4\().4s, v7.4s // out5 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 mov \r6\().16b, v6.16b // out6 @@ -660,8 +683,11 @@ endfunc ld1 {v0.4s}, [x16] + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + sqadd v2.4s, v16.4s, v20.4s // t0 sqsub v3.4s, v16.4s, v20.4s // t4 + mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd v4.4s, v23.4s, v19.4s // t1 sqsub v5.4s, v23.4s, v19.4s // t5 sqadd v6.4s, v18.4s, v22.4s // t2 @@ -669,6 +695,13 @@ endfunc sqadd v18.4s, v21.4s, v17.4s // t3 sqsub v19.4s, v21.4s, v17.4s // t7 +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smax_4s \r, \r, v20 +.endr + mul_mla v16, v3, v5, v0.s[3], v0.s[2] mul_mls v20, v3, v5, v0.s[2], v0.s[3] mul_mls v22, v19, v7, v0.s[3], v0.s[2] @@ -685,12 +718,24 @@ endfunc sqsub v2.4s, v2.4s, v6.4s // t2 sqadd \o7\().4s, v4.4s, v18.4s // out7 sqsub v4.4s, v4.4s, v18.4s // t3 - sqneg \o7\().4s, \o7\().4s // out7 + + mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd \o1\().4s, v3.4s, v7.4s // out1 sqsub v3.4s, v3.4s, v7.4s // t6 sqadd \o6\().4s, v5.4s, v19.4s // out6 sqsub v5.4s, v5.4s, v19.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, v2, v4, v3, v5 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v4, v3, v5 + smax_4s \r, \r, v18 +.endr + + sqneg \o7\().4s, \o7\().4s // out7 sqneg \o1\().4s, \o1\().4s // out1 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) @@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + // idct_8 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smax \r\().4s, \r\().4s, v4.4s +.endr + ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #32 mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a - mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a srshr v17.4s, v2.4s, #12 // t8a - srshr v31.4s, v4.4s, #12 // t15a + srshr v31.4s, v3.4s, #12 // t15a mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a - mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a srshr v23.4s, v6.4s, #12 // t9a srshr v25.4s, v2.4s, #12 // t14a mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a - srshr v21.4s, v4.4s, #12 // t10a + srshr v21.4s, v3.4s, #12 // t10a srshr v27.4s, v6.4s, #12 // t13a - mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a srshr v19.4s, v2.4s, #12 // t11a - srshr v29.4s, v4.4s, #12 // t12a + srshr v29.4s, v3.4s, #12 // t12a ld1 {v0.4s}, [x16] @@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon sqadd v25.4s, v29.4s, v27.4s // t12 sqsub v29.4s, v29.4s, v27.4s // t13 - mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a - srshr v21.4s, v4.4s, #12 // t9a + srshr v21.4s, v7.4s, #12 // t9a srshr v27.4s, v6.4s, #12 // t14a - mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a - srshr v29.4s, v4.4s, #12 // t13a + srshr v29.4s, v7.4s, #12 // t13a neg v6.4s, v6.4s srshr v23.4s, v6.4s, #12 // t10a @@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon sqsub v25.4s, v27.4s, v29.4s // t13 sqadd v27.4s, v27.4s, v29.4s // t14 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a - srshr v4.4s, v4.4s, #12 // t11 - srshr v5.4s, v6.4s, #12 // t12 - mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v7.4s, v7.4s, #12 // t11 + srshr v6.4s, v6.4s, #12 // t12 + mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a - srshr v3.4s, v6.4s, #12 // t13a + srshr v3.4s, v3.4s, #12 // t13a - sqadd v6.4s, v16.4s, v31.4s // out0 + sqadd v1.4s, v16.4s, v31.4s // out0 sqsub v31.4s, v16.4s, v31.4s // out15 - mov v16.16b, v6.16b + mov v16.16b, v1.16b sqadd v23.4s, v30.4s, v17.4s // out7 - sqsub v7.4s, v30.4s, v17.4s // out8 + sqsub v1.4s, v30.4s, v17.4s // out8 sqadd v17.4s, v18.4s, v27.4s // out1 sqsub v30.4s, v18.4s, v27.4s // out14 sqadd v18.4s, v20.4s, v3.4s // out2 sqsub v29.4s, v20.4s, v3.4s // out13 sqadd v3.4s, v28.4s, v19.4s // out6 sqsub v25.4s, v28.4s, v19.4s // out9 - sqadd v19.4s, v22.4s, v5.4s // out3 - sqsub v28.4s, v22.4s, v5.4s // out12 - sqadd v20.4s, v24.4s, v4.4s // out4 - sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v19.4s, v22.4s, v6.4s // out3 + sqsub v28.4s, v22.4s, v6.4s // out12 + sqadd v20.4s, v24.4s, v7.4s // out4 + sqsub v27.4s, v24.4s, v7.4s // out11 sqadd v21.4s, v26.4s, v2.4s // out5 sqsub v26.4s, v26.4s, v2.4s // out10 - mov v24.16b, v7.16b + mov v24.16b, v1.16b mov v22.16b, v3.16b ret @@ -1084,6 +1151,9 @@ endfunc ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v23.4s // t8a sqadd v16.4s, v16.4s, v23.4s // t0a sqsub v3.4s, v31.4s, v24.4s // t9a @@ -1101,6 +1171,13 @@ endfunc sqadd v28.4s, v25.4s, v30.4s // t7a sqsub v25.4s, v25.4s, v30.4s // t15a +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 @@ -1135,6 +1212,13 @@ endfunc sqadd v20.4s, v29.4s, v22.4s // t11a sqsub v29.4s, v29.4s, v22.4s // t15a +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a @@ -1163,24 +1247,34 @@ endfunc sqadd \o15\().4s, v31.4s, v26.4s // out15 mov \o0\().16b, v4.16b .endif - sqneg \o15\().4s, \o15\().4s // out15 sqsub v3.4s, v29.4s, v18.4s // t15a sqadd \o13\().4s, v29.4s, v18.4s // out13 sqadd \o2\().4s, v17.4s, v30.4s // out2 sqsub v26.4s, v17.4s, v30.4s // t14a - sqneg \o13\().4s, \o13\().4s // out13 sqadd \o1\().4s, v19.4s, v27.4s // out1 sqsub v27.4s, v19.4s, v27.4s // t10 sqadd \o14\().4s, v28.4s, v20.4s // out14 sqsub v20.4s, v28.4s, v20.4s // t11 - sqneg \o1\().4s, \o1\().4s // out1 sqadd \o3\().4s, v22.4s, v24.4s // out3 sqsub v22.4s, v22.4s, v24.4s // t6 sqadd \o12\().4s, v25.4s, v23.4s // out12 sqsub v23.4s, v25.4s, v23.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smax_4s \r, \r, v7 +.endr + + sqneg \o15\().4s, \o15\().4s // out15 + sqneg \o13\().4s, \o13\().4s // out13 + sqneg \o1\().4s, \o1\().4s // out1 sqneg \o3\().4s, \o3\().4s // out3 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) @@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 @@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 - mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a - srshr v21.4s, v4.4s, #12 // t17a + srshr v21.4s, v7.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a - mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a - srshr v24.4s, v4.4s, #12 // t29a + srshr v24.4s, v7.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a - mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a - neg v4.4s, v4.4s // -> t22a + neg v7.4s, v7.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a - srshr v17.4s, v4.4s, #12 // t22a + srshr v17.4s, v7.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a sqsub v2.4s, v27.4s, v24.4s // t29 @@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a - mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 - srshr v18.4s, v4.4s, #12 // t18a + srshr v18.4s, v7.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a - mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 - srshr v24.4s, v4.4s, #12 // t28 + srshr v24.4s, v7.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 - mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 - neg v4.4s, v4.4s // -> t21a + neg v7.4s, v7.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a - srshr v20.4s, v4.4s, #12 // t21a + srshr v20.4s, v7.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, v30.4s // t23 @@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 - sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqadd v7.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a - mov v19.16b, v4.16b // out19 + mov v19.16b, v7.16b // out19 - mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 - srshr v20.4s, v4.4s, #12 // t20 + srshr v20.4s, v7.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 - mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 - srshr v26.4s, v4.4s, #12 // t26a + srshr v26.4s, v7.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 - mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 - srshr v25.4s, v4.4s, #12 // t25 + srshr v25.4s, v7.4s, #12 // t25 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a - srshr v23.4s, v4.4s, #12 // t23a + srshr v23.4s, v7.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret @@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 @@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon sqsub v30.4s, v23.4s, v22.4s // t62 sqadd v31.4s, v23.4s, v22.4s // t63 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a - mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a neg v2.4s, v2.4s // t34a mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a srshr v26.4s, v2.4s, #12 // t34a mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a - srshr v29.4s, v4.4s, #12 // t61a + srshr v29.4s, v7.4s, #12 // t61a srshr v25.4s, v6.4s, #12 // t33a srshr v30.4s, v2.4s, #12 // t62a @@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon sqsub v21.4s, v30.4s, v29.4s // t61 sqadd v22.4s, v30.4s, v29.4s // t62 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a - mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 srshr v21.4s, v2.4s, #12 // t61a - srshr v18.4s, v4.4s, #12 // t34a + srshr v18.4s, v7.4s, #12 // t34a mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 srshr v20.4s, v6.4s, #12 // t60 srshr v19.4s, v2.4s, #12 // t35 @@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon sqadd v30.4s, v23.4s, v22.4s // t48 sqsub v31.4s, v23.4s, v22.4s // t55 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a - mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a srshr v25.4s, v2.4s, #12 // t56a - srshr v27.4s, v4.4s, #12 // t39a + srshr v27.4s, v7.4s, #12 // t39a neg v6.4s, v6.4s // t40a mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a srshr v31.4s, v6.4s, #12 // t40a @@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon sqsub v21.4s, v25.4s, v28.4s // t55 sqadd v22.4s, v25.4s, v28.4s // t56 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a - mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 srshr v18.4s, v2.4s, #12 // t40a - srshr v21.4s, v4.4s, #12 // t55a + srshr v21.4s, v7.4s, #12 // t55a mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 srshr v19.4s, v6.4s, #12 // t47 srshr v20.4s, v2.4s, #12 // t48 @@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon bl inv_dct_4s_x16_neon + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + store16 x6 movz16dup_if v0.2s, w16, #2896*8, \scale @@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon mov x9, #-16 + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + .macro store_addsub r0, r1, r2, r3 ld1 {v2.4s}, [x6], #16 ld1 {v3.4s}, [x6], #16 @@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon ld1 {v4.4s}, [x6], #16 sqadd v7.4s, v3.4s, \r1 sqsub \r1, v3.4s, \r1 + smin v6.4s, v6.4s, v1.4s + smin \r0, \r0, v1.4s ld1 {v5.4s}, [x6], #16 sqadd v2.4s, v4.4s, \r2 sub x6, x6, #16*4 + smax v6.4s, v6.4s, v0.4s + smax \r0, \r0, v0.4s sqsub \r2, v4.4s, \r2 + smin v7.4s, v7.4s, v1.4s + smin \r1, \r1, v1.4s st1 {v6.4s}, [x6], #16 st1 {\r0}, [x10], x9 + smin v2.4s, v2.4s, v1.4s + smin \r2, \r2, v1.4s + smax v7.4s, v7.4s, v0.4s + smax \r1, \r1, v0.4s sqadd v3.4s, v5.4s, \r3 sqsub \r3, v5.4s, \r3 + smax v2.4s, v2.4s, v0.4s + smax \r2, \r2, v0.4s + smin v3.4s, v3.4s, v1.4s + smin \r3, \r3, v1.4s st1 {v7.4s}, [x6], #16 st1 {\r1}, [x10], x9 + smax v3.4s, v3.4s, v0.4s + smax \r3, \r3, v0.4s st1 {v2.4s}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.4s}, [x6], #16 @@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon add x6, x6, #4*4*16 movrel x17, idct64_coeffs + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x9, x7, x8, lsl #4 // offset 16 |