summaryrefslogtreecommitdiff
path: root/libavutil/mips
diff options
context:
space:
mode:
authorShiyou Yin <yinshiyou-hf@loongson.cn>2019-07-09 20:43:37 +0800
committerMichael Niedermayer <michael@niedermayer.cc>2019-07-10 12:54:57 +0200
commita45e8ade2d2d46fde48ee0567ab18e23dc8c71d1 (patch)
tree1ca939b0771080f7b9402acfd4c0cd79f5f52035 /libavutil/mips
parent24f7a8a1688f88af153de4587de50cbf3084ee7d (diff)
downloadffmpeg-a45e8ade2d2d46fde48ee0567ab18e23dc8c71d1.tar.gz
avutil/mips: optimize UNPCK&SAD macros with MSA2.0 instruction.
Loongson 3A4000 and 2K1000 support MSA2.0. This patch optimizes the SAD_UB2_UH, UNPCK_R_SH_SW, UNPCK_SB_SH and UNPCK_SH_SW macros with MSA2.0 instructions. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavutil/mips')
-rw-r--r--libavutil/mips/generic_macros_msa.h42
1 files changed, 39 insertions, 3 deletions
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 6a46704663..a3774281f9 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -23,6 +23,11 @@
#include <stdint.h>
#include <msa.h>
+#include <config.h>
+
+#if HAVE_MSA2
+#include <msa2.h>
+#endif
#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
@@ -1234,6 +1239,15 @@
unsigned absolute diff values, even-odd pairs are added
together to generate 8 halfword results.
*/
+#if HAVE_MSA2
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+( { /* MSA2 variant of SAD_UB2_UH; the statement expression evaluates to sad_m */ \
+ v8u16 sad_m = { 0 }; /* halfword accumulator, starts at zero */ \
+ sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); /* add the in0/ref0 contribution; presumably adjacent-pair unsigned abs-diff sums widened to halfwords - confirm against MSA2 docs */ \
+ sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); /* same builtin applied to the in1/ref1 pair */ \
+ sad_m; \
+} )
+#else
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
v16u8 diff0_m, diff1_m; \
@@ -1247,6 +1261,7 @@
\
sad_m; \
} )
+#endif // #if HAVE_MSA2
/* Description : Insert specified word elements from input vectors to 1
destination vector
@@ -2287,6 +2302,12 @@
extracted and interleaved with same vector 'in0' to generate
4 word elements keeping sign intact
*/
+#if HAVE_MSA2
+#define UNPCK_R_SH_SW(in, out) \
+{ /* MSA2 variant: one builtin call instead of the clti_s_h + ilvr_h pair used when HAVE_MSA2 is 0 */ \
+ out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); /* presumably sign-extends the right (low) halfwords of 'in' to words - confirm against MSA2 docs */ \
+}
+#else
#define UNPCK_R_SH_SW(in, out) \
{ \
v8i16 sign_m; \
@@ -2294,6 +2315,7 @@
sign_m = __msa_clti_s_h((v8i16) in, 0); \
out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
}
+#endif // #if HAVE_MSA2
/* Description : Sign extend byte elements from input vector and return
halfword results in pair of vectors
@@ -2306,6 +2328,13 @@
Then interleaved left with same vector 'in0' to
generate 8 signed halfword elements in 'out1'
*/
+#if HAVE_MSA2
+#define UNPCK_SB_SH(in, out0, out1) \
+{ /* MSA2 variant: widen the signed bytes of 'in' into two halfword vectors; results are cast to v8i16 to match the documented halfword outputs and the ILVRL_B2_SH fallback */ \
+ out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
+ out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
+}
+#else
#define UNPCK_SB_SH(in, out0, out1) \
{ \
v16i8 tmp_m; \
@@ -2313,6 +2342,7 @@
tmp_m = __msa_clti_s_b((v16i8) in, 0); \
ILVRL_B2_SH(tmp_m, in, out0, out1); \
}
+#endif // #if HAVE_MSA2
/* Description : Zero extend unsigned byte elements to halfword elements
Arguments : Inputs - in (1 input unsigned byte vector)
@@ -2339,6 +2369,13 @@
Then interleaved left with same vector 'in0' to
generate 4 signed word elements in 'out1'
*/
+#if HAVE_MSA2
+#define UNPCK_SH_SW(in, out0, out1) \
+{ /* MSA2 variant: two builtin calls instead of the clti_s_h + ILVRL_H2_SW sequence used when HAVE_MSA2 is 0 */ \
+ out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); /* presumably the sign-extended right (low) halfwords - confirm against MSA2 docs */ \
+ out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); /* presumably the sign-extended left (high) halfwords - confirm against MSA2 docs */ \
+}
+#else
#define UNPCK_SH_SW(in, out0, out1) \
{ \
v8i16 tmp_m; \
@@ -2346,6 +2383,7 @@
tmp_m = __msa_clti_s_h((v8i16) in, 0); \
ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
+#endif // #if HAVE_MSA2
/* Description : Swap two variables
Arguments : Inputs - in0, in1
@@ -2850,13 +2888,11 @@
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
( { \
- v8i16 tmp1_m; \
v8i16 out0_m; \
\
out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
- tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2); \
- out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
+ out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); /* third in2/coeff2 term is now accumulated by dpadd directly, dropping the tmp1_m temporary; NOTE(review): the removed path combined it with __msa_adds_s_h (name suggests a saturating add) - confirm dpadd gives equivalent results for the value ranges callers use */ \
\
out0_m; \
} )