diff options
author | Anupam Pandey <anupam.pandey@ittiam.com> | 2023-04-18 14:46:56 +0530 |
---|---|---|
committer | Anupam Pandey <anupam.pandey@ittiam.com> | 2023-05-05 15:55:16 +0530 |
commit | 255ee1888589aa15ae909b992fe123c0358b1730 (patch) | |
tree | d46b2799a29b05c325497d01d2b44b33d456ff1d /vp9 | |
parent | 24802201acd7dfa15928bcc47c1e270e7db5afac (diff) | |
download | libvpx-255ee1888589aa15ae909b992fe123c0358b1730.tar.gz |
Add AVX2 intrinsic for idct16x16 and idct32x32 functions
Added AVX2 intrinsic optimizations for the following functions:
1. vpx_idct16x16_256_add
2. vpx_idct32x32_1024_add
3. vpx_idct32x32_135_add
The module-level scaling relative to the C function (timer based) for the
existing SSE2 and the new AVX2 intrinsics:

| Function Name          | SSE2 scaling | AVX2 scaling |
|------------------------|--------------|--------------|
| vpx_idct32x32_1024_add | 3.62x        | 7.49x        |
| vpx_idct32x32_135_add  | 4.85x        | 9.41x        |
| vpx_idct16x16_256_add  | 4.82x        | 7.70x        |

This is a bit-exact change.
Change-Id: Id9dda933aa1f5093bb6b35ac3b8a41846afca9d2
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_idct.c | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.c | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.h | 2 |
3 files changed, 4 insertions, 2 deletions
```diff
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 69069042c..71be0f310 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob == 1) /* DC only DCT coefficient. */
@@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,

 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   if (eob == 1)
     vpx_idct32x32_1_add(input, dest, stride);
   else if (eob <= 34)
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7db8ed72d..92cd91f1e 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -87,7 +87,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
   row_mt_worker_data->num_sbs = num_sbs;
   for (plane = 0; plane < 3; ++plane) {
     CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
-                    vpx_memalign(16, dqcoeff_size));
+                    vpx_memalign(32, dqcoeff_size));
     memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
     CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
                     vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index b0ef83c73..2e198d552 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -54,7 +54,7 @@ typedef struct TileWorkerData {
   VP9LfSync *lf_sync;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
   DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
```