summary | refs | log | tree | commit | diff
path: root/vp9
diff options
context:
space:
mode:
author: Anupam Pandey <anupam.pandey@ittiam.com> 2023-04-18 14:46:56 +0530
committer: Anupam Pandey <anupam.pandey@ittiam.com> 2023-05-05 15:55:16 +0530
commit: 255ee1888589aa15ae909b992fe123c0358b1730 (patch)
tree: d46b2799a29b05c325497d01d2b44b33d456ff1d /vp9
parent: 24802201acd7dfa15928bcc47c1e270e7db5afac (diff)
download: libvpx-255ee1888589aa15ae909b992fe123c0358b1730.tar.gz
Add AVX2 intrinsic for idct16x16 and idct32x32 functions
Added AVX2 intrinsic optimization for the following functions:
  1. vpx_idct16x16_256_add
  2. vpx_idct32x32_1024_add
  3. vpx_idct32x32_135_add

The module-level scaling w.r.t. the C function (timer based) for the
existing (SSE2) and new AVX2 intrinsics:

                               Scaling
  Function Name             SSE2     AVX2
  vpx_idct32x32_1024_add    3.62x    7.49x
  vpx_idct32x32_135_add     4.85x    9.41x
  vpx_idct16x16_256_add     4.82x    7.70x

This is a bit-exact change.

Change-Id: Id9dda933aa1f5093bb6b35ac3b8a41846afca9d2
Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_idct.c      2
-rw-r--r--  vp9/decoder/vp9_decoder.c  2
-rw-r--r--  vp9/decoder/vp9_decoder.h  2
3 files changed, 4 insertions, 2 deletions
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 69069042c..71be0f310 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
+ assert(((intptr_t)input) % 32 == 0);
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob == 1) /* DC only DCT coefficient. */
@@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
+ assert(((intptr_t)input) % 32 == 0);
if (eob == 1)
vpx_idct32x32_1_add(input, dest, stride);
else if (eob <= 34)
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7db8ed72d..92cd91f1e 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -87,7 +87,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
row_mt_worker_data->num_sbs = num_sbs;
for (plane = 0; plane < 3; ++plane) {
CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
- vpx_memalign(16, dqcoeff_size));
+ vpx_memalign(32, dqcoeff_size));
memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index b0ef83c73..2e198d552 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -54,7 +54,7 @@ typedef struct TileWorkerData {
VP9LfSync *lf_sync;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
struct vpx_internal_error_info error_info;
} TileWorkerData;