Diffstat (limited to 'chromium/third_party/dav1d')
-rw-r--r--  chromium/third_party/dav1d/dav1d_generated.gni | 5
-rw-r--r--  chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml | 73
-rw-r--r--  chromium/third_party/dav1d/libdav1d/NEWS | 30
-rw-r--r--  chromium/third_party/dav1d/libdav1d/doc/meson.build | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c | 767
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c | 123
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h | 61
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h | 132
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c | 723
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c | 164
-rw-r--r--  chromium/third_party/dav1d/libdav1d/examples/meson.build | 30
-rw-r--r--  chromium/third_party/dav1d/libdav1d/gcovr.cfg | 3
-rw-r--r--  chromium/third_party/dav1d/libdav1d/meson.build | 14
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S | 3386
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S | 575
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/32/util.S | 32
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S | 280
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S | 3526
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S | 132
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/util.S | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/asm.S | 15
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c | 52
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/msac.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/decode.c | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/getbits.c | 34
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/itx.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/log.c | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/log.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/meson.build | 15
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/msac.c | 3
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c | 10
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/refmvs.c | 11
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/tables.c | 23
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/tables.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm | 8
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm | 30
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx.asm | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm | 227
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm | 47
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc.asm | 1949
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c | 110
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc_sse.asm (renamed from chromium/third_party/dav1d/libdav1d/src/x86/mc_ssse3.asm) | 1479
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/msac.asm | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c | 3
-rw-r--r--  chromium/third_party/dav1d/libdav1d/tools/dav1d.c | 8
-rw-r--r--  chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/tools/input/input.c | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/tools/input/ivf.c | 24
-rw-r--r--  chromium/third_party/dav1d/libdav1d/tools/input/parse.h | 40
52 files changed, 12595 insertions, 1611 deletions
diff --git a/chromium/third_party/dav1d/dav1d_generated.gni b/chromium/third_party/dav1d/dav1d_generated.gni
index 8becac02308..9e15c31fc28 100644
--- a/chromium/third_party/dav1d/dav1d_generated.gni
+++ b/chromium/third_party/dav1d/dav1d_generated.gni
@@ -20,7 +20,7 @@ x86_asm_sources = [
"libdav1d/src/x86/looprestoration.asm",
"libdav1d/src/x86/looprestoration_ssse3.asm",
"libdav1d/src/x86/mc.asm",
- "libdav1d/src/x86/mc_ssse3.asm",
+ "libdav1d/src/x86/mc_sse.asm",
"libdav1d/src/x86/msac.asm",
]
@@ -37,9 +37,11 @@ x86_template_sources = [
arm32_asm_sources = [
"libdav1d/src/arm/32/cdef.S",
"libdav1d/src/arm/32/ipred.S",
+ "libdav1d/src/arm/32/itx.S",
"libdav1d/src/arm/32/loopfilter.S",
"libdav1d/src/arm/32/looprestoration.S",
"libdav1d/src/arm/32/mc.S",
+ "libdav1d/src/arm/32/msac.S",
"libdav1d/src/arm/32/util.S",
]
@@ -50,6 +52,7 @@ arm64_asm_sources = [
"libdav1d/src/arm/64/ipred.S",
"libdav1d/src/arm/64/ipred16.S",
"libdav1d/src/arm/64/itx.S",
+ "libdav1d/src/arm/64/itx16.S",
"libdav1d/src/arm/64/loopfilter.S",
"libdav1d/src/arm/64/loopfilter16.S",
"libdav1d/src/arm/64/looprestoration.S",
diff --git a/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml b/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml
index bdef928a40d..c921b6a122f 100644
--- a/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml
+++ b/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml
@@ -4,7 +4,7 @@ stages:
- test
.debian-amd64-common:
- image: registry.videolan.org/dav1d-debian-unstable:20200306210534
+ image: registry.videolan.org/dav1d-debian-unstable:20200602183013
stage: build
tags:
- docker
@@ -52,6 +52,7 @@ stages:
- docker
- amd64
+
style-check:
extends: .debian-amd64-common
stage: style
@@ -80,6 +81,7 @@ style-check:
fi;
done
+
build-debian:
extends: .debian-amd64-common
tags:
@@ -91,6 +93,10 @@ build-debian:
--werror
- ninja -C build
- cd build && meson test -v
+ artifacts:
+ paths:
+ - build/
+ expire_in: 1 day
build-debian-static:
extends: .debian-amd64-common
@@ -110,6 +116,10 @@ build-debian32:
--cross-file package/crossfiles/i686-linux32.meson
- ninja -C build
- cd build && meson test -v
+ artifacts:
+ paths:
+ - build/
+ expire_in: 1 day
build-debian-examples:
extends: .debian-amd64-common
@@ -331,6 +341,7 @@ build-debian-ppc64le:
- ninja -C build
- cd build && meson test -v
+
.test-common:
stage: test
cache:
@@ -344,6 +355,25 @@ build-debian-ppc64le:
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
dependencies: []
+.test-asm-common:
+ extends:
+ - .debian-amd64-common
+ - .test-common
+ tags:
+ - docker
+ - amd64
+ - avx2
+ script:
+ - meson configure build -Dtestdata_tests=true
+ - cd build
+ - exit_code=0
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2" || exit_code=$((exit_code + $?))
+ - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
+
test-debian:
extends:
- .debian-amd64-common
@@ -353,8 +383,48 @@ test-debian:
- meson build --buildtype release
-Dtestdata_tests=true
-Dlogging=false
+ -Db_coverage=true
- ninja -C build
- cd build && time meson test -v
+ - ninja coverage-html
+ - mv meson-logs/coveragereport ../coverage
+ - ninja coverage-xml
+ - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 |
+ grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } '
+ coverage: '/^coverage: (\d+.\d+)$/'
+ artifacts:
+ expose_as: 'Coverage HTML report'
+ paths:
+ - coverage/
+ reports:
+ cobertura: build/meson-logs/coverage.xml
+
+test-debian-asm:
+ extends:
+ - .test-asm-common
+ needs: ["build-debian"]
+ dependencies: ["build-debian"]
+
+test-debian32-asm:
+ extends:
+ - .test-asm-common
+ needs: ["build-debian32"]
+ dependencies: ["build-debian32"]
+
+test-debian-mt:
+ extends:
+ - .debian-amd64-common
+ - .test-common
+ needs: ["build-debian"]
+ dependencies: ["build-debian"]
+ script:
+ - meson configure build -Dtestdata_tests=true
+ - cd build
+ - exit_code=0
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?))
+ - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?))
+ - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-debian-unaligned-stack:
extends:
@@ -482,6 +552,7 @@ test-debian-armv7-clang-5:
- ninja -C build
- cd build && time meson test -v
+
.pages-common:
extends: .debian-amd64-common
script:
diff --git a/chromium/third_party/dav1d/libdav1d/NEWS b/chromium/third_party/dav1d/libdav1d/NEWS
index 46695fd7ea2..1294dc52caf 100644
--- a/chromium/third_party/dav1d/libdav1d/NEWS
+++ b/chromium/third_party/dav1d/libdav1d/NEWS
@@ -1,3 +1,33 @@
+Changes for 0.7.1 'Frigatebird':
+------------------------------
+
+0.7.1 is a minor update on 0.7.0:
+ - ARM32 NEON optimizations for itxfm, which can give up to 28% speedup, and MSAC
+ - SSE2 optimizations for prep_bilin and prep_8tap
+ - AVX2 optimizations for MC scaled
+ - Fix a clamping issue in motion vector projection
+ - Fix an issue on some specific Haswell CPU on ipred_z AVX2 functions
+ - Improvements on the dav1dplay utility player to support resizing
+
+
+Changes for 0.7.0 'Frigatebird':
+------------------------------
+
+0.7.0 is a major release for dav1d:
+ - Faster refmv implementation gaining up to 12% speed while -25% of RAM (Single Thread)
+ - 10b/12b ARM64 optimizations are mostly complete:
+ - ipred (paeth, smooth, dc, pal, filter, cfl)
+ - itxfm (only 10b)
+ - AVX2/SSSE3 for non-4:2:0 film grain and for mc.resize
+ - AVX2 for cfl4:4:4
+ - AVX-512 CDEF filter
+ - ARM64 8b improvements for cfl_ac and itxfm
+ - ARM64 implementation for emu_edge in 8b/10b/12b
+ - ARM32 implementation for emu_edge in 8b
+ - Improvements on the dav1dplay utility player to support 10 bit,
+ non-4:2:0 pixel formats and film grain on the GPU
+
+
Changes for 0.6.0 'Gyrfalcon':
------------------------------
diff --git a/chromium/third_party/dav1d/libdav1d/doc/meson.build b/chromium/third_party/dav1d/libdav1d/doc/meson.build
index 4badbf6ea91..0ef7123448a 100644
--- a/chromium/third_party/dav1d/libdav1d/doc/meson.build
+++ b/chromium/third_party/dav1d/libdav1d/doc/meson.build
@@ -27,8 +27,8 @@ dot = find_program('dot', required: false)
if doxygen.found() and dot.found()
conf_data = configuration_data()
- conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d'))
- conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include'))
+ conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d'))
+ conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include'))
conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir())
doxyfile = configure_file(input: 'Doxyfile.in',
output: 'Doxyfile',
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c b/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c
index bcd4835b320..d6bb262b56c 100644
--- a/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c
+++ b/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c
@@ -29,687 +29,18 @@
#include <getopt.h>
#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
#include <SDL.h>
-#include "common/attributes.h"
-
#include "dav1d/dav1d.h"
+#include "common/attributes.h"
#include "tools/input/input.h"
+#include "dp_fifo.h"
+#include "dp_renderer.h"
-/**
- * Settings structure
- * Hold all settings available for the player,
- * this is usually filled by parsing arguments
- * from the console.
- */
-typedef struct {
- const char *inputfile;
- int highquality;
- int untimed;
- int zerocopy;
-} Dav1dPlaySettings;
-
-#define WINDOW_WIDTH 910
-#define WINDOW_HEIGHT 512
-
-#define DAV1D_EVENT_NEW_FRAME 1
-#define DAV1D_EVENT_DEC_QUIT 2
-
-/*
- * Fifo helper functions
- */
-typedef struct dp_fifo
-{
- SDL_mutex *lock;
- SDL_cond *cond_change;
- size_t capacity;
- size_t count;
- void **entries;
-} Dav1dPlayPtrFifo;
-
-static void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
-{
- assert(fifo->count == 0);
- SDL_DestroyMutex(fifo->lock);
- SDL_DestroyCond(fifo->cond_change);
- free(fifo->entries);
- free(fifo);
-}
-
-static Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
-{
- Dav1dPlayPtrFifo *fifo;
-
- assert(capacity > 0);
- if (capacity <= 0)
- return NULL;
-
- fifo = malloc(sizeof(*fifo));
- if (fifo == NULL)
- return NULL;
-
- fifo->capacity = capacity;
- fifo->count = 0;
-
- fifo->lock = SDL_CreateMutex();
- if (fifo->lock == NULL) {
- free(fifo);
- return NULL;
- }
- fifo->cond_change = SDL_CreateCond();
- if (fifo->cond_change == NULL) {
- SDL_DestroyMutex(fifo->lock);
- free(fifo);
- return NULL;
- }
-
- fifo->entries = calloc(capacity, sizeof(void*));
- if (fifo->entries == NULL) {
- dp_fifo_destroy(fifo);
- return NULL;
- }
-
- return fifo;
-}
-
-static void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
-{
- SDL_LockMutex(fifo->lock);
- while (fifo->count == fifo->capacity)
- SDL_CondWait(fifo->cond_change, fifo->lock);
- fifo->entries[fifo->count++] = element;
- if (fifo->count == 1)
- SDL_CondSignal(fifo->cond_change);
- SDL_UnlockMutex(fifo->lock);
-}
-
-static void *dp_fifo_array_shift(void **arr, size_t len)
-{
- void *shifted_element = arr[0];
- for (size_t i = 1; i < len; ++i)
- arr[i-1] = arr[i];
- return shifted_element;
-}
-
-static void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
-{
- SDL_LockMutex(fifo->lock);
- while (fifo->count == 0)
- SDL_CondWait(fifo->cond_change, fifo->lock);
- void *res = dp_fifo_array_shift(fifo->entries, fifo->count--);
- if (fifo->count == fifo->capacity - 1)
- SDL_CondSignal(fifo->cond_change);
- SDL_UnlockMutex(fifo->lock);
- return res;
-}
-
-/**
- * Renderer info
- */
-typedef struct rdr_info
-{
- // Cookie passed to the renderer implementation callbacks
- void *cookie;
- // Callback to create the renderer
- void* (*create_renderer)(void *data);
- // Callback to destroy the renderer
- void (*destroy_renderer)(void *cookie);
- // Callback to the render function that renders a prevously sent frame
- void (*render)(void *cookie, const Dav1dPlaySettings *settings);
- // Callback to the send frame function
- int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
- const Dav1dPlaySettings *settings);
- // Callback for alloc/release pictures (optional)
- int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
- void (*release_pic)(Dav1dPicture *pic, void *cookie);
-} Dav1dPlayRenderInfo;
-
-#ifdef HAVE_PLACEBO_VULKAN
-
-#include <libplacebo/renderer.h>
-#include <libplacebo/utils/upload.h>
-#include <libplacebo/vulkan.h>
-#include <SDL_vulkan.h>
-
-
-/**
- * Renderer context for libplacebo
- */
-typedef struct renderer_priv_ctx
-{
- // Placebo context
- struct pl_context *ctx;
- // Placebo renderer
- struct pl_renderer *renderer;
- // Placebo Vulkan handle
- const struct pl_vulkan *vk;
- // Placebo Vulkan instance
- const struct pl_vk_inst *vk_inst;
- // Vulkan surface
- VkSurfaceKHR surf;
- // Placebo swapchain
- const struct pl_swapchain *swapchain;
- // Lock protecting access to the texture
- SDL_mutex *lock;
- // Planes to render
- struct pl_plane y_plane;
- struct pl_plane u_plane;
- struct pl_plane v_plane;
- // Textures to render
- const struct pl_tex *y_tex;
- const struct pl_tex *u_tex;
- const struct pl_tex *v_tex;
-} Dav1dPlayRendererPrivateContext;
-
-static void *placebo_renderer_create(void *data)
-{
- // Alloc
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
- if (rd_priv_ctx == NULL) {
- return NULL;
- }
-
- // Init libplacebo
- rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
- .log_cb = pl_log_color,
-#ifndef NDEBUG
- .log_level = PL_LOG_DEBUG,
-#else
- .log_level = PL_LOG_WARN,
-#endif
- });
- if (rd_priv_ctx->ctx == NULL) {
- free(rd_priv_ctx);
- return NULL;
- }
-
- // Create Mutex
- rd_priv_ctx->lock = SDL_CreateMutex();
- if (rd_priv_ctx->lock == NULL) {
- fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
- pl_context_destroy(&(rd_priv_ctx->ctx));
- free(rd_priv_ctx);
- return NULL;
- }
-
- // Init Vulkan
- struct pl_vk_inst_params iparams = pl_vk_inst_default_params;
-
- SDL_Window *sdlwin = data;
-
- unsigned num = 0;
- if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) {
- fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError());
- exit(1);
- }
-
- iparams.extensions = malloc(num * sizeof(const char *));
- iparams.num_extensions = num;
- assert(iparams.extensions);
-
- SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, iparams.extensions);
- if (!ok) {
- fprintf(stderr, "Failed getting Vk instance extensions\n");
- exit(1);
- }
-
- if (num > 0) {
- printf("Requesting %d additional Vulkan extensions:\n", num);
- for (unsigned i = 0; i < num; i++)
- printf(" %s\n", iparams.extensions[i]);
- }
-
- rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams);
- if (!rd_priv_ctx->vk_inst) {
- fprintf(stderr, "Failed creating Vulkan instance!\n");
- exit(1);
- }
- free(iparams.extensions);
-
- if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) {
- fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError());
- exit(1);
- }
-
- struct pl_vulkan_params params = pl_vulkan_default_params;
- params.instance = rd_priv_ctx->vk_inst->instance;
- params.surface = rd_priv_ctx->surf;
- params.allow_software = true;
-
- rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params);
- if (!rd_priv_ctx->vk) {
- fprintf(stderr, "Failed creating vulkan device!\n");
- exit(2);
- }
-
- // Create swapchain
- rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk,
- &(struct pl_vulkan_swapchain_params) {
- .surface = rd_priv_ctx->surf,
- .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR,
- });
-
- if (!rd_priv_ctx->swapchain) {
- fprintf(stderr, "Failed creating vulkan swapchain!\n");
- exit(2);
- }
-
- int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
- if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
- fprintf(stderr, "Failed resizing vulkan swapchain!\n");
- exit(2);
- }
-
- if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
- printf("Note: window dimensions differ (got %dx%d)\n", w, h);
-
- rd_priv_ctx->y_tex = NULL;
- rd_priv_ctx->u_tex = NULL;
- rd_priv_ctx->v_tex = NULL;
-
- rd_priv_ctx->renderer = NULL;
-
- return rd_priv_ctx;
-}
-
-static void placebo_renderer_destroy(void *cookie)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- pl_renderer_destroy(&(rd_priv_ctx->renderer));
- pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_tex));
- pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_tex));
- pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_tex));
- pl_swapchain_destroy(&(rd_priv_ctx->swapchain));
- pl_vulkan_destroy(&(rd_priv_ctx->vk));
- vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
- pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
- pl_context_destroy(&(rd_priv_ctx->ctx));
-}
-
-static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- SDL_LockMutex(rd_priv_ctx->lock);
- if (rd_priv_ctx->y_tex == NULL) {
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return;
- }
-
- // Prepare rendering
- if (rd_priv_ctx->renderer == NULL) {
- rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->vk->gpu);
- }
-
- struct pl_swapchain_frame frame;
- bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame);
- if (!ok) {
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return;
- }
-
- const struct pl_tex *img = rd_priv_ctx->y_plane.texture;
- struct pl_image image = {
- .num_planes = 3,
- .planes = { rd_priv_ctx->y_plane, rd_priv_ctx->u_plane, rd_priv_ctx->v_plane },
- .repr = pl_color_repr_hdtv,
- .color = pl_color_space_unknown,
- .width = img->params.w,
- .height = img->params.h,
- };
-
- struct pl_render_params render_params = {0};
- if (settings->highquality)
- render_params = pl_render_default_params;
-
- struct pl_render_target target;
- pl_render_target_from_swapchain(&target, &frame);
- target.profile = (struct pl_icc_profile) {
- .data = NULL,
- .len = 0,
- };
-
- if (!pl_render_image(rd_priv_ctx->renderer, &image, &target, &render_params)) {
- fprintf(stderr, "Failed rendering frame!\n");
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return;
- }
-
- ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
- if (!ok) {
- fprintf(stderr, "Failed submitting frame!\n");
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return;
- }
-
- pl_swapchain_swap_buffers(rd_priv_ctx->swapchain);
- SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
- const Dav1dPlaySettings *settings)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- SDL_LockMutex(rd_priv_ctx->lock);
-
- if (dav1d_pic == NULL) {
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return 0;
- }
-
- int width = dav1d_pic->p.w;
- int height = dav1d_pic->p.h;
-
- enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
-
- if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
- fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
- exit(50);
- }
-
- struct pl_plane_data data_y = {
- .type = PL_FMT_UNORM,
- .width = width,
- .height = height,
- .pixel_stride = 1,
- .row_stride = dav1d_pic->stride[0],
- .component_size = {8},
- .component_map = {0},
- };
-
- struct pl_plane_data data_u = {
- .type = PL_FMT_UNORM,
- .width = width/2,
- .height = height/2,
- .pixel_stride = 1,
- .row_stride = dav1d_pic->stride[1],
- .component_size = {8},
- .component_map = {1},
- };
-
- struct pl_plane_data data_v = {
- .type = PL_FMT_UNORM,
- .width = width/2,
- .height = height/2,
- .pixel_stride = 1,
- .row_stride = dav1d_pic->stride[1],
- .component_size = {8},
- .component_map = {2},
- };
-
- if (settings->zerocopy) {
- const struct pl_buf *buf = dav1d_pic->allocator_data;
- assert(buf);
- data_y.buf = data_u.buf = data_v.buf = buf;
- data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
- data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
- data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
- } else {
- data_y.pixels = dav1d_pic->data[0];
- data_u.pixels = dav1d_pic->data[1];
- data_v.pixels = dav1d_pic->data[2];
- }
-
- bool ok = true;
- ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
- ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
- ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_plane), &(rd_priv_ctx->v_tex), &data_v);
-
- pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->u_plane.shift_x, &rd_priv_ctx->u_plane.shift_y);
- pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->v_plane.shift_x, &rd_priv_ctx->v_plane.shift_y);
-
- if (!ok) {
- fprintf(stderr, "Failed uploading planes!\n");
- }
-
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return !ok;
-}
-
-// Align to power of 2
-#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
-
-static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
- SDL_LockMutex(rd_priv_ctx->lock);
-
- const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
- int ret = DAV1D_ERR(ENOMEM);
-
- // Copied from dav1d_default_picture_alloc
- const int hbd = p->p.bpc > 8;
- const int aligned_w = ALIGN2(p->p.w, 128);
- const int aligned_h = ALIGN2(p->p.h, 128);
- const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
- const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
- p->stride[0] = aligned_w << hbd;
- p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
-
- // Align strides up to multiples of the GPU performance hints
- p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
- p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
-
- // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2)
- size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
- const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
- const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
-
- // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
- // even in the case that the driver gives us insane alignments
- const size_t pic_size = y_sz + 2 * uv_sz;
- const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
-
- // Validate size limitations
- if (total_size > gpu->limits.max_xfer_size) {
- printf("alloc of %zu bytes exceeds limits\n", total_size);
- goto err;
- }
-
- const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
- .type = PL_BUF_TEX_TRANSFER,
- .host_mapped = true,
- .size = total_size,
- .memory_type = PL_BUF_MEM_HOST,
- .user_data = p,
- });
-
- if (!buf) {
- printf("alloc of GPU mapped buffer failed\n");
- goto err;
- }
-
- assert(buf->data);
- uintptr_t base = (uintptr_t) buf->data, data[3];
- data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
- data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
- data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
-
- // Sanity check offset alignment for the sake of debugging
- if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
- data[1] - base != ALIGN2(data[1] - base, off_align) ||
- data[2] - base != ALIGN2(data[2] - base, off_align))
- {
- printf("GPU buffer horribly misaligned, expect slowdown!\n");
- }
-
- p->allocator_data = (void *) buf;
- p->data[0] = (void *) data[0];
- p->data[1] = (void *) data[1];
- p->data[2] = (void *) data[2];
- ret = 0;
-
- // fall through
-err:
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return ret;
-}
-
-static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
- assert(pic->allocator_data);
-
- SDL_LockMutex(rd_priv_ctx->lock);
- const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
- pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
- SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static const Dav1dPlayRenderInfo renderer_info = {
- .create_renderer = placebo_renderer_create,
- .destroy_renderer = placebo_renderer_destroy,
- .render = placebo_render,
- .update_frame = placebo_upload_planes,
- .alloc_pic = placebo_alloc_pic,
- .release_pic = placebo_release_pic,
-};
-
-#else
-
-/**
- * Renderer context for SDL
- */
-typedef struct renderer_priv_ctx
-{
- // SDL renderer
- SDL_Renderer *renderer;
- // Lock protecting access to the texture
- SDL_mutex *lock;
- // Texture to render
- SDL_Texture *tex;
-} Dav1dPlayRendererPrivateContext;
-
-static void *sdl_renderer_create(void *data)
-{
- SDL_Window *win = data;
-
- // Alloc
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
- if (rd_priv_ctx == NULL) {
- return NULL;
- }
-
- // Create renderer
- rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
- // Set scale quality
- SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear");
-
- // Create Mutex
- rd_priv_ctx->lock = SDL_CreateMutex();
- if (rd_priv_ctx->lock == NULL) {
- fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
- free(rd_priv_ctx);
- return NULL;
- }
-
- rd_priv_ctx->tex = NULL;
-
- return rd_priv_ctx;
-}
-
-static void sdl_renderer_destroy(void *cookie)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- SDL_DestroyRenderer(rd_priv_ctx->renderer);
- SDL_DestroyMutex(rd_priv_ctx->lock);
- free(rd_priv_ctx);
-}
-
-static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- SDL_LockMutex(rd_priv_ctx->lock);
-
- if (rd_priv_ctx->tex == NULL) {
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return;
- }
-
- // Display the frame
- SDL_RenderClear(rd_priv_ctx->renderer);
- SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL);
- SDL_RenderPresent(rd_priv_ctx->renderer);
-
- SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
- const Dav1dPlaySettings *settings)
-{
- Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
- assert(rd_priv_ctx != NULL);
-
- SDL_LockMutex(rd_priv_ctx->lock);
-
- if (dav1d_pic == NULL) {
- rd_priv_ctx->tex = NULL;
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return 0;
- }
-
- int width = dav1d_pic->p.w;
- int height = dav1d_pic->p.h;
- int tex_w = width;
- int tex_h = height;
-
- enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
-
- if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
- fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
- exit(50);
- }
-
- SDL_Texture *texture = rd_priv_ctx->tex;
- if (texture != NULL) {
- SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h);
- if (tex_w != width || tex_h != height) {
- SDL_DestroyTexture(texture);
- texture = NULL;
- }
- }
-
- if (texture == NULL) {
- texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
- SDL_TEXTUREACCESS_STREAMING, width, height);
- }
-
- SDL_UpdateYUVTexture(texture, NULL,
- dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y
- dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U
- dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V
- );
-
- rd_priv_ctx->tex = texture;
- SDL_UnlockMutex(rd_priv_ctx->lock);
- return 0;
-}
-
-static const Dav1dPlayRenderInfo renderer_info = {
- .create_renderer = sdl_renderer_create,
- .destroy_renderer = sdl_renderer_destroy,
- .render = sdl_render,
- .update_frame = sdl_update_texture
-};
-
-#endif
+// Selected renderer callbacks and cookie
+static const Dav1dPlayRenderInfo *renderer_info = { NULL };
/**
* Render context structure
@@ -722,8 +53,6 @@ typedef struct render_context
Dav1dPlaySettings settings;
Dav1dSettings lib_settings;
- // Renderer callbacks
- Dav1dPlayRenderInfo *renderer_info;
// Renderer private data (passed to callbacks)
void *rd_priv;
@@ -768,7 +97,9 @@ static void dp_settings_print_usage(const char *const app,
" --tilethreads $num: number of tile threads (default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
- " --version/-v: print version and exit\n");
+ " --gpugrain/-g: enable GPU grain synthesis\n"
+ " --version/-v: print version and exit\n"
+ " --renderer/-r: select renderer backend (default: auto)\n");
exit(1);
}
@@ -791,7 +122,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
// Short options
- static const char short_opts[] = "i:vuz";
+ static const char short_opts[] = "i:vuzgr:";
enum {
ARG_FRAME_THREADS = 256,
@@ -808,6 +139,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
+ { "gpugrain", 0, NULL, 'g' },
+ { "renderer", 0, NULL, 'r'},
{ NULL, 0, NULL, 0 },
};
@@ -824,15 +157,15 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
break;
case ARG_HIGH_QUALITY:
settings->highquality = true;
-#ifndef HAVE_PLACEBO_VULKAN
- fprintf(stderr, "warning: --highquality requires libplacebo\n");
-#endif
break;
case 'z':
settings->zerocopy = true;
-#ifndef HAVE_PLACEBO_VULKAN
- fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
-#endif
+ break;
+ case 'g':
+ settings->gpugrain = true;
+ break;
+ case 'r':
+ settings->renderer_name = optarg;
break;
case ARG_FRAME_THREADS:
lib_settings->n_frame_threads =
@@ -852,6 +185,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
"Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
if (!settings->inputfile)
dp_settings_print_usage(argv[0], "Input file (-i/--input) is required");
+ if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0)
+ settings->renderer_name = NULL;
}
/**
@@ -861,7 +196,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx)
{
assert(rd_ctx != NULL);
- renderer_info.destroy_renderer(rd_ctx->rd_priv);
+ renderer_info->destroy_renderer(rd_ctx->rd_priv);
dp_fifo_destroy(rd_ctx->fifo);
SDL_DestroyMutex(rd_ctx->lock);
free(rd_ctx);
@@ -873,7 +208,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx)
* \note The Dav1dPlayRenderContext must be destroyed
* again by using dp_rd_ctx_destroy.
*/
-static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
+static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
{
Dav1dPlayRenderContext *rd_ctx;
@@ -907,7 +242,22 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
return NULL;
}
- rd_ctx->rd_priv = renderer_info.create_renderer(rd_data);
+ // Parse and validate arguments
+ dav1d_default_settings(&rd_ctx->lib_settings);
+ memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
+ dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+ // Select renderer
+ renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name);
+
+ if (renderer_info == NULL) {
+ printf("No suitable rendered matching %s found.\n",
+ (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto");
+ } else {
+ printf("Using %s renderer\n", renderer_info->name);
+ }
+
+ rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL;
if (rd_ctx->rd_priv == NULL) {
SDL_DestroyMutex(rd_ctx->lock);
dp_fifo_destroy(rd_ctx->fifo);
@@ -915,9 +265,6 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
return NULL;
}
- dav1d_default_settings(&rd_ctx->lib_settings);
- memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
-
rd_ctx->last_pts = 0;
rd_ctx->last_ticks = 0;
rd_ctx->current_pts = 0;
@@ -949,7 +296,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
Dav1dPicture *dav1d_pic)
{
- renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
+ renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
rd_ctx->current_pts = dav1d_pic->m.timestamp;
}
@@ -1004,7 +351,7 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
}
- renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);
+ renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
rd_ctx->last_ticks = SDL_GetTicks();
}
@@ -1152,7 +499,6 @@ cleanup:
int main(int argc, char **argv)
{
SDL_Thread *decoder_thread;
- SDL_Window *win = NULL;
// Check for version mismatch between library and tool
const char *version = dav1d_version();
@@ -1166,34 +512,30 @@ int main(int argc, char **argv)
if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
return 10;
- // Create Window and Renderer
- int window_flags = SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI;
-#ifdef HAVE_PLACEBO_VULKAN
- window_flags |= SDL_WINDOW_VULKAN;
-#endif
- win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
- WINDOW_WIDTH, WINDOW_HEIGHT, window_flags);
- SDL_SetWindowResizable(win, SDL_TRUE);
-
// Create render context
- Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(win);
+ Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv);
if (rd_ctx == NULL) {
fprintf(stderr, "Failed creating render context\n");
return 5;
}
- // Parse and validate arguments
- dp_rd_ctx_parse_args(rd_ctx, argc, argv);
-
if (rd_ctx->settings.zerocopy) {
- if (renderer_info.alloc_pic) {
+ if (renderer_info->alloc_pic) {
rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
.cookie = rd_ctx->rd_priv,
- .alloc_picture_callback = renderer_info.alloc_pic,
- .release_picture_callback = renderer_info.release_pic,
+ .alloc_picture_callback = renderer_info->alloc_pic,
+ .release_picture_callback = renderer_info->release_pic,
};
} else {
- fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
+ fprintf(stderr, "--zerocopy unsupported by selected renderer\n");
+ }
+ }
+
+ if (rd_ctx->settings.gpugrain) {
+ if (renderer_info->supports_gpu_grain) {
+ rd_ctx->lib_settings.apply_grain = 0;
+ } else {
+ fprintf(stderr, "--gpugrain unsupported by selected renderer\n");
}
}
@@ -1207,6 +549,10 @@ int main(int argc, char **argv)
if (SDL_WaitEvent(&e)) {
if (e.type == SDL_QUIT) {
dp_rd_ctx_request_shutdown(rd_ctx);
+ } else if (e.type == SDL_WINDOWEVENT) {
+ if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
+ // TODO: Handle window resizes
+ }
} else if (e.type == rd_ctx->renderer_event_type) {
if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
// Dequeue frame and update the render context with it
@@ -1232,7 +578,6 @@ int main(int argc, char **argv)
SDL_WaitThread(decoder_thread, &decoder_ret);
dp_rd_ctx_destroy(rd_ctx);
- SDL_DestroyWindow(win);
return decoder_ret;
}
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c
new file mode 100644
index 00000000000..243d2e933bc
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <SDL.h>
+#include <assert.h>
+
+#include "dp_fifo.h"
+
+// FIFO structure
+struct dp_fifo
+{
+ SDL_mutex *lock;
+ SDL_cond *cond_change;
+ size_t capacity;
+ size_t count;
+ void **entries;
+};
+
+
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
+{
+ Dav1dPlayPtrFifo *fifo;
+
+ assert(capacity > 0);
+ if (capacity <= 0)
+ return NULL;
+
+ fifo = malloc(sizeof(*fifo));
+ if (fifo == NULL)
+ return NULL;
+
+ fifo->capacity = capacity;
+ fifo->count = 0;
+
+ fifo->lock = SDL_CreateMutex();
+ if (fifo->lock == NULL) {
+ free(fifo);
+ return NULL;
+ }
+ fifo->cond_change = SDL_CreateCond();
+ if (fifo->cond_change == NULL) {
+ SDL_DestroyMutex(fifo->lock);
+ free(fifo);
+ return NULL;
+ }
+
+ fifo->entries = calloc(capacity, sizeof(void*));
+ if (fifo->entries == NULL) {
+ dp_fifo_destroy(fifo);
+ return NULL;
+ }
+
+ return fifo;
+}
+
+// Destroy FIFO
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
+{
+ assert(fifo->count == 0);
+ SDL_DestroyMutex(fifo->lock);
+ SDL_DestroyCond(fifo->cond_change);
+ free(fifo->entries);
+ free(fifo);
+}
+
+// Push to FIFO
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
+{
+ SDL_LockMutex(fifo->lock);
+ while (fifo->count == fifo->capacity)
+ SDL_CondWait(fifo->cond_change, fifo->lock);
+ fifo->entries[fifo->count++] = element;
+ if (fifo->count == 1)
+ SDL_CondSignal(fifo->cond_change);
+ SDL_UnlockMutex(fifo->lock);
+}
+
+// Helper that shifts the FIFO array
+static void *dp_fifo_array_shift(void **arr, size_t len)
+{
+ void *shifted_element = arr[0];
+ for (size_t i = 1; i < len; ++i)
+ arr[i-1] = arr[i];
+ return shifted_element;
+}
+
+// Get item from FIFO
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
+{
+ SDL_LockMutex(fifo->lock);
+ while (fifo->count == 0)
+ SDL_CondWait(fifo->cond_change, fifo->lock);
+ void *res = dp_fifo_array_shift(fifo->entries, fifo->count--);
+ if (fifo->count == fifo->capacity - 1)
+ SDL_CondSignal(fifo->cond_change);
+ SDL_UnlockMutex(fifo->lock);
+ return res;
+}
+
+
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h
new file mode 100644
index 00000000000..a94b089b20c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Dav1dPlay FIFO helper
+ */
+
+typedef struct dp_fifo Dav1dPlayPtrFifo;
+
+/* Create a FIFO
+ *
+ * Creates a FIFO with the given capacity.
+ * If the capacity is reached, new inserts into the FIFO
+ * will block until enough space is available again.
+ */
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity);
+
+/* Destroy a FIFO
+ *
+ * The FIFO must be empty before it is destroyed!
+ */
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo);
+
+/* Shift FIFO
+ *
+ * Return the first item from the FIFO, thereby removing it from
+ * the FIFO and making room for new entries.
+ */
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
+
+/* Push to FIFO
+ *
+ * Add an item to the end of the FIFO.
+ * If the FIFO is full, this call will block until there is again enough
+ * space in the FIFO, so calling this from the "consumer" thread if no
+ * other thread will call dp_fifo_shift will lead to a deadlock.
+ */
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);
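For context, the dp_fifo API introduced above is a small blocking pointer FIFO: dp_fifo_push blocks while the FIFO is full and dp_fifo_shift blocks while it is empty, which is how dav1dplay hands decoded pictures from the decoder thread to the main thread. The following minimal usage sketch is not part of the patch; it assumes SDL2 plus the dp_fifo.h declared above, and the integer payload and NULL sentinel are purely illustrative.

    /* Usage sketch: one producer thread pushes heap-allocated items and a
     * NULL sentinel; the main thread drains the FIFO with dp_fifo_shift. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <SDL.h>
    #include "dp_fifo.h"

    static int producer(void *arg)
    {
        Dav1dPlayPtrFifo *fifo = arg;
        for (int i = 0; i < 8; i++) {
            int *item = malloc(sizeof(*item));
            *item = i;
            dp_fifo_push(fifo, item);   // blocks while the FIFO is full
        }
        dp_fifo_push(fifo, NULL);       // sentinel: no more items
        return 0;
    }

    int main(int argc, char *argv[])
    {
        (void)argc; (void)argv;
        Dav1dPlayPtrFifo *fifo = dp_fifo_create(4);
        if (!fifo)
            return 1;

        SDL_Thread *t = SDL_CreateThread(producer, "fifo-producer", fifo);

        void *item;
        while ((item = dp_fifo_shift(fifo)) != NULL) {  // blocks while empty
            printf("got %d\n", *(int *)item);
            free(item);
        }

        SDL_WaitThread(t, NULL);
        dp_fifo_destroy(fifo);          // FIFO must be drained before destroy
        return 0;
    }

The sentinel matters because, as the header warns, the FIFO must be empty before dp_fifo_destroy is called (it asserts count == 0).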
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h
new file mode 100644
index 00000000000..4c6f2954f7a
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "dav1d/dav1d.h"
+
+#include <SDL.h>
+#ifdef HAVE_PLACEBO
+# include <libplacebo/config.h>
+#endif
+
+// Check libplacebo Vulkan rendering
+#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
+# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
+# define HAVE_RENDERER_PLACEBO
+# define HAVE_PLACEBO_VULKAN
+# endif
+#endif
+
+// Check libplacebo OpenGL rendering
+#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
+# define HAVE_RENDERER_PLACEBO
+# define HAVE_PLACEBO_OPENGL
+#endif
+
+/**
+ * Settings structure
+ * Hold all settings available for the player,
+ * this is usually filled by parsing arguments
+ * from the console.
+ */
+typedef struct {
+ const char *inputfile;
+ const char *renderer_name;
+ int highquality;
+ int untimed;
+ int zerocopy;
+ int gpugrain;
+} Dav1dPlaySettings;
+
+#define WINDOW_WIDTH 910
+#define WINDOW_HEIGHT 512
+
+#define DAV1D_EVENT_NEW_FRAME 1
+#define DAV1D_EVENT_DEC_QUIT 2
+
+/**
+ * Renderer info
+ */
+typedef struct rdr_info
+{
+ // Renderer name
+ const char *name;
+ // Cookie passed to the renderer implementation callbacks
+ void *cookie;
+ // Callback to create the renderer
+ void* (*create_renderer)();
+ // Callback to destroy the renderer
+ void (*destroy_renderer)(void *cookie);
+ // Callback to the render function that renders a prevously sent frame
+ void (*render)(void *cookie, const Dav1dPlaySettings *settings);
+ // Callback to the send frame function
+ int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
+ const Dav1dPlaySettings *settings);
+ // Callback for alloc/release pictures (optional)
+ int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
+ void (*release_pic)(Dav1dPicture *pic, void *cookie);
+ // Whether or not this renderer can apply on-GPU film grain synthesis
+ int supports_gpu_grain;
+} Dav1dPlayRenderInfo;
+
+extern const Dav1dPlayRenderInfo rdr_placebo_vk;
+extern const Dav1dPlayRenderInfo rdr_placebo_gl;
+extern const Dav1dPlayRenderInfo rdr_sdl;
+
+// Available renderes ordered by priority
+static const Dav1dPlayRenderInfo* const dp_renderers[] = {
+ &rdr_placebo_vk,
+ &rdr_placebo_gl,
+ &rdr_sdl,
+};
+
+static inline const Dav1dPlayRenderInfo *dp_get_renderer(const char *name)
+{
+ for (size_t i = 0; i < (sizeof(dp_renderers)/sizeof(*dp_renderers)); ++i)
+ {
+ if (dp_renderers[i]->name == NULL)
+ continue;
+
+ if (name == NULL || strcmp(name, dp_renderers[i]->name) == 0) {
+ return dp_renderers[i];
+ }
+ }
+ return NULL;
+}
+
+static inline SDL_Window *dp_create_sdl_window(int window_flags)
+{
+ SDL_Window *win;
+ window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI;
+
+ win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
+ WINDOW_WIDTH, WINDOW_HEIGHT, window_flags);
+ SDL_SetWindowResizable(win, SDL_TRUE);
+
+ return win;
+}
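The Dav1dPlayRenderInfo table above is the whole contract a dav1dplay rendering backend has to satisfy, and dp_get_renderer() selects a backend either by the --renderer name or, for "auto", by taking the first entry of dp_renderers[] whose name is non-NULL. The sketch below of a hypothetical minimal "null" backend is for illustration only; it is not part of the patch, and a real backend would also need an entry in the dp_renderers[] priority list and a place in the dav1dplay build next to dp_renderer_sdl.c.

    /* Sketch of a minimal backend under the Dav1dPlayRenderInfo interface. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "dp_renderer.h"

    static void *null_renderer_create(void)
    {
        // Cookie handed back to every other callback
        return calloc(1, 1);
    }

    static void null_renderer_destroy(void *cookie)
    {
        free(cookie);
    }

    static int null_update_frame(void *cookie, Dav1dPicture *dav1d_pic,
                                 const Dav1dPlaySettings *settings)
    {
        (void)cookie; (void)settings;
        if (!dav1d_pic)
            return 0;
        printf("received %dx%d frame\n", dav1d_pic->p.w, dav1d_pic->p.h);
        return 0;
    }

    static void null_render(void *cookie, const Dav1dPlaySettings *settings)
    {
        (void)cookie; (void)settings;   // nothing to present
    }

    const Dav1dPlayRenderInfo rdr_null = {
        .name             = "null",
        .create_renderer  = null_renderer_create,
        .destroy_renderer = null_renderer_destroy,
        .render           = null_render,
        .update_frame     = null_update_frame,
        // alloc_pic/release_pic omitted: --zerocopy unsupported
        .supports_gpu_grain = 0,
    };

Because dp_get_renderer(NULL) walks dp_renderers[] in order and skips entries with a NULL name, "auto" selection simply resolves to the first compiled-in backend, while --renderer sdl (stored in settings->renderer_name) selects one by name.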
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c
new file mode 100644
index 00000000000..beb1d42ad72
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#ifdef HAVE_RENDERER_PLACEBO
+#include <assert.h>
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/utils/upload.h>
+
+#ifdef HAVE_PLACEBO_VULKAN
+# include <libplacebo/vulkan.h>
+# include <SDL_vulkan.h>
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+# include <libplacebo/opengl.h>
+# include <SDL_opengl.h>
+#endif
+
+
+/**
+ * Renderer context for libplacebo
+ */
+typedef struct renderer_priv_ctx
+{
+ // SDL window
+ SDL_Window *win;
+ // Placebo context
+ struct pl_context *ctx;
+ // Placebo renderer
+ struct pl_renderer *renderer;
+#ifdef HAVE_PLACEBO_VULKAN
+ // Placebo Vulkan handle
+ const struct pl_vulkan *vk;
+ // Placebo Vulkan instance
+ const struct pl_vk_inst *vk_inst;
+ // Vulkan surface
+ VkSurfaceKHR surf;
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+ // Placebo OpenGL handle
+ const struct pl_opengl *gl;
+#endif
+ // Placebo GPU
+ const struct pl_gpu *gpu;
+ // Placebo swapchain
+ const struct pl_swapchain *swapchain;
+ // Lock protecting access to the texture
+ SDL_mutex *lock;
+ // Image to render, and planes backing them
+ struct pl_image image;
+ const struct pl_tex *plane_tex[3];
+} Dav1dPlayRendererPrivateContext;
+
+static Dav1dPlayRendererPrivateContext*
+ placebo_renderer_create_common(int window_flags)
+{
+ // Create Window
+ SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE);
+ if (sdlwin == NULL)
+ return NULL;
+
+ // Alloc
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
+ if (rd_priv_ctx == NULL) {
+ return NULL;
+ }
+
+ *rd_priv_ctx = (Dav1dPlayRendererPrivateContext) {0};
+ rd_priv_ctx->win = sdlwin;
+
+ // Init libplacebo
+ rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
+ .log_cb = pl_log_color,
+#ifndef NDEBUG
+ .log_level = PL_LOG_DEBUG,
+#else
+ .log_level = PL_LOG_WARN,
+#endif
+ });
+ if (rd_priv_ctx->ctx == NULL) {
+ free(rd_priv_ctx);
+ return NULL;
+ }
+
+ // Create Mutex
+ rd_priv_ctx->lock = SDL_CreateMutex();
+ if (rd_priv_ctx->lock == NULL) {
+ fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+ pl_context_destroy(&(rd_priv_ctx->ctx));
+ free(rd_priv_ctx);
+ return NULL;
+ }
+
+ return rd_priv_ctx;
+}
+
+#ifdef HAVE_PLACEBO_OPENGL
+static void *placebo_renderer_create_gl()
+{
+ SDL_Window *sdlwin = NULL;
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+
+ // Common init
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+ placebo_renderer_create_common(SDL_WINDOW_OPENGL);
+
+ if (rd_priv_ctx == NULL)
+ return NULL;
+ sdlwin = rd_priv_ctx->win;
+
+ // Init OpenGL
+ struct pl_opengl_params params = pl_opengl_default_params;
+# ifndef NDEBUG
+ params.debug = true;
+# endif
+
+ SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin);
+ SDL_GL_MakeCurrent(sdlwin, glcontext);
+
+ rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->ctx, &params);
+ if (!rd_priv_ctx->gl) {
+ fprintf(stderr, "Failed creating opengl device!\n");
+ exit(2);
+ }
+
+ rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl,
+ &(struct pl_opengl_swapchain_params) {
+ .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow,
+ .priv = sdlwin,
+ });
+
+ if (!rd_priv_ctx->swapchain) {
+ fprintf(stderr, "Failed creating opengl swapchain!\n");
+ exit(2);
+ }
+
+ int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+ SDL_GL_GetDrawableSize(sdlwin, &w, &h);
+
+ if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+ fprintf(stderr, "Failed resizing vulkan swapchain!\n");
+ exit(2);
+ }
+
+ rd_priv_ctx->gpu = rd_priv_ctx->gl->gpu;
+
+ if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+ printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+ return rd_priv_ctx;
+}
+#endif
+
+#ifdef HAVE_PLACEBO_VULKAN
+static void *placebo_renderer_create_vk()
+{
+ SDL_Window *sdlwin = NULL;
+
+ // Common init
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+ placebo_renderer_create_common(SDL_WINDOW_VULKAN);
+
+ if (rd_priv_ctx == NULL)
+ return NULL;
+ sdlwin = rd_priv_ctx->win;
+
+ // Init Vulkan
+ unsigned num = 0;
+ if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) {
+ fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError());
+ exit(1);
+ }
+
+ const char **extensions = malloc(num * sizeof(const char *));
+ assert(extensions);
+
+ SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions);
+ if (!ok) {
+ fprintf(stderr, "Failed getting Vk instance extensions\n");
+ exit(1);
+ }
+
+ if (num > 0) {
+ printf("Requesting %d additional Vulkan extensions:\n", num);
+ for (unsigned i = 0; i < num; i++)
+ printf(" %s\n", extensions[i]);
+ }
+
+ struct pl_vk_inst_params iparams = pl_vk_inst_default_params;
+ iparams.extensions = extensions;
+ iparams.num_extensions = num;
+
+ rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams);
+ if (!rd_priv_ctx->vk_inst) {
+ fprintf(stderr, "Failed creating Vulkan instance!\n");
+ exit(1);
+ }
+ free(extensions);
+
+ if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) {
+ fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError());
+ exit(1);
+ }
+
+ struct pl_vulkan_params params = pl_vulkan_default_params;
+ params.instance = rd_priv_ctx->vk_inst->instance;
+ params.surface = rd_priv_ctx->surf;
+ params.allow_software = true;
+
+ rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params);
+ if (!rd_priv_ctx->vk) {
+ fprintf(stderr, "Failed creating vulkan device!\n");
+ exit(2);
+ }
+
+ // Create swapchain
+ rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk,
+ &(struct pl_vulkan_swapchain_params) {
+ .surface = rd_priv_ctx->surf,
+ .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR,
+ });
+
+ if (!rd_priv_ctx->swapchain) {
+ fprintf(stderr, "Failed creating vulkan swapchain!\n");
+ exit(2);
+ }
+
+ int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+ if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+ fprintf(stderr, "Failed resizing vulkan swapchain!\n");
+ exit(2);
+ }
+
+ rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu;
+
+ if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+ printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+ return rd_priv_ctx;
+}
+#endif
+
+static void placebo_renderer_destroy(void *cookie)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ pl_renderer_destroy(&(rd_priv_ctx->renderer));
+ pl_swapchain_destroy(&(rd_priv_ctx->swapchain));
+ for (int i = 0; i < 3; i++)
+ pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));
+
+#ifdef HAVE_PLACEBO_VULKAN
+ if (rd_priv_ctx->vk) {
+ pl_vulkan_destroy(&(rd_priv_ctx->vk));
+ vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
+ pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
+ }
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+ if (rd_priv_ctx->gl)
+ pl_opengl_destroy(&(rd_priv_ctx->gl));
+#endif
+
+ SDL_DestroyWindow(rd_priv_ctx->win);
+
+ pl_context_destroy(&(rd_priv_ctx->ctx));
+}
+
+static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ SDL_LockMutex(rd_priv_ctx->lock);
+ if (!rd_priv_ctx->image.num_planes) {
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return;
+ }
+
+ // Prepare rendering
+ if (rd_priv_ctx->renderer == NULL) {
+ rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->gpu);
+ }
+
+ struct pl_swapchain_frame frame;
+ bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame);
+ if (!ok) {
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return;
+ }
+
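+ // A zero-initialized pl_render_params selects the cheapest rendering path;
+ // pl_render_default_params enables libplacebo's recommended (higher quality,
+ // but slower) defaults.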
+ struct pl_render_params render_params = {0};
+ if (settings->highquality)
+ render_params = pl_render_default_params;
+
+ struct pl_render_target target;
+ pl_render_target_from_swapchain(&target, &frame);
+ target.profile = (struct pl_icc_profile) {
+ .data = NULL,
+ .len = 0,
+ };
+
+#if PL_API_VER >= 66
+ pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0);
+ if (pl_render_target_partial(&target))
+ pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 });
+#endif
+
+ if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) {
+ fprintf(stderr, "Failed rendering frame!\n");
+ pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 });
+ }
+
+ ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
+ if (!ok) {
+ fprintf(stderr, "Failed submitting frame!\n");
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return;
+ }
+
+ pl_swapchain_swap_buffers(rd_priv_ctx->swapchain);
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
+ const Dav1dPlaySettings *settings)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ SDL_LockMutex(rd_priv_ctx->lock);
+
+ if (dav1d_pic == NULL) {
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return 0;
+ }
+
+ int width = dav1d_pic->p.w;
+ int height = dav1d_pic->p.h;
+ int sub_x = 0, sub_y = 0;
+ int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up
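+ // e.g. bpc == 10 -> 2 bytes per sample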
+ enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
+
+ struct pl_image *image = &rd_priv_ctx->image;
+ *image = (struct pl_image) {
+ .num_planes = 3,
+ .width = width,
+ .height = height,
+ .src_rect = {0, 0, width, height},
+
+ .repr = {
+ .bits = {
+ .sample_depth = bytes * 8,
+ .color_depth = dav1d_pic->p.bpc,
+ },
+ },
+ };
+
+ // Figure out the correct plane dimensions/count
+ switch (dav1d_pic->p.layout) {
+ case DAV1D_PIXEL_LAYOUT_I400:
+ image->num_planes = 1;
+ break;
+ case DAV1D_PIXEL_LAYOUT_I420:
+ sub_x = sub_y = 1;
+ break;
+ case DAV1D_PIXEL_LAYOUT_I422:
+ sub_x = 1;
+ break;
+ case DAV1D_PIXEL_LAYOUT_I444:
+ break;
+ }
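+ // sub_x/sub_y are log2 subsampling factors: the chroma planes span
+ // (width >> sub_x) by (height >> sub_y) samples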
+
+ // Set the right colorspace metadata etc.
+ switch (dav1d_pic->seq_hdr->pri) {
+ case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
+ case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break;
+ case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
+ case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+ case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+ case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
+
+ case DAV1D_COLOR_PRI_XYZ:
+ // Handled below
+ assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
+ break;
+
+ default:
+ printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
+ "may be very incorrect\n", dav1d_pic->seq_hdr->pri);
+ break;
+ }
+
+ switch (dav1d_pic->seq_hdr->trc) {
+ case DAV1D_TRC_BT709:
+ case DAV1D_TRC_BT470M:
+ case DAV1D_TRC_BT470BG:
+ case DAV1D_TRC_BT601:
+ case DAV1D_TRC_SMPTE240:
+ case DAV1D_TRC_BT2020_10BIT:
+ case DAV1D_TRC_BT2020_12BIT:
+ // These all map to the effective "SDR" CRT-based EOTF, BT.1886
+ image->color.transfer = PL_COLOR_TRC_BT_1886;
+ break;
+
+ case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
+ case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break;
+ case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break;
+ case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break;
+ case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break;
+
+ default:
+ printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
+ "may be very incorrect\n", dav1d_pic->seq_hdr->trc);
+ break;
+ }
+
+ switch (dav1d_pic->seq_hdr->mtrx) {
+ case DAV1D_MC_IDENTITY:
+ // This is going to be either RGB or XYZ
+ if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
+ image->repr.sys = PL_COLOR_SYSTEM_XYZ;
+ } else {
+ image->repr.sys = PL_COLOR_SYSTEM_RGB;
+ }
+ break;
+
+ case DAV1D_MC_UNKNOWN:
+ // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so guess an appropriate YCbCr matrix from the frame dimensions instead
+ image->repr.sys = pl_color_system_guess_ycbcr(width, height);
+ break;
+
+ case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
+ case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
+ case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
+ case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
+ case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
+ case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
+
+ case DAV1D_MC_ICTCP:
+ // This one is split up based on the actual HDR curve in use
+ if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
+ image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
+ } else {
+ image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
+ }
+ break;
+
+ default:
+ printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
+ "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
+ break;
+ }
+
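+ // AV1 color_range: 1 = full ("PC") range, 0 = limited ("TV") range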
+ if (dav1d_pic->seq_hdr->color_range) {
+ image->repr.levels = PL_COLOR_LEVELS_PC;
+ } else {
+ image->repr.levels = PL_COLOR_LEVELS_TV;
+ }
+
+ switch (dav1d_pic->seq_hdr->chr) {
+ case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break;
+ case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break;
+ case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break;
+ }
+
+#if PL_API_VER >= 63
+ if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
+ Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
+ struct pl_av1_grain_data *dst = &image->av1_grain;
+ *dst = (struct pl_av1_grain_data) {
+ .grain_seed = src->seed,
+ .num_points_y = src->num_y_points,
+ .chroma_scaling_from_luma = src->chroma_scaling_from_luma,
+ .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
+ .scaling_shift = src->scaling_shift,
+ .ar_coeff_lag = src->ar_coeff_lag,
+ .ar_coeff_shift = src->ar_coeff_shift,
+ .grain_scale_shift = src->grain_scale_shift,
+ .uv_mult = { src->uv_mult[0], src->uv_mult[1] },
+ .uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
+ .uv_offset = { src->uv_offset[0], src->uv_offset[1] },
+ .overlap = src->overlap_flag,
+ };
+
+ assert(sizeof(dst->points_y) == sizeof(src->y_points));
+ assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
+ assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
+ memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
+ memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
+ memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
+
+ // The two ar_coeffs_uv arrays have different row sizes (one is padded for alignment), so copy element by element instead of using memcpy
+ for (int c = 0; c < 2; c++) {
+ for (int i = 0; i < 25; i++)
+ dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
+ }
+ }
+#endif
+
+ // Upload the actual planes
+ struct pl_plane_data data[3] = {
+ {
+ // Y plane
+ .type = PL_FMT_UNORM,
+ .width = width,
+ .height = height,
+ .pixel_stride = bytes,
+ .row_stride = dav1d_pic->stride[0],
+ .component_size = {bytes * 8},
+ .component_map = {0},
+ }, {
+ // U plane
+ .type = PL_FMT_UNORM,
+ .width = width >> sub_x,
+ .height = height >> sub_y,
+ .pixel_stride = bytes,
+ .row_stride = dav1d_pic->stride[1],
+ .component_size = {bytes * 8},
+ .component_map = {1},
+ }, {
+ // V plane
+ .type = PL_FMT_UNORM,
+ .width = width >> sub_x,
+ .height = height >> sub_y,
+ .pixel_stride = bytes,
+ .row_stride = dav1d_pic->stride[1],
+ .component_size = {bytes * 8},
+ .component_map = {2},
+ },
+ };
+
+ bool ok = true;
+
+ for (int i = 0; i < image->num_planes; i++) {
+ if (settings->zerocopy) {
+ const struct pl_buf *buf = dav1d_pic->allocator_data;
+ assert(buf);
+ data[i].buf = buf;
+ data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data;
+ } else {
+ data[i].pixels = dav1d_pic->data[i];
+ }
+
+ ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
+ }
+
+ // Apply the correct chroma plane shift. This has to be done after pl_upload_plane
+#if PL_API_VER >= 67
+ pl_image_set_chroma_location(image, chroma_loc);
+#else
+ pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
+ pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
+#endif
+
+ if (!ok) {
+ fprintf(stderr, "Failed uploading planes!\n");
+ *image = (struct pl_image) {0};
+ }
+
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return !ok;
+}
+
+// Round x up to the next multiple of align (align must be a power of 2)
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
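+// e.g. ALIGN2(100, 64) == 128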
+
+static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+ SDL_LockMutex(rd_priv_ctx->lock);
+
+ const struct pl_gpu *gpu = rd_priv_ctx->gpu;
+ int ret = DAV1D_ERR(ENOMEM);
+
+ // Copied from dav1d_default_picture_alloc
+ const int hbd = p->p.bpc > 8;
+ const int aligned_w = ALIGN2(p->p.w, 128);
+ const int aligned_h = ALIGN2(p->p.h, 128);
+ const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ p->stride[0] = aligned_w << hbd;
+ p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+ // Align strides up to multiples of the GPU performance hints
+ p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
+ p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
+
+ // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
+ size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+ const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
+ const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+ // The extra DAV1D_PICTURE_ALIGNMENTs are there to brute-force plane alignment,
+ // even if the driver reports unusually small or odd alignments
+ const size_t pic_size = y_sz + 2 * uv_sz;
+ const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
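+ // (each of the three aligned plane offsets below can waste at most
+ // DAV1D_PICTURE_ALIGNMENT - 1 bytes, so 4x is a safe upper bound)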
+
+ // Validate size limitations
+ if (total_size > gpu->limits.max_xfer_size) {
+ printf("alloc of %zu bytes exceeds limits\n", total_size);
+ goto err;
+ }
+
+ const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+ .type = PL_BUF_TEX_TRANSFER,
+ .host_mapped = true,
+ .size = total_size,
+ .memory_type = PL_BUF_MEM_HOST,
+ .user_data = p,
+ });
+
+ if (!buf) {
+ printf("alloc of GPU mapped buffer failed\n");
+ goto err;
+ }
+
+ assert(buf->data);
+ uintptr_t base = (uintptr_t) buf->data, data[3];
+ data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+ data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+ data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+ // Sanity check offset alignment for the sake of debugging
+ if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
+ data[1] - base != ALIGN2(data[1] - base, off_align) ||
+ data[2] - base != ALIGN2(data[2] - base, off_align))
+ {
+ printf("GPU buffer horribly misaligned, expect slowdown!\n");
+ }
+
+ p->allocator_data = (void *) buf;
+ p->data[0] = (void *) data[0];
+ p->data[1] = (void *) data[1];
+ p->data[2] = (void *) data[2];
+ ret = 0;
+
+ // fall through
+err:
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return ret;
+}
+
+static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+ assert(pic->allocator_data);
+
+ SDL_LockMutex(rd_priv_ctx->lock);
+ const struct pl_gpu *gpu = rd_priv_ctx->gpu;
+ pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+#ifdef HAVE_PLACEBO_VULKAN
+const Dav1dPlayRenderInfo rdr_placebo_vk = {
+ .name = "placebo-vk",
+ .create_renderer = placebo_renderer_create_vk,
+ .destroy_renderer = placebo_renderer_destroy,
+ .render = placebo_render,
+ .update_frame = placebo_upload_image,
+ .alloc_pic = placebo_alloc_pic,
+ .release_pic = placebo_release_pic,
+
+# if PL_API_VER >= 63
+ .supports_gpu_grain = 1,
+# endif
+};
+#else
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+#endif
+
+#ifdef HAVE_PLACEBO_OPENGL
+const Dav1dPlayRenderInfo rdr_placebo_gl = {
+ .name = "placebo-gl",
+ .create_renderer = placebo_renderer_create_gl,
+ .destroy_renderer = placebo_renderer_destroy,
+ .render = placebo_render,
+ .update_frame = placebo_upload_image,
+ .alloc_pic = placebo_alloc_pic,
+ .release_pic = placebo_release_pic,
+
+# if PL_API_VER >= 63
+ .supports_gpu_grain = 1,
+# endif
+};
+#else
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
+
+#else
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c
new file mode 100644
index 00000000000..078d6134921
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#include <assert.h>
+
+/**
+ * Renderer context for SDL
+ */
+typedef struct renderer_priv_ctx
+{
+ // SDL window
+ SDL_Window *win;
+ // SDL renderer
+ SDL_Renderer *renderer;
+ // Lock protecting access to the texture
+ SDL_mutex *lock;
+ // Texture to render
+ SDL_Texture *tex;
+} Dav1dPlayRendererPrivateContext;
+
+static void *sdl_renderer_create()
+{
+ SDL_Window *win = dp_create_sdl_window(0);
+ if (win == NULL)
+ return NULL;
+
+ // Alloc
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
+ if (rd_priv_ctx == NULL) {
+ return NULL;
+ }
+ rd_priv_ctx->win = win;
+
+ // Create renderer
+ rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
+ // Set scale quality
+ SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear");
+
+ // Create Mutex
+ rd_priv_ctx->lock = SDL_CreateMutex();
+ if (rd_priv_ctx->lock == NULL) {
+ fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+ free(rd_priv_ctx);
+ return NULL;
+ }
+
+ rd_priv_ctx->tex = NULL;
+
+ return rd_priv_ctx;
+}
+
+static void sdl_renderer_destroy(void *cookie)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ SDL_DestroyRenderer(rd_priv_ctx->renderer);
+ SDL_DestroyMutex(rd_priv_ctx->lock);
+ free(rd_priv_ctx);
+}
+
+static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ SDL_LockMutex(rd_priv_ctx->lock);
+
+ if (rd_priv_ctx->tex == NULL) {
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return;
+ }
+
+ // Display the frame
+ SDL_RenderClear(rd_priv_ctx->renderer);
+ SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL);
+ SDL_RenderPresent(rd_priv_ctx->renderer);
+
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
+static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
+ const Dav1dPlaySettings *settings)
+{
+ Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+ assert(rd_priv_ctx != NULL);
+
+ SDL_LockMutex(rd_priv_ctx->lock);
+
+ if (dav1d_pic == NULL) {
+ rd_priv_ctx->tex = NULL;
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return 0;
+ }
+
+ int width = dav1d_pic->p.w;
+ int height = dav1d_pic->p.h;
+ int tex_w = width;
+ int tex_h = height;
+
+ enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
+
+ if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
+ fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
+ exit(50);
+ }
+
+ SDL_Texture *texture = rd_priv_ctx->tex;
+ if (texture != NULL) {
+ SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h);
+ if (tex_w != width || tex_h != height) {
+ SDL_DestroyTexture(texture);
+ texture = NULL;
+ }
+ }
+
+ if (texture == NULL) {
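+ // SDL_PIXELFORMAT_IYUV is planar Y, U, V, matching dav1d's I420 layout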
+ texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
+ SDL_TEXTUREACCESS_STREAMING, width, height);
+ }
+
+ SDL_UpdateYUVTexture(texture, NULL,
+ dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y
+ dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U
+ dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V
+ );
+
+ rd_priv_ctx->tex = texture;
+ SDL_UnlockMutex(rd_priv_ctx->lock);
+ return 0;
+}
+
+const Dav1dPlayRenderInfo rdr_sdl = {
+ .name = "sdl",
+ .create_renderer = sdl_renderer_create,
+ .destroy_renderer = sdl_renderer_destroy,
+ .render = sdl_render,
+ .update_frame = sdl_update_texture
+};
diff --git a/chromium/third_party/dav1d/libdav1d/examples/meson.build b/chromium/third_party/dav1d/libdav1d/examples/meson.build
index bad1d902ed3..50e097a8df6 100644
--- a/chromium/third_party/dav1d/libdav1d/examples/meson.build
+++ b/chromium/third_party/dav1d/libdav1d/examples/meson.build
@@ -35,28 +35,40 @@ endif
# dav1d player sources
dav1dplay_sources = files(
'dav1dplay.c',
+ 'dp_fifo.c',
+ 'dp_renderer_placebo.c',
+ 'dp_renderer_sdl.c',
)
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)
if sdl2_dependency.found()
+ dav1dplay_deps = [sdl2_dependency]
+ dav1dplay_cflags = []
+
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
- vulkan_dependency = dependency('vulkan', required: false)
- sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
- cflag_placebo = []
- deps_placebo = []
- if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
- cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
- deps_placebo = [vulkan_dependency, placebo_dependency]
+
+ if placebo_dependency.found()
+ dav1dplay_deps += placebo_dependency
+ dav1dplay_cflags += '-DHAVE_PLACEBO'
+
+ # If libplacebo is found, we might be able to use Vulkan
+ # with it, in which case we need the Vulkan library too.
+ vulkan_dependency = dependency('vulkan', required: false)
+ if vulkan_dependency.found()
+ dav1dplay_deps += vulkan_dependency
+ dav1dplay_cflags += '-DHAVE_VULKAN'
+ endif
endif
+
dav1dplay = executable('dav1dplay',
dav1dplay_sources,
rev_target,
link_with : [libdav1d, dav1d_input_objs],
include_directories : [dav1d_inc_dirs],
- dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
+ dependencies : [getopt_dependency, dav1dplay_deps],
install : true,
- c_args : cflag_placebo,
+ c_args : dav1dplay_cflags,
)
endif
diff --git a/chromium/third_party/dav1d/libdav1d/gcovr.cfg b/chromium/third_party/dav1d/libdav1d/gcovr.cfg
new file mode 100644
index 00000000000..f768de8a656
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/gcovr.cfg
@@ -0,0 +1,3 @@
+exclude = .*/tests/.*
+exclude = .*/tools/.*
+exclude = .*/include/common/dump.h
diff --git a/chromium/third_party/dav1d/libdav1d/meson.build b/chromium/third_party/dav1d/libdav1d/meson.build
index b575601e556..d5366f9a7c4 100644
--- a/chromium/third_party/dav1d/libdav1d/meson.build
+++ b/chromium/third_party/dav1d/libdav1d/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2018-2019, VideoLAN and dav1d authors
+# Copyright © 2018-2020, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
- version: '0.6.0',
+ version: '0.7.1',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
-dav1d_soname_version = '4.0.0'
+dav1d_soname_version = '4.0.2'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -196,10 +196,10 @@ else
getopt_dependency = []
endif
-if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
- cdata.set('HAVE_POSIX_MEMALIGN', 1)
-elif cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
+if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
cdata.set('HAVE_ALIGNED_MALLOC', 1)
+elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+ cdata.set('HAVE_POSIX_MEMALIGN', 1)
elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
cdata.set('HAVE_MEMALIGN', 1)
endif
@@ -415,7 +415,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
depfile: '@BASENAME@.obj.ndep',
arguments: [
'-f', nasm_format,
- '-I', '@0@/src/'.format(meson.current_source_dir()),
+ '-I', '@0@/src/'.format(dav1d_src_root),
'-I', '@0@/'.format(meson.current_build_dir()),
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
'@EXTRA_ARGS@',
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S
new file mode 100644
index 00000000000..867eb194df9
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S
@@ -0,0 +1,3386 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+const idct_coeffs, align=4
+ // idct4
+ .short 2896, 2896*8, 1567, 3784
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
+.macro vmull_vmlal d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlal.s16 \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlsl.s16 \d1, \s3, \c1
+.endm
+
+.macro vrshrn_8h d0, d1, s0, s1, shift
+ vrshrn.i32 \d0, \s0, \shift
+ vrshrn.i32 \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s16 \r0, \r0, \c
+ vqrdmulh.s16 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s16 \r2, \r2, \c
+ vqrdmulh.s16 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s16 \r4, \r4, \c
+ vqrdmulh.s16 \r5, \r5, \c
+ vqrdmulh.s16 \r6, \r6, \c
+ vqrdmulh.s16 \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.8 {\load}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.8 {\store}, [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store d3, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits
+ load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits
+ load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits
+ load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits
+ load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits
+ load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits
+ load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits
+ load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits
+ load_add_store , , , , , , d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src
+ load_add_store d3, q9, , , , , , \dst, \src
+ load_add_store d4, q10, d2, q8, , , , \dst, \src
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src
+ load_add_store , , d4, q10, q9, d3, d2, \dst, \src
+ load_add_store , , d5, q11, q10, d4, d3, \dst, \src
+ load_add_store , , , , q11, d5, d4, \dst, \src
+ load_add_store , , , , , , d5, \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+ vld1.32 {\load[0]}, [\src, :32], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #4
+.endif
+.ifnb \load
+ vld1.32 {\load[1]}, [\src, :32], r1
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[0]}, [\dst, :32], r1
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[1]}, [\dst, :32], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src
+ load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src
+ load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src
+ load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src
+ load_add_store4 , , , , q15, d7, d6, \dst, \src
+ load_add_store4 , , , , , , d7, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 , , , , q11, d3, d2, \dst, \src
+ load_add_store4 , , , , , , d3, \dst, \src
+.endm
+
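+// DC-only fast path: when eob is 0 only the DC coefficient is present, so the
+// whole block reduces to adding one constant value to the destination pixels.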
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d0, r12
+ vqrdmulh.s16 d16, d16, d0[0]
+ vst1.16 {d30[0]}, [r2, :16]
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s16 d16, d16, d0[0]
+.endif
+.if \shift > 0
+ vrshr.s16 d16, d16, #\shift
+.endif
+ vqrdmulh.s16 d20, d16, d0[0]
+ mov r3, #\h
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ subs r3, r3, #4
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q10, q8, d0
+ vqmovun.s16 d0, q10
+ vaddw.u8 q11, q8, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q11
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon
+1:
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d1}, [r0, :64], r1
+ vld1.8 {d2}, [r0, :64], r1
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d3}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ subs r3, r3, #4
+ vaddw.u8 q11, q8, d1
+ vqmovun.s16 d0, q10
+ vaddw.u8 q12, q8, d2
+ vqmovun.s16 d1, q11
+ vaddw.u8 q13, q8, d3
+ vst1.8 {d0}, [r0, :64], r1
+ vqmovun.s16 d2, q12
+ vst1.8 {d1}, [r0, :64], r1
+ vqmovun.s16 d3, q13
+ vst1.8 {d2}, [r0, :64], r1
+ vst1.8 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon
+1:
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q1}, [r0, :128], r1
+ vld1.8 {q2}, [r0, :128], r1
+ subs r3, r3, #4
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vld1.8 {q3}, [r0, :128], r1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q1}, [r0, :128], r1
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon
+1:
+ vld1.8 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #1
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon
+ sub r1, r1, #32
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.8 {q2, q3}, [r0, :128]
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, #32
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128]!
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+.macro iwht4
+ vadd.i16 d16, d16, d17
+ vsub.i16 d21, d18, d19
+ vsub.i16 d20, d16, d21
+ vshr.s16 d20, d20, #1
+ vsub.i16 d18, d20, d17
+ vsub.i16 d17, d20, d19
+ vadd.i16 d19, d21, d18
+ vsub.i16 d16, d16, d17
+.endm
+
+.macro idct_4h_x4 r0, r1, r2, r3
+ vmull_vmlal q3, \r1, \r3, d0[3], d0[2]
+ vmull_vmlsl q2, \r1, \r3, d0[2], d0[3]
+ vmull_vmlal q1, \r0, \r2, d0[0], d0[0]
+ vrshrn.i32 d6, q3, #12
+ vrshrn.i32 d7, q2, #12
+ vmull_vmlsl q2, \r0, \r2, d0[0], d0[0]
+ vrshrn.i32 d2, q1, #12
+ vrshrn.i32 d3, q2, #12
+ vqadd.s16 \r0, d2, d6
+ vqsub.s16 \r3, d2, d6
+ vqadd.s16 \r1, d3, d7
+ vqsub.s16 \r2, d3, d7
+.endm
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
+ vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
+ vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vrshrn_8h d12, d13, q6, q7, #12
+ vrshrn_8h d14, d15, q4, q5, #12
+ vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vrshrn_8h d4, d5, q2, q3, #12
+ vrshrn_8h d6, d7, q4, q5, #12
+ vqadd.s16 \q0, q2, q6
+ vqsub.s16 \q3, q2, q6
+ vqadd.s16 \q1, q3, q7
+ vqsub.s16 \q2, q3, q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_4h_x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q1, d16, d18
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmull.s16 q10, d17, d0[3]
+ vaddw.s16 q1, q1, d19
+ vmull.s16 q3, d16, d0[2]
+ vmlsl.s16 q3, d18, d0[0]
+ vmlsl.s16 q3, d19, d0[1]
+
+ vadd.s32 q11, q2, q3
+ vmul.s32 q1, q1, d1[0]
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q10
+ vsub.s32 q11, q11, q10
+
+ vrshrn.i32 \o0, q2, #12
+ vrshrn.i32 \o2, q1, #12
+ vrshrn.i32 \o1, q3, #12
+ vrshrn.i32 \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+ iadst_4x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+ iadst_4x4 d19, d18, d17, d16
+ bx lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q2, d16, d20
+ vsubl.s16 q3, d17, d21
+ vmull.s16 q4, d16, d0[0]
+ vmlal.s16 q4, d20, d0[1]
+ vmlal.s16 q4, d22, d0[2]
+ vmull.s16 q5, d17, d0[0]
+ vmlal.s16 q5, d21, d0[1]
+ vmlal.s16 q5, d23, d0[2]
+ vaddw.s16 q2, q2, d22
+ vaddw.s16 q3, q3, d23
+ vmull.s16 q6, d16, d0[2]
+ vmlsl.s16 q6, d20, d0[0]
+ vmlsl.s16 q6, d22, d0[1]
+ vmull.s16 q7, d17, d0[2]
+ vmlsl.s16 q7, d21, d0[0]
+ vmlsl.s16 q7, d23, d0[1]
+
+ vmul.s32 q10, q2, d1[0]
+ vmul.s32 q11, q3, d1[0]
+
+ vmull.s16 q2, d18, d0[3]
+ vmull.s16 q3, d19, d0[3]
+
+ vadd.s32 q8, q4, q2 // out0
+ vadd.s32 q9, q5, q3
+
+ vadd.s32 q4, q4, q6 // out3
+ vadd.s32 q5, q5, q7
+
+ vadd.s32 q6, q6, q2 // out1
+ vadd.s32 q7, q7, q3
+
+ vsub.s32 q4, q4, q2 // out3
+ vsub.s32 q5, q5, q3
+
+ vrshrn.i32 d20, q10, #12
+ vrshrn.i32 d21, q11, #12
+
+ vrshrn.i32 \o0, q8, #12
+ vrshrn.i32 \o1, q9, #12
+
+.ifc \o4, d18
+ vmov q9, q10
+.endif
+
+ vrshrn.i32 \o2, q6, #12
+ vrshrn.i32 \o3, q7, #12
+
+ vrshrn.i32 \o6, q4, #12
+ vrshrn.i32 \o7, q5, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+ iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+ iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_identity_4h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q2, q8, d0[0]
+ vqrdmulh.s16 q3, q9, d0[0]
+ vqadd.s16 q8, q8, q2
+ vqadd.s16 q9, q9, q3
+ bx lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q1, q8, d0[0]
+ vqrdmulh.s16 q2, q9, d0[0]
+ vqrdmulh.s16 q3, q10, d0[0]
+ vqadd.s16 q8, q8, q1
+ vqrdmulh.s16 q1, q11, d0[0]
+ vqadd.s16 q9, q9, q2
+ vqadd.s16 q10, q10, q3
+ vqadd.s16 q11, q11, q1
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0, \r1, \r2, \r3
+ vqrdmulh.s16 q1, \i, \c
+ vrhadd.s16 \i, \i, q1
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ vshr.s16 q8, q8, #2
+ vshr.s16 q9, q9, #2
+
+ iwht4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ iwht4
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ blx r4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+L(itx_4x4_end):
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q8, q8, d0
+ vqmovun.s16 d0, q8
+ vaddw.u8 q9, q9, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q9
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+
+ pop {r4-r5,pc}
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d4, r12
+ vst1.16 {d30[0]}, [r2, :16]
+ vqrdmulh.s16 d16, d16, d4[0]
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vqrdmulh.s16 d20, d16, d4[0]
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vmov q9, q8
+ vld1.32 {d1[1]}, [r0, :32], r1
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4h_x4_neon
+ movrel_local r5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13
+
+ vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a
+ vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a
+ vrshrn_8h \r2, \r3, q2, q3, #12 // t4a
+ vrshrn_8h \r14, \r15, q4, q5, #12 // t7a
+ vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
+ vrshrn_8h \r6, \r7, q6, q7, #12 // t5a
+ vrshrn_8h \r10, \r11, q2, q3, #12 // t6a
+
+ vqadd.s16 q2, \q1, \q3 // t4
+ vqsub.s16 \q1, \q1, \q3 // t5a
+ vqadd.s16 q3, \q7, \q5 // t7
+ vqsub.s16 \q3, \q7, \q5 // t6a
+
+ vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5
+ vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6
+ vrshrn_8h d8, d9, q4, q5, #12 // t5
+ vrshrn_8h d10, d11, q6, q7, #12 // t6
+
+ vqsub.s16 \q7, \q0, q3 // out7
+ vqadd.s16 \q0, \q0, q3 // out0
+ vqadd.s16 \q1, \q2, q5 // out1
+ vqsub.s16 q6, \q2, q5 // out6
+ vqadd.s16 \q2, \q4, q4 // out2
+ vqsub.s16 \q5, \q4, q4 // out5
+ vqadd.s16 \q3, \q6, q2 // out3
+ vqsub.s16 \q4, \q6, q2 // out4
+ vmov \q6, q6 // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4h_x4 \r0, \r2, \r4, \r6
+
+ vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a
+ vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a
+ vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a
+ vrshrn.i32 \r1, q1, #12 // t4a
+ vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a
+ vrshrn.i32 \r7, q2, #12 // t7a
+ vrshrn.i32 \r3, q3, #12 // t5a
+ vrshrn.i32 \r5, q1, #12 // t6a
+
+ vqadd.s16 d2, \r1, \r3 // t4
+ vqsub.s16 \r1, \r1, \r3 // t5a
+ vqadd.s16 d3, \r7, \r5 // t7
+ vqsub.s16 \r3, \r7, \r5 // t6a
+
+ vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5
+ vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshrn.i32 d4, q2, #12 // t5
+ vrshrn.i32 d5, q3, #12 // t6
+
+ vqsub.s16 \r7, \r0, d3 // out7
+ vqadd.s16 \r0, \r0, d3 // out0
+ vqadd.s16 \r1, \r2, d5 // out1
+ vqsub.s16 d6, \r2, d5 // out6
+ vqadd.s16 \r2, \r4, d4 // out2
+ vqsub.s16 \r5, \r4, d4 // out5
+ vqadd.s16 \r3, \r6, d2 // out3
+ vqsub.s16 \r4, \r6, d2 // out4
+ vmov \r6, d6 // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1]
+ vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0]
+ vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3]
+ vrshrn_8h d16, d17, q2, q3, #12 // t0a
+ vrshrn_8h d30, d31, q4, q5, #12 // t1a
+ vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2]
+ vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1]
+ vrshrn_8h d20, d21, q6, q7, #12 // t2a
+ vrshrn_8h d26, d27, q2, q3, #12 // t3a
+ vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0]
+ vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3]
+ vrshrn_8h d24, d25, q4, q5, #12 // t4a
+ vrshrn_8h d22, d23, q6, q7, #12 // t5a
+ vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2]
+ vrshrn_8h d28, d29, q2, q3, #12 // t6a
+ vrshrn_8h d18, d19, q4, q5, #12 // t7a
+
+ vqadd.s16 q2, q8, q12 // t0
+ vqsub.s16 q3, q8, q12 // t4
+ vqadd.s16 q4, q15, q11 // t1
+ vqsub.s16 q5, q15, q11 // t5
+ vqadd.s16 q6, q10, q14 // t2
+ vqsub.s16 q7, q10, q14 // t6
+ vqadd.s16 q10, q13, q9 // t3
+ vqsub.s16 q11, q13, q9 // t7
+
+ vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2]
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3]
+ vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+ vrshrn_8h d6, d7, q8, q9, #12 // t4a
+ vrshrn_8h d10, d11, q12, q13, #12 // t5a
+
+ vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
+
+ vrshrn_8h d14, d15, q14, q15, #12 // t6a
+ vrshrn_8h d22, d23, q8, q9, #12 // t7a
+
+ vqadd.s16 \q0, q2, q6 // out0
+ vqsub.s16 q2, q2, q6 // t2
+ vqadd.s16 \q7, q4, q10 // out7
+ vqsub.s16 q4, q4, q10 // t3
+ vqneg.s16 \q7, \q7 // out7
+
+ vqadd.s16 \q1, q3, q7 // out1
+ vqsub.s16 q3, q3, q7 // t6
+ vqadd.s16 \q6, q5, q11 // out6
+ vqsub.s16 q5, q5, q11 // t7
+ vqneg.s16 \q1, \q1 // out1
+
+ vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12)
+ vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11)
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+ vrshrn_8h d4, d5, q10, q11, #12 // out3
+ vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+ vrshrn_8h d6, d7, q12, q13, #12 // out5
+ vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+ vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
+
+ vqneg.s16 \q3, q2 // out3
+ vqneg.s16 \q5, q3 // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal q2, d23, d16, d0[0], d0[1]
+ vmull_vmlsl q3, d23, d16, d0[1], d0[0]
+ vmull_vmlal q4, d21, d18, d0[2], d0[3]
+ vrshrn.i32 d16, q2, #12 // t0a
+ vrshrn.i32 d23, q3, #12 // t1a
+ vmull_vmlsl q5, d21, d18, d0[3], d0[2]
+ vmull_vmlal q6, d19, d20, d1[0], d1[1]
+ vrshrn.i32 d18, q4, #12 // t2a
+ vrshrn.i32 d21, q5, #12 // t3a
+ vmull_vmlsl q7, d19, d20, d1[1], d1[0]
+ vmull_vmlal q2, d17, d22, d1[2], d1[3]
+ vrshrn.i32 d20, q6, #12 // t4a
+ vrshrn.i32 d19, q7, #12 // t5a
+ vmull_vmlsl q3, d17, d22, d1[3], d1[2]
+ vrshrn.i32 d22, q2, #12 // t6a
+ vrshrn.i32 d17, q3, #12 // t7a
+
+ vqadd.s16 d4, d16, d20 // t0
+ vqsub.s16 d5, d16, d20 // t4
+ vqadd.s16 d6, d23, d19 // t1
+ vqsub.s16 d7, d23, d19 // t5
+ vqadd.s16 d8, d18, d22 // t2
+ vqsub.s16 d9, d18, d22 // t6
+ vqadd.s16 d18, d21, d17 // t3
+ vqsub.s16 d19, d21, d17 // t7
+
+ vmull_vmlal q8, d5, d7, d2[3], d2[2]
+ vmull_vmlsl q10, d5, d7, d2[2], d2[3]
+ vmull_vmlsl q11, d19, d9, d2[3], d2[2]
+
+ vrshrn.i32 d5, q8, #12 // t4a
+ vrshrn.i32 d7, q10, #12 // t5a
+
+ vmull_vmlal q8, d19, d9, d2[2], d2[3]
+
+ vrshrn.i32 d9, q11, #12 // t6a
+ vrshrn.i32 d19, q8, #12 // t7a
+
+ vqadd.s16 \r0, d4, d8 // out0
+ vqsub.s16 d4, d4, d8 // t2
+ vqadd.s16 \r7, d6, d18 // out7
+ vqsub.s16 d6, d6, d18 // t3
+ vqneg.s16 \r7, \r7 // out7
+
+ vqadd.s16 \r1, d5, d9 // out1
+ vqsub.s16 d5, d5, d9 // t6
+ vqadd.s16 \r6, d7, d19 // out6
+ vqsub.s16 d7, d7, d19 // t7
+ vqneg.s16 \r1, \r1 // out1
+
+ vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20)
+ vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19)
+ vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18)
+ vrshrn.i32 d4, q9, #12 // out3
+ vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21)
+ vrshrn.i32 d5, q10, #12 // out5
+ vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21)
+ vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19)
+
+ vqneg.s16 \r3, d4 // out3
+ vqneg.s16 \r5, d5 // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+ iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+ iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+ iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+ iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ vqshl.s16 q12, q12, #1
+ vqshl.s16 q13, q13, #1
+ vqshl.s16 q14, q14, #1
+ vqshl.s16 q15, q15, #1
+ bx lr
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ bx lr
+endfunc
+
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]
+
+.ifc \variant, identity_
+ // The identity transform's vqshl #1 and the final vrshr #1 downshift cancel out
+.else
+ blx r4
+
+ vrshr.s16 q8, q8, #1
+ vrshr.s16 q9, q9, #1
+ vrshr.s16 q10, q10, #1
+ vrshr.s16 q11, q11, #1
+ vrshr.s16 q12, q12, #1
+ vrshr.s16 q13, q13, #1
+ vrshr.s16 q14, q14, #1
+ vrshr.s16 q15, q15, #1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x8h q8, q9, q10, q11
+ vswp d17, d20
+ vswp d19, d21
+ vswp d17, d18
+ vswp d19, d22
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+function inv_dct_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+
+ vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
+ vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
+ vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
+ vrshrn.i32 d17, q2, #12 // t8a
+ vrshrn.i32 d31, q3, #12 // t15a
+ vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a
+ vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a
+ vrshrn.i32 d23, q4, #12 // t9a
+ vrshrn.i32 d25, q2, #12 // t14a
+ vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a
+ vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a
+ vrshrn.i32 d21, q3, #12 // t10a
+ vrshrn.i32 d27, q4, #12 // t13a
+ vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a
+ vrshrn.i32 d19, q2, #12 // t11a
+ vrshrn.i32 d29, q3, #12 // t12a
+
+ idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ vqsub.s16 d4, d17, d23 // t9
+ vqadd.s16 d17, d17, d23 // t8
+ vqsub.s16 d5, d31, d25 // t14
+ vqadd.s16 d31, d31, d25 // t15
+ vqsub.s16 d23, d19, d21 // t10
+ vqadd.s16 d19, d19, d21 // t11
+ vqadd.s16 d25, d29, d27 // t12
+ vqsub.s16 d29, d29, d27 // t13
+
+ vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a
+ vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a
+ vrshrn.i32 d21, q3, #12 // t9a
+ vrshrn.i32 d27, q4, #12 // t14a
+
+ vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a
+ vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a
+ vrshrn.i32 d29, q3, #12 // t13a
+ vneg.s32 q4, q4
+ vrshrn.i32 d23, q4, #12 // t10a
+
+ vqsub.s16 d4, d17, d19 // t11a
+ vqadd.s16 d17, d17, d19 // t8a
+ vqsub.s16 d5, d31, d25 // t12a
+ vqadd.s16 d31, d31, d25 // t15a
+ vqadd.s16 d19, d21, d23 // t9
+ vqsub.s16 d21, d21, d23 // t10
+ vqsub.s16 d25, d27, d29 // t13
+ vqadd.s16 d27, d27, d29 // t14
+
+ vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11
+ vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12
+ vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
+
+ vrshrn.i32 d6, q3, #12 // t11
+ vrshrn.i32 d7, q4, #12 // t12
+ vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a
+ vrshrn.i32 d4, q2, #12 // t10a
+ vrshrn.i32 d5, q4, #12 // t13a
+
+ vqadd.s16 d8, d16, d31 // out0
+ vqsub.s16 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s16 d23, d30, d17 // out7
+ vqsub.s16 d9, d30, d17 // out8
+ vqadd.s16 d17, d18, d27 // out1
+ vqsub.s16 d30, d18, d27 // out14
+ vqadd.s16 d18, d20, d5 // out2
+ vqsub.s16 d29, d20, d5 // out13
+ vqadd.s16 d5, d28, d19 // out6
+ vqsub.s16 d25, d28, d19 // out9
+ vqadd.s16 d19, d22, d7 // out3
+ vqsub.s16 d28, d22, d7 // out12
+ vqadd.s16 d20, d24, d6 // out4
+ vqsub.s16 d27, d24, d6 // out11
+ vqadd.s16 d21, d26, d4 // out5
+ vqsub.s16 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel_local r12, iadst16_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+ movrel_local r12, idct_coeffs
+
+ vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0
+ vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1
+ vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2
+ vrshrn.i32 d16, q2, #12 // t0
+ vrshrn.i32 d31, q3, #12 // t1
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3
+ vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4
+ vrshrn.i32 d18, q4, #12 // t2
+ vrshrn.i32 d29, q2, #12 // t3
+ vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5
+ vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6
+ vrshrn.i32 d20, q3, #12 // t4
+ vrshrn.i32 d27, q4, #12 // t5
+ vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7
+ vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8
+ vrshrn.i32 d22, q2, #12 // t6
+ vrshrn.i32 d25, q3, #12 // t7
+ vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9
+ vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10
+ vrshrn.i32 d23, q4, #12 // t8
+ vrshrn.i32 d24, q2, #12 // t9
+ vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11
+ vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12
+ vrshrn.i32 d21, q3, #12 // t10
+ vrshrn.i32 d26, q4, #12 // t11
+ vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13
+ vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14
+ vrshrn.i32 d19, q2, #12 // t12
+ vrshrn.i32 d28, q3, #12 // t13
+ vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15
+ vrshrn.i32 d17, q4, #12 // t14
+ vrshrn.i32 d30, q2, #12 // t15
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d23 // t8a
+ vqadd.s16 d16, d16, d23 // t0a
+ vqsub.s16 d3, d31, d24 // t9a
+ vqadd.s16 d31, d31, d24 // t1a
+ vqadd.s16 d23, d18, d21 // t2a
+ vqsub.s16 d18, d18, d21 // t10a
+ vqadd.s16 d24, d29, d26 // t3a
+ vqsub.s16 d29, d29, d26 // t11a
+ vqadd.s16 d21, d20, d19 // t4a
+ vqsub.s16 d20, d20, d19 // t12a
+ vqadd.s16 d26, d27, d28 // t5a
+ vqsub.s16 d27, d27, d28 // t13a
+ vqadd.s16 d19, d22, d17 // t6a
+ vqsub.s16 d22, d22, d17 // t14a
+ vqadd.s16 d28, d25, d30 // t7a
+ vqsub.s16 d25, d25, d30 // t15a
+
+ vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8
+ vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9
+ vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10
+ vrshrn.i32 d17, q2, #12 // t8
+ vrshrn.i32 d30, q3, #12 // t9
+ vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11
+ vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12
+ vrshrn.i32 d18, q4, #12 // t10
+ vrshrn.i32 d29, q2, #12 // t11
+ vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13
+ vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14
+ vrshrn.i32 d27, q3, #12 // t12
+ vrshrn.i32 d20, q4, #12 // t13
+ vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15
+ vrshrn.i32 d25, q2, #12 // t14
+ vrshrn.i32 d22, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t4
+ vqadd.s16 d16, d16, d21 // t0
+ vqsub.s16 d3, d31, d26 // t5
+ vqadd.s16 d31, d31, d26 // t1
+ vqadd.s16 d21, d23, d19 // t2
+ vqsub.s16 d23, d23, d19 // t6
+ vqadd.s16 d26, d24, d28 // t3
+ vqsub.s16 d24, d24, d28 // t7
+ vqadd.s16 d19, d17, d27 // t8a
+ vqsub.s16 d17, d17, d27 // t12a
+ vqadd.s16 d28, d30, d20 // t9a
+ vqsub.s16 d30, d30, d20 // t13a
+ vqadd.s16 d27, d18, d25 // t10a
+ vqsub.s16 d18, d18, d25 // t14a
+ vqadd.s16 d20, d29, d22 // t11a
+ vqsub.s16 d29, d29, d22 // t15a
+
+ vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a
+ vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a
+ vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a
+ vrshrn.i32 d22, q2, #12 // t4a
+ vrshrn.i32 d25, q3, #12 // t5a
+ vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a
+ vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12
+ vrshrn.i32 d24, q4, #12 // t6a
+ vrshrn.i32 d23, q2, #12 // t7a
+ vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14
+ vrshrn.i32 d17, q3, #12 // t12
+ vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15
+ vrshrn.i32 d29, q4, #12 // t13
+ vrshrn.i32 d30, q2, #12 // t14
+ vrshrn.i32 d18, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s16 \o0, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+.else
+ vqadd.s16 d4, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+ vqneg.s16 \o15, \o15 // out15
+
+ vqsub.s16 d3, d29, d18 // t15a
+ vqadd.s16 \o13,d29, d18 // out13
+ vqadd.s16 \o2, d17, d30 // out2
+ vqsub.s16 d26, d17, d30 // t14a
+ vqneg.s16 \o13,\o13 // out13
+
+ vqadd.s16 \o1, d19, d27 // out1
+ vqsub.s16 d27, d19, d27 // t10
+ vqadd.s16 \o14,d28, d20 // out14
+ vqsub.s16 d20, d28, d20 // t11
+ vqneg.s16 \o1, \o1 // out1
+
+ vqadd.s16 \o3, d22, d24 // out3
+ vqsub.s16 d22, d22, d24 // t6
+ vqadd.s16 \o12,d25, d23 // out12
+ vqsub.s16 d23, d25, d23 // t7
+ vqneg.s16 \o3, \o3 // out3
+
+ vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vrshrn.i32 d24, q12, #12 // out8
+ vrshrn.i32 d4, q2, #12 // out7
+ vrshrn.i32 d5, q3, #12 // out5
+ vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vrshrn.i32 d26, q4, #12 // out10
+
+ vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vrshrn.i32 \o4, q1, #12 // out4
+ vrshrn.i32 d7, q3, #12 // out9
+ vrshrn.i32 d6, q4, #12 // out11
+ vrshrn.i32 \o6, q11, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+
+ vqneg.s16 \o7, d4 // out7
+ vqneg.s16 \o5, d5 // out5
+ vqneg.s16 \o11,d6 // out11
+ vqneg.s16 \o9, d7 // out9
+.endm
+
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q1, \i, d0[0]
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q1
+.endr
+ bx lr
+endfunc
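+
+// The scaling above computes x * 2*sqrt(2) (~= 5793/2048) without needing a
+// Q15 factor >= 1.0: vqrdmulh contributes x * 1697/2048 and the two saturating
+// additions contribute 2*x. A rough C sketch of the same arithmetic (sat16()
+// and the helper name are illustrative, not dav1d API):
+//
+//   int16_t identity16_scale(int16_t x) {
+//       // vqrdmulh.s16: (2*a*b + (1 << 15)) >> 16, with saturation
+//       int32_t frac = (2 * x * (2 * (5793 - 4096) * 8) + (1 << 15)) >> 16;
+//       return sat16(sat16(2 * x) + frac); // == x * 5793/2048, saturated
+//   }
+//
+// (identity_4x16_shift1/_shift2 below compute roughly the same product with a
+// >>1 or >>2 rounding downshift folded in.)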
+
+.macro identity_4x16_shift2 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vshr.s16 q2, q2, #1
+ vrhadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_4x16_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vrshr.s16 q2, q2, #1
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+ identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+.if \identity
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.endif
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+.if \identity
+.if \shift == -2
+ identity_4x16_shift2 d0[0]
+.else
+ identity_4x16_shift1 d0[0]
+.endif
+.else
+ blx r4
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #\shift
+.endr
+.endif
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+ vst1.16 {\i}, [r6, :64]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
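+
+// The four flavours generated above differ only in their prologue: _scale
+// pre-multiplies the input by 1/sqrt(2) (2896*8/32768 ~= 0.7071, used for the
+// rectangular transform sizes), _identity inlines the identity transform
+// instead of calling through r4, and negative shift values select the
+// identity rounding macros (identity_4x16_shift1/2) rather than a plain vrshr.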
+
+function inv_txfm_add_vert_4x16_neon
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ blx r5
+ load_add_store_4x16 r6, r7
+ pop {pc}
+endfunc
+
+.macro sub_sp_align space
+#if CONFIG_THUMB
+ mov r7, sp
+ and r7, r7, #15
+#else
+ and r7, sp, #15
+#endif
+ sub sp, sp, r7
+ // Now that the stack is aligned, store the amount of adjustment back
+ // on the stack, as we don't want to waste a register as a frame
+ // pointer.
+ str r7, [sp, #-16]!
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub r7, sp, #4096
+ ldr r12, [r7]
+ sub r7, r7, #(\space - 4096)
+ mov sp, r7
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space
+.if \space >= 4096
+ add sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ add sp, sp, #(\space)%4096
+.endif
+ ldr r7, [sp], #16
+ // Add back the original stack adjustment
+ add sp, sp, r7
+.endm
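+
+// Taken together, sub_sp_align/add_sp_align behave roughly like the pseudo-C
+// below; the alignment padding is remembered in a 16-byte stack slot instead
+// of tying up a register as a frame pointer (names are illustrative only):
+//
+//   uintptr_t sub_sp_align(uintptr_t sp, size_t space) {
+//       uintptr_t pad = sp & 15;           // distance down to 16-byte alignment
+//       sp -= pad;
+//       sp -= 16; *(uintptr_t *)sp = pad;  // keep the pad in a 16-byte slot
+//       return sp - space;                 // on Windows, split so every new
+//   }                                      // stack page gets touched (probed)
+//   uintptr_t add_sp_align(uintptr_t sp, size_t space) {
+//       sp += space;
+//       uintptr_t pad = *(uintptr_t *)sp;
+//       return sp + 16 + pad;
+//   }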
+
+function inv_txfm_add_16x16_neon
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ blx r9
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
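+
+// The eob tables hold end-of-block thresholds, one per 4-column slice of the
+// first (horizontal) pass: before each slice the driver compares the block's
+// eob (r3) against the next threshold and, once it falls short, skips the
+// remaining slices and zero-fills that part of the intermediate buffer
+// (label 1: in inv_txfm_add_16x16_neon) before running the vertical passes.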
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ push {r4-r11,lr}
+ vpush {q4}
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_identity_16x4_neon
+.else
+ movrel_local r9, inv_txfm_horz_16x4_neon
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+
+.ifc \variant, identity_
+ vmov.i16 d4, #0
+.irp i, d16, d18, d20, d22
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, d24, d26, d28, d30
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+
+ identity_4x16_shift1 d0[0]
+.else
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+
+ blx r4
+
+ vswp d17, d20
+ vswp d19, d22
+ vswp d18, d20
+ vswp d19, d21
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+
+.ifc \variant, identity_
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q14
+ vmov q11, q15
+.else
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+ vrshr.s16 q8, q12, #1
+ vrshr.s16 q9, q13, #1
+ vrshr.s16 q10, q14, #1
+ vrshr.s16 q11, q15, #1
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ add r6, r0, #8
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ vmov.i16 q2, #0
+
+ mov r11, #32
+ cmp r3, r10
+ blt 1f
+
+ add r6, r2, #16
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q12, q13, q14, q15, d0[0]
+.else
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vrshr.s16 q12, q8, #1
+ vrshr.s16 q13, q9, #1
+ vrshr.s16 q14, q10, #1
+ vrshr.s16 q15, q11, #1
+.endif
+ transpose_4x8h q12, q13, q14, q15
+ vswp d27, d29
+ vswp d26, d28
+ vswp d27, d30
+ vswp d25, d28
+
+ b 2f
+1:
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+2:
+ vmov.i16 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+.else
+ blx r4
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ vswp d19, d21
+ vswp d18, d20
+ vswp d19, d22
+ vswp d17, d20
+
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
+ mov r10, #\eob_half
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ sub_sp_align 256
+
+.irp i, 0, 4
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #8*2
+ blx r9
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ sub_sp_align 256
+
+.irp i, 0, 8
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+
+ vmov.i16 q2, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128]
+ vst1.16 {q2}, [r7, :128], r8
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.ifc \variant, identity_
+ // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+ blx r4
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \j, \j, #1
+.endr
+.endif
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+ vst1.16 {q8, q9}, [r6, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.else
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+ movrel_local r9, inv_txfm_horz_scale_16x4_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.endif
+.if \w == 8
+ mov r10, #\eob_8x8
+.else
+ mov r10, #\eob_4x4
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs, 2*16
+ vld1.16 {q0, q1}, [r12, :128]
+ sub r12, r12, #2*16
+
+ vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a
+ vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a
+ vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a
+ vrshrn.i32 d16, q2, #12 // t16a
+ vrshrn.i32 d31, q3, #12 // t31a
+ vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a
+ vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a
+ vrshrn.i32 d24, q4, #12 // t17a
+ vrshrn.i32 d23, q2, #12 // t30a
+ vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a
+ vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a
+ vrshrn.i32 d20, q3, #12 // t18a
+ vrshrn.i32 d27, q4, #12 // t29a
+ vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a
+ vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a
+ vrshrn.i32 d28, q2, #12 // t19a
+ vrshrn.i32 d19, q3, #12 // t28a
+ vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a
+ vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a
+ vrshrn.i32 d18, q4, #12 // t20a
+ vrshrn.i32 d29, q2, #12 // t27a
+ vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a
+ vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a
+ vrshrn.i32 d26, q3, #12 // t21a
+ vrshrn.i32 d21, q4, #12 // t26a
+ vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a
+ vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a
+ vrshrn.i32 d22, q2, #12 // t22a
+ vrshrn.i32 d25, q3, #12 // t25a
+ vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a
+ vrshrn.i32 d30, q4, #12 // t23a
+ vrshrn.i32 d17, q2, #12 // t24a
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d24 // t17
+ vqadd.s16 d16, d16, d24 // t16
+ vqsub.s16 d3, d31, d23 // t30
+ vqadd.s16 d31, d31, d23 // t31
+ vqsub.s16 d24, d28, d20 // t18
+ vqadd.s16 d28, d28, d20 // t19
+ vqadd.s16 d23, d18, d26 // t20
+ vqsub.s16 d18, d18, d26 // t21
+ vqsub.s16 d20, d30, d22 // t22
+ vqadd.s16 d30, d30, d22 // t23
+ vqadd.s16 d26, d17, d25 // t24
+ vqsub.s16 d17, d17, d25 // t25
+ vqsub.s16 d22, d29, d21 // t26
+ vqadd.s16 d29, d29, d21 // t27
+ vqadd.s16 d25, d19, d27 // t28
+ vqsub.s16 d19, d19, d27 // t29
+
+ vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a
+ vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a
+ vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a
+ vrshrn.i32 d21, q2, #12 // t17a
+ vrshrn.i32 d27, q3, #12 // t30a
+ vneg.s32 q4, q4 // -> t18a
+ vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a
+ vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a
+ vrshrn.i32 d19, q4, #12 // t18a
+ vrshrn.i32 d24, q1, #12 // t29a
+ vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a
+ vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a
+ vrshrn.i32 d22, q2, #12 // t21a
+ vrshrn.i32 d18, q3, #12 // t26a
+ vneg.s32 q4, q4 // -> t22a
+ vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a
+ vrshrn.i32 d17, q4, #12 // t22a
+ vrshrn.i32 d20, q1, #12 // t25a
+
+ vqsub.s16 d2, d27, d24 // t29
+ vqadd.s16 d27, d27, d24 // t30
+ vqsub.s16 d3, d21, d19 // t18
+ vqadd.s16 d21, d21, d19 // t17
+ vqsub.s16 d24, d16, d28 // t19a
+ vqadd.s16 d16, d16, d28 // t16a
+ vqsub.s16 d19, d30, d23 // t20a
+ vqadd.s16 d30, d30, d23 // t23a
+ vqsub.s16 d28, d17, d22 // t21
+ vqadd.s16 d17, d17, d22 // t22
+ vqadd.s16 d23, d26, d29 // t24a
+ vqsub.s16 d26, d26, d29 // t27a
+ vqadd.s16 d22, d20, d18 // t25
+ vqsub.s16 d20, d20, d18 // t26
+ vqsub.s16 d29, d31, d25 // t28a
+ vqadd.s16 d31, d31, d25 // t31a
+
+ vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a
+ vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a
+ vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19
+ vrshrn.i32 d18, q2, #12 // t18a
+ vrshrn.i32 d25, q3, #12 // t29a
+ vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28
+ vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20
+ vrshrn.i32 d29, q4, #12 // t19
+ vrshrn.i32 d24, q1, #12 // t28
+ vneg.s32 q2, q2 // -> t20
+ vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27
+ vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a
+ vrshrn.i32 d26, q2, #12 // t20
+ vrshrn.i32 d19, q3, #12 // t27
+ vneg.s32 q4, q4 // -> t21a
+ vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a
+ vrshrn.i32 d20, q4, #12 // t21a
+ vrshrn.i32 d28, q1, #12 // t26a
+
+ vqsub.s16 d2, d16, d30 // t23
+ vqadd.s16 d16, d16, d30 // t16 = out16
+ vqsub.s16 d3, d31, d23 // t24
+ vqadd.s16 d31, d31, d23 // t31 = out31
+ vqsub.s16 d23, d21, d17 // t22a
+ vqadd.s16 d17, d21, d17 // t17a = out17
+ vqadd.s16 d30, d27, d22 // t30a = out30
+ vqsub.s16 d21, d27, d22 // t25a
+ vqsub.s16 d27, d18, d20 // t21
+ vqadd.s16 d18, d18, d20 // t18 = out18
+ vqadd.s16 d4, d29, d26 // t19a = out19
+ vqsub.s16 d26, d29, d26 // t20a
+ vqadd.s16 d29, d25, d28 // t29 = out29
+ vqsub.s16 d25, d25, d28 // t26
+ vqadd.s16 d28, d24, d19 // t28a = out28
+ vqsub.s16 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+
+ vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20
+ vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27
+ vrshrn.i32 d20, q2, #12 // t20
+ vrshrn.i32 d22, q3, #12 // t27
+
+ vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a
+ vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vrshrn.i32 d26, q2, #12 // t26a
+
+ vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22
+ vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25
+ vrshrn.i32 d21, q3, #12 // t21a
+ vrshrn.i32 d22, q12, #12 // t22
+ vrshrn.i32 d25, q2, #12 // t25
+
+ vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a
+ vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a
+ vrshrn.i32 d23, q2, #12 // t23a
+ vrshrn.i32 d24, q3, #12 // t24a
+
+ bx lr
+endfunc
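+
+// inv_dct32_odd_4h_x16_neon computes the odd half of a 32-point DCT
+// (t16..t31) from the 16 odd-indexed inputs; the even half comes from
+// inv_dct_4h_x16_neon on the even-indexed inputs, and the caller merges the
+// two halves with add/sub butterflies (store2 below, and the combine macro in
+// inv_txfm_add_vert_dct_4x32_neon).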
+
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+ lsl r8, r8, #1
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d0, r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_4h_x16_neon
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+ add r6, r6, #32
+.endm
+ store1 d16, d20, d24, d28
+ store1 d17, d21, d25, d29
+ store1 d18, d22, d26, d30
+ store1 d19, d23, d27, d31
+.purgem store1
+ sub r6, r6, #64*4
+
+ vmov.i16 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_4h_x16_neon
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+ transpose_4x4h q11, q10, d23, d22, d21, d20
+ transpose_4x4h q9, q8, d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+ vld1.16 {q0, q1}, [r6, :128]
+ vqsub.s16 d7, d0, \r0
+ vqadd.s16 d0, d0, \r0
+ vqsub.s16 d6, d1, \r1
+ vqadd.s16 d1, d1, \r1
+ vqsub.s16 d5, d2, \r2
+ vqadd.s16 d2, d2, \r2
+ vqsub.s16 d4, d3, \r3
+ vqadd.s16 d3, d3, \r3
+ vrev64.16 q2, q2
+ vrev64.16 q3, q3
+ vrshr.s16 q0, q0, #\shift
+ vrshr.s16 q1, q1, #\shift
+ vrshr.s16 q2, q2, #\shift
+ vrshr.s16 q3, q3, #\shift
+ vst1.16 {q0, q1}, [r6, :128]!
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d27, d23, d19, \shift
+ store2 d30, d26, d22, d18, \shift
+ store2 d29, d25, d21, d17, \shift
+ store2 d28, d24, d20, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_4x32_neon
+ push {r10-r11,lr}
+ lsl r8, r8, #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+
+ bl inv_dct_4h_x16_neon
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1
+ bl inv_dct32_odd_4h_x16_neon
+
+ neg r9, r8
+ mov r10, r6
+.macro combine r0, r1, r2, r3, op, stride
+ vld1.16 {d4}, [r7, :64], \stride
+ vld1.32 {d2[0]}, [r10, :32], r1
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.32 {d2[1]}, [r10, :32], r1
+ \op\().s16 d4, d4, \r0
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.32 {d3[0]}, [r10, :32], r1
+ \op\().s16 d5, d5, \r1
+ vld1.32 {d3[1]}, [r10, :32], r1
+ vrshr.s16 q2, q2, #4
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vaddw.u8 q2, q2, d2
+ \op\().s16 d7, d7, \r3
+ vqmovun.s16 d2, q2
+ vrshr.s16 q3, q3, #4
+ vst1.32 {d2[0]}, [r6, :32], r1
+ vaddw.u8 q3, q3, d3
+ vst1.32 {d2[1]}, [r6, :32], r1
+ vqmovun.s16 d3, q3
+ vst1.32 {d3[0]}, [r6, :32], r1
+ vst1.32 {d3[1]}, [r6, :32], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8
+ combine d16, d17, d18, d19, vqsub, r9
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
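+
+// The combine macro applies the final butterfly of the 32-point DCT while
+// adding into the destination: the first 16 rows take even + odd (vqadd), the
+// last 16 take even - odd with the even rows walked backwards (vqsub with the
+// negated stride in r9), followed by the usual >>4, add-to-pixel and clamp.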
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ // Unlike the others, this table is only ever used in increments of 8x8
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+ push {r4-r7,lr}
+ vmov.i16 q0, #0
+ movrel_local r5, eob_32x32, 2
+
+ mov r6, #2*32
+1:
+ mov r12, #0
+ movrel_local r4, eob_32x32, 2
+2:
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r6
+.endr
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ load_add_store_8x8 r0, r7, shiftbits=2
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r7,lr}
+ movw r6, #2896*8
+ movw r7, #2*(5793-4096)*8
+ vdup.i16 d0, r6
+ movrel_local r5, eob_16x32\hshort, 2
+ vmov.16 d0[1], r7
+
+ mov r6, #2*\h
+1:
+ mov r12, #0
+ movrel_local r4, eob_16x32\wshort, 2
+2:
+ vmov.i16 q1, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s16, 1
+ identity_8x8 d0[1]
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+ load_add_store_8x8 r0, r7, shiftbits=2
+.else
+ load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q0, #0
+ movrel_local r4, eob_8x32
+
+ mov r12, #2*\h
+1:
+ ldrh lr, [r4], #2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs vrshr.s16, 1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ cmp r3, lr
+.if \w == 8
+ load_add_store_8x8 r0, r5, shiftbits=2
+.else
+ load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+ blt 9f
+.if \w == 8
+ sub r2, r2, r12, lsl #3
+ add r2, r2, #2*8
+.else
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #8
+.endif
+ b 1b
+
+9:
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r4, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r5, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32
+
+ mov r8, #2*32
+ mov r9, #32
+ mov r6, sp
+1:
+ vmov.i16 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r8
+.endr
+ ldrh r11, [r10], #2
+ sub r2, r2, r8, lsl #3
+ sub r9, r9, #8
+ add r2, r2, #2*8
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #2
+.endr
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+
+ bge 1b
+ cmp r9, #0
+ beq 3f
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #8
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+.irp i, 0, 4
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ cmp r3, #10
+ blt 1f
+.endif
+ mov r8, #8*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+
+2:
+ mov r8, #2*32
+ mov r9, #0
+1:
+ add r6, r0, r9
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp r9, #32
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.16 {d0, d1, d2}, [r12, :64]!
+
+ vqrdmulh.s16 d23, d16, d0[1] // t63a
+ vqrdmulh.s16 d16, d16, d0[0] // t32a
+ vqrdmulh.s16 d22, d17, d0[2] // t62a
+ vqrdmulh.s16 d17, d17, d0[3] // t33a
+ vqrdmulh.s16 d21, d18, d1[1] // t61a
+ vqrdmulh.s16 d18, d18, d1[0] // t34a
+ vqrdmulh.s16 d20, d19, d1[2] // t60a
+ vqrdmulh.s16 d19, d19, d1[3] // t35a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t33
+ vqsub.s16 d26, d19, d18 // t34
+ vqadd.s16 d27, d19, d18 // t35
+ vqadd.s16 d28, d20, d21 // t60
+ vqsub.s16 d29, d20, d21 // t61
+ vqsub.s16 d30, d23, d22 // t62
+ vqadd.s16 d31, d23, d22 // t63
+
+ vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a
+ vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a
+ vneg.s32 q2, q2 // t34a
+ vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a
+ vrshrn.i32 d26, q2, #12 // t34a
+ vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a
+ vrshrn.i32 d29, q3, #12 // t61a
+ vrshrn.i32 d25, q4, #12 // t33a
+ vrshrn.i32 d30, q2, #12 // t62a
+
+ vqadd.s16 d16, d24, d27 // t32a
+ vqsub.s16 d19, d24, d27 // t35a
+ vqadd.s16 d17, d25, d26 // t33
+ vqsub.s16 d18, d25, d26 // t34
+ vqsub.s16 d20, d31, d28 // t60a
+ vqadd.s16 d23, d31, d28 // t63a
+ vqsub.s16 d21, d30, d29 // t61
+ vqadd.s16 d22, d30, d29 // t62
+
+ vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a
+ vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a
+ vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60
+ vrshrn.i32 d21, q2, #12 // t61a
+ vrshrn.i32 d18, q3, #12 // t34a
+ vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35
+ vrshrn.i32 d20, q4, #12 // t60
+ vrshrn.i32 d19, q2, #12 // t35
+
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
+function inv_dct64_step2_neon
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #2*4*0] // t32a
+ vldr d17, [r9, #2*4*8] // t39a
+ vldr d18, [r9, #2*4*0] // t63a
+ vldr d19, [r6, #2*4*8] // t56a
+ vldr d20, [r6, #2*4*16] // t40a
+ vldr d21, [r9, #2*4*24] // t47a
+ vldr d22, [r9, #2*4*16] // t55a
+ vldr d23, [r6, #2*4*24] // t48a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t39
+ vqadd.s16 d26, d18, d19 // t63
+ vqsub.s16 d27, d18, d19 // t56
+ vqsub.s16 d28, d21, d20 // t40
+ vqadd.s16 d29, d21, d20 // t47
+ vqadd.s16 d30, d23, d22 // t48
+ vqsub.s16 d31, d23, d22 // t55
+
+ vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a
+ vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a
+ vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a
+ vrshrn.i32 d25, q2, #12 // t56a
+ vrshrn.i32 d27, q3, #12 // t39a
+ vneg.s32 q4, q4 // t40a
+ vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a
+ vrshrn.i32 d31, q4, #12 // t40a
+ vrshrn.i32 d28, q2, #12 // t55a
+
+ vqadd.s16 d16, d24, d29 // t32a
+ vqsub.s16 d19, d24, d29 // t47a
+ vqadd.s16 d17, d27, d31 // t39
+ vqsub.s16 d18, d27, d31 // t40
+ vqsub.s16 d20, d26, d30 // t48a
+ vqadd.s16 d23, d26, d30 // t63a
+ vqsub.s16 d21, d25, d28 // t55
+ vqadd.s16 d22, d25, d28 // t56
+
+ vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a
+ vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a
+ vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47
+ vrshrn.i32 d18, q2, #12 // t40a
+ vrshrn.i32 d21, q3, #12 // t55a
+ vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48
+ vrshrn.i32 d19, q4, #12 // t47
+ vrshrn.i32 d20, q2, #12 // t48
+
+ vstr d16, [r6, #2*4*0] // t32a
+ vstr d17, [r9, #2*4*0] // t39
+ vstr d18, [r6, #2*4*8] // t40a
+ vstr d19, [r9, #2*4*8] // t47
+ vstr d20, [r6, #2*4*16] // t48
+ vstr d21, [r9, #2*4*16] // t55a
+ vstr d22, [r6, #2*4*24] // t56
+ vstr d23, [r9, #2*4*24] // t63a
+
+ add r6, r6, #2*4
+ sub r9, r9, #2*4
+ cmp r6, r9
+ blt 1b
+ bx lr
+endfunc
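+
+// The 64-point DCT is assembled from smaller pieces: the even half reuses the
+// 32-point machinery (inv_dct_4h_x16_neon plus inv_dct32_odd_4h_x16_neon),
+// while inv_dct64_step1_neon builds the t32..t63 terms four inputs at a time
+// from idct64_coeffs and inv_dct64_step2_neon runs the remaining cross
+// butterflies. Only the first 32 input coefficients are ever read, since AV1
+// zeroes everything beyond 32 in a 64-point transform.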
+
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.16 {\i}, [\src, :64]
+ vst1.16 {\zero}, [\src, :64], \strd
+.else
+ vld1.16 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+ vst1.16 {q8, q9}, [\dst, :128]!
+ vst1.16 {q10, q11}, [\dst, :128]!
+ vst1.16 {q12, q13}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+ vmov.i16 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ movw \gpr, \val
+ vdup.16 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+ vst1.16 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1
+ mov r6, sp
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ add r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_4h_x16_neon
+
+ store16 r6
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1
+ sub r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_4h_x16_neon
+
+ add r10, r6, #8*15
+ sub r6, r6, #8*16
+
+ mov r9, #-8
+
+.macro store_addsub r0, r1, r2, r3
+ vld1.16 {d2}, [r6, :64]!
+ vld1.16 {d3}, [r6, :64]!
+ vqadd.s16 d6, d2, \r0
+ vqsub.s16 \r0, d2, \r0
+ vld1.16 {d4}, [r6, :64]!
+ vqadd.s16 d7, d3, \r1
+ vqsub.s16 \r1, d3, \r1
+ vld1.16 {d5}, [r6, :64]!
+ vqadd.s16 d2, d4, \r2
+ sub r6, r6, #8*4
+ vqsub.s16 \r2, d4, \r2
+ vst1.16 {d6}, [r6, :64]!
+ vst1.16 {\r0}, [r10, :64], r9
+ vqadd.s16 d3, d5, \r3
+ vqsub.s16 \r3, d5, \r3
+ vst1.16 {d7}, [r6, :64]!
+ vst1.16 {\r1}, [r10, :64], r9
+ vst1.16 {d2}, [r6, :64]!
+ vst1.16 {\r2}, [r10, :64], r9
+ vst1.16 {d3}, [r6, :64]!
+ vst1.16 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16
+
+ movrel_local r12, idct64_coeffs
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.16 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.16 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.16 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.16 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.16 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.16 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.16 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.16 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.16 d16, [r10, :64] // in5 (offset 2)
+ vld1.16 d17, [r11, :64] // in27 (offset 13)
+ vld1.16 d18, [r9, :64] // in21 (offset 10)
+ vld1.16 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.16 d16, [r10, :64] // in3 (offset 1)
+ vld1.16 d17, [r11, :64] // in29 (offset 14)
+ vld1.16 d18, [r9, :64] // in19 (offset 9)
+ vld1.16 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32
+ add r9, r6, #2*4*7
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
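+
+// Three variants are generated: the plain one (used for the second pass over
+// the intermediate buffer, which needs no clearing), _clear (first pass;
+// zeroes the coefficient buffer as it reads it) and _clear_scale (first pass
+// of the rectangular 64x32, which additionally pre-multiplies by 1/sqrt(2),
+// i.e. 2896*8/32768 in Q15).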
+
+function inv_txfm_horz_dct_64x4_neon
+ vdup.16 q3, r9
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, #2*56
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+
+.macro store_addsub src0, src1, src2, src3
+ vqsub.s16 d3, \src0, \src1
+ vqsub.s16 d2, \src2, \src3
+ vqadd.s16 d0, \src0, \src1
+ vqadd.s16 d1, \src2, \src3
+ vrshl.s16 q1, q1, q3
+ vrshl.s16 q0, q0, q3
+ vrev64.16 q1, q1
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q1}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d20, d27
+ store_addsub d17, d30, d21, d26
+ store_addsub d18, d29, d22, d25
+ store_addsub d19, d28, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #2
+ sub r9, r9, r10, lsl #2
+ add r6, r6, #16
+ sub r9, r9, #16
+
+ cmp r7, r8
+ blt 1b
+ pop {r10-r11,pc}
+endfunc
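+
+// The downshift in inv_txfm_horz_dct_64x4_neon is passed at run time in r9 as
+// a negative left-shift amount (vdup.16 q3, r9 + vrshl.s16), so the same
+// function serves both the -2 shift of the 64x64/64x16 first pass and the -1
+// shift of 64x32.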
+
+function inv_txfm_add_vert_dct_4x64_neon
+ lsl r8, r8, #1
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.32 {d0[0]}, [r6, :32], r1
+ vld1.32 {d1[0]}, [r9, :32], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.32 {d0[1]}, [r6, :32]
+ vqadd.s16 d5, \src2, \src3
+ vld1.32 {d1[1]}, [r9, :32]
+ vqsub.s16 d6, \src0, \src1
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vaddw.u8 q2, q2, d0
+ vaddw.u8 q3, q3, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r6, :32], r1
+ vst1.32 {d1[0]}, [r9, :32], r10
+ vst1.32 {d0[1]}, [r6, :32], r1
+ vst1.32 {d1[1]}, [r9, :32], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_scale_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 32*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+ ldrh r11, [r10], #2
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*16*2+64*4*2
+ add r4, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r4, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 8
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ movrel_local r5, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 16*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+
+ movrel_local r4, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+ ldrh r11, [r10], #2
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S
new file mode 100644
index 00000000000..b06e109ddab
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 4
+#define DIF 8
+#define RNG 12
+#define CNT 16
+#define ALLOW_UPDATE_CDF 20
+
+const coeffs
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits, align=4
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
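+
+// coeffs holds EC_MIN_PROB (4) times a descending count: indexing it at
+// 30 - 2*n_symbols yields the per-lane EC_MIN_PROB * (n_symbols - ret) terms,
+// with the trailing zeros covering symbol counts smaller than the vector
+// width. bits holds one flag bit per lane, so per-lane compare results can be
+// folded into a single bitmask.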
+
+.macro vld1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src, :128]
+.else
+ vld1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src]
+.else
+ vld1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src, :128]
+.else
+ vst1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src]
+.else
+ vst1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshr.u16 \d0, \s0, \s3
+.else
+ vshr.u16 \d1, \s1, \s4
+.if \n == 16
+ vshr.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vadd.i16 \d0, \s0, \s3
+.else
+ vadd.i16 \d1, \s1, \s4
+.if \n == 16
+ vadd.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vsub.i16 \d0, \s0, \s3
+.else
+ vsub.i16 \d1, \s1, \s4
+.if \n == 16
+ vsub.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vand \d0, \s0, \s3
+.else
+ vand \d1, \s1, \s4
+.if \n == 16
+ vand \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vcge.u16 \d0, \s0, \s3
+.else
+ vcge.u16 \d1, \s1, \s4
+.if \n == 16
+ vcge.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vrhadd.u16 \d0, \s0, \s3
+.else
+ vrhadd.u16 \d1, \s1, \s4
+.if \n == 16
+ vrhadd.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshl.s16 \d0, \s0, \s3
+.else
+ vshl.s16 \d1, \s1, \s4
+.if \n == 16
+ vshl.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vqdmulh.s16 \d0, \s0, \s3
+.else
+ vqdmulh.s16 \d1, \s1, \s4
+.if \n == 16
+ vqdmulh.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update n
+ push {r4-r10,lr}
+ sub sp, sp, #48
+ add r8, r0, #RNG
+
+ vld1_align_n d0, q0, q1, r1, \n // cdf
+ vld1.16 {d16[]}, [r8, :16] // rng
+ movrel_local r9, coeffs, 30
+ vmov.i16 d30, #0x7f00 // 0x7f00
+ sub r9, r9, r2, lsl #1
+ vmvn.i16 q14, #0x3f // 0xffc0
+ add r8, sp, #14
+ vand d22, d16, d30 // rng & 0x7f00
+ vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng
+ vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+ vmov d23, d22
+.endif
+
+ vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret)
+ vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r8, r0, #DIF + 2
+
+ vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+ vmov.i16 d17, #0
+.endif
+ vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ add r9, sp, #16
+ vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16)
+ movrel_local r8, bits
+ vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access
+
+ vmov d21, d20
+ vld1_align_n q12, q12, q13, r8, \n
+.if \n == 16
+ vmov q11, q10
+.endif
+
+ vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v
+
+ vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+ vadd.i16 q10, q10, q11
+.endif
+ vadd.i16 d20, d20, d21 // Aggregate mask bits
+ ldr r4, [r0, #ALLOW_UPDATE_CDF]
+ vpadd.i16 d20, d20, d20
+ lsl r10, r2, #1
+ vpadd.i16 d20, d20, d20
+ vmov.u16 r3, d20[0]
+ cmp r4, #0
+ rbit r3, r3
+ clz lr, r3 // ret
+
+ beq L(renorm)
+ // update_cdf
+ ldrh r3, [r1, r10] // count = cdf[n_symbols]
+ vmov.i8 q10, #0xff
+.if \n == 16
+ mov r4, #-5
+.else
+ mvn r12, r2
+ mov r4, #-4
+ cmn r12, #3 // set C if n_symbols <= 2
+.endif
+ vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub r4, r4, r3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr r12, r3, #4 // count >> 4
+ sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+ vdup.16 d20, r4 // -rate
+.else
+ vdup.16 q10, r4 // -rate
+.endif
+
+ sub r3, r3, r3, lsr #5 // count - (count == 32)
+ vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 1 : 0)
+ vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+ add r3, r3, #1 // count + (count < 32)
+ vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate
+ vst1_align_n d0, q0, q1, r1, \n
+ strh r3, [r1, r10]
+.endm
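+
+// The update_cdf tail of the macro above is, lane for lane, the usual msac
+// adaptation step; roughly (scalar sketch, derived from the comments above
+// rather than copied from the C source; the \n == 16 path hard-codes the
+// (n_symbols > 2) term into its constant):
+//
+//   unsigned count = cdf[n_symbols];
+//   int rate = 4 + (count >> 4) + (n_symbols > 2);
+//   for (unsigned i = 0; i < n_symbols; i++)  // arithmetic shift below
+//       cdf[i] += (i >= ret) + (((i >= ret ? -1 : 32768) - cdf[i]) >> rate);
+//   cdf[n_symbols] = count + (count < 32);
+//
+// i.e. entries below the decoded symbol index move towards 32768 and the
+// rest decay towards 0, with one 16-bit lane per cdf entry.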
+
+ decode_update 4
+
+L(renorm):
+ add r8, sp, #16
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+L(renorm2):
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ str r6, [r0, #CNT]
+ str r7, [r0, #DIF]
+
+ mov r0, lr
+ add sp, sp, #48
+
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update 8
+ b L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update 16
+ b L(renorm)
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+ push {r4-r10,lr}
+ vld1.16 {d0}, [r1, :64] // cdf
+ add r4, r0, #RNG
+ vmov.i16 d31, #0x7f00 // 0x7f00
+ movrel_local r5, coeffs, 30-2*3
+ vmvn.i16 d30, #0x3f // 0xffc0
+ ldrh r9, [r1, #6] // count = cdf[n_symbols]
+ vld1.16 {d1[]}, [r4, :16] // rng
+ movrel_local r4, bits
+ vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret)
+ add r5, r0, #DIF + 2
+ vld1.16 {q8}, [r4, :128]
+ mov r2, #-24
+ vand d20, d0, d30 // cdf & 0xffc0
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ vmov d3, d2
+1:
+ vand d23, d1, d31 // rng & 0x7f00
+ vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r12, sp, #14
+ vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ vmov.i16 d7, #0
+ vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng
+ add r12, sp, #16
+ vcge.u16 q2, q1, q3 // c >= v
+ vst1.16 {q3}, [r12] // store v values to allow indexed access
+ vand q9, q2, q8 // One bit per halfword set in the mask
+
+ vadd.i16 d18, d18, d19 // Aggregate mask bits
+ vpadd.i16 d18, d18, d18
+ vpadd.i16 d18, d18, d18
+ vmov.u16 r3, d18[0]
+ cmp r10, #0
+ add r2, r2, #5
+ rbit r3, r3
+ add r8, sp, #16
+ clz lr, r3 // ret
+
+ beq 2f
+ // update_cdf
+ vmov.i8 d22, #0xff
+ mov r4, #-5
+ vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768
+ sub r4, r4, r9, lsr #4 // -((count >> 4) + 5)
+ vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i])
+ vdup.16 d18, r4 // -rate
+
+ sub r9, r9, r9, lsr #5 // count - (count == 32)
+ vsub.i16 d0, d0, d4 // cdf + (i >= val ? 1 : 0)
+ vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate
+ add r9, r9, #1 // count + (count < 32)
+ vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate
+ vst1.16 {d0}, [r1, :64]
+ vand d20, d0, d30 // cdf & 0xffc0
+ strh r9, [r1, #6]
+
+2:
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ vdup.16 d1, r4
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ lsl lr, lr, #1
+ sub lr, lr, #5
+ lsr r12, r7, #16
+ adds r2, r2, lr // carry = tok_br < 3 || tok == 15
+ vdup.16 q1, r12
+ bcc 1b // loop if !carry
+ add r2, r2, #30
+ str r6, [r0, #CNT]
+ add sp, sp, #48
+ str r7, [r0, #DIF]
+ lsr r0, r2, #1
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ bic r4, r5, #0xff // r &= 0xff00
+ add r4, r4, #8
+ mov r2, #0
+ subs r8, r7, r4, lsl #15 // dif - vw
+ lsr r4, r4, #1 // v
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ bic r1, r1, #0x3f // f &= ~63
+ mul r4, r4, r1
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+ push {r4-r10,lr}
+ ldr r9, [r1] // cdf[0-1]
+ ldr r5, [r0, #RNG]
+ movw lr, #0xffc0
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ and r2, r9, lr // f &= ~63
+ mul r4, r4, r2
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ cmp r10, #0
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+
+ beq L(renorm2)
+
+ lsr r2, r9, #16 // count = cdf[1]
+ uxth r9, r9 // cdf[0]
+
+ sub r3, r2, r2, lsr #5 // count - (count >= 32)
+ lsr r2, r2, #4 // count >> 4
+ add r10, r3, #1 // count + (count < 32)
+ add r2, r2, #4 // rate = (count >> 4) | 4
+
+ sub r9, r9, lr // cdf[0] -= bit
+ sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub r9, r9, r3 // cdf[0]
+
+ strh r9, [r1]
+ strh r10, [r1, #2]
+
+ b L(renorm2)
+endfunc
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S
index ea4afc38d6b..6af0158e09b 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S
@@ -84,6 +84,23 @@
vtrn.8 \r6, \r7
.endm
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+ vswp \d0, \d4
+ vswp \d1, \d5
+ vswp \d2, \d6
+ vswp \d3, \d7
+
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
.macro transpose_4x8b q0, q1, r0, r1, r2, r3
vtrn.16 \q0, \q1
@@ -91,4 +108,19 @@
vtrn.8 \r2, \r3
.endm
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+ vtrn.32 \q0, \q1
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
#endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
index b6c0c14aab8..245af0e786e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
@@ -58,7 +58,6 @@
// indicates only a quarter of input values are set, for idct16 and up,
// a significant amount of calculation can be skipped, at the cost of more
// code duplication and special casing.
-// - Special case functions for e.g. more combinations with identity.
const idct_coeffs, align=4
// idct4
@@ -106,7 +105,7 @@ const iadst8_coeffs, align=4
.short 4076, 401, 3612, 1931
.short 2598, 3166, 1189, 3920
// idct_coeffs
- .short 2896, 2896*8, 1567, 3784, 0, 0, 0, 0
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
endconst
const iadst16_coeffs, align=4
@@ -134,13 +133,6 @@ endconst
.endif
.endm
-.macro smull_sz d0, d1, s0, c, sz
- smull \d0\().4s, \s0\().4h, \c
-.ifc \sz, .8h
- smull2 \d1\().4s, \s0\().8h, \c
-.endif
-.endm
-
.macro rshrn_sz d0, s0, s1, shift, sz
rshrn \d0\().4h, \s0\().4s, \shift
.ifc \sz, .8h
@@ -457,14 +449,14 @@ endfunc
sqsub \r2\sz, v3\sz, v7\sz
.endm
-function inv_dct_4x4_neon
+function inv_dct_4h_x4_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.4h}, [x16]
idct_4 v16, v17, v18, v19, .4h
ret
endfunc
-function inv_dct_8x4_neon
+function inv_dct_8h_x4_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.4h}, [x16]
idct_4 v16, v17, v18, v19, .8h
@@ -497,12 +489,12 @@ endfunc
rshrn \o3\().4h, \o3\().4s, #12
.endm
-function inv_adst_4x4_neon
+function inv_adst_4h_x4_neon, export=1
iadst_4x4 v16, v17, v18, v19
ret
endfunc
-function inv_flipadst_4x4_neon
+function inv_flipadst_4h_x4_neon, export=1
iadst_4x4 v19, v18, v17, v16
ret
endfunc
@@ -563,17 +555,17 @@ endfunc
rshrn2 \o3\().8h, v5.4s, #12
.endm
-function inv_adst_8x4_neon
+function inv_adst_8h_x4_neon, export=1
iadst_8x4 v16, v17, v18, v19
ret
endfunc
-function inv_flipadst_8x4_neon
+function inv_flipadst_8h_x4_neon, export=1
iadst_8x4 v19, v18, v17, v16
ret
endfunc
-function inv_identity_4x4_neon
+function inv_identity_4h_x4_neon, export=1
mov w16, #(5793-4096)*8
dup v0.4h, w16
sqrdmulh v4.4h, v16.4h, v0.h[0]
@@ -587,7 +579,7 @@ function inv_identity_4x4_neon
ret
endfunc
-function inv_identity_8x4_neon
+function inv_identity_8h_x4_neon, export=1
mov w16, #(5793-4096)*8
dup v0.4h, w16
sqrdmulh v4.8h, v16.8h, v0.h[0]
@@ -608,7 +600,7 @@ endfunc
.endr
.endm
-function inv_txfm_add_wht_wht_4x4_neon, export=1
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
mov x15, x30
movi v31.8h, #0
ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
@@ -672,7 +664,7 @@ L(itx_4x4_end):
endfunc
.macro def_fn_4x4 txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
mov x15, x30
.ifc \txfm1\()_\txfm2, dct_dct
@@ -692,8 +684,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1
b L(itx_4x4_end)
1:
.endif
- adr x4, inv_\txfm1\()_4x4_neon
- adr x5, inv_\txfm2\()_4x4_neon
+ adr x4, inv_\txfm1\()_4h_x4_neon
+ adr x5, inv_\txfm2\()_4h_x4_neon
b inv_txfm_add_4x4_neon
endfunc
.endm
@@ -749,14 +741,14 @@ def_fn_4x4 identity, flipadst
mov \r6\szb, v6\szb // out6
.endm
-function inv_dct_8x8_neon
+function inv_dct_8h_x8_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h}, [x16]
idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
ret
endfunc
-function inv_dct_4x8_neon
+function inv_dct_4h_x8_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h}, [x16]
idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
@@ -830,27 +822,27 @@ endfunc
sqneg \o5\()\sz, v3\sz // out5
.endm
-function inv_adst_8x8_neon
+function inv_adst_8h_x8_neon, export=1
iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h
ret
endfunc
-function inv_flipadst_8x8_neon
+function inv_flipadst_8h_x8_neon, export=1
iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h
ret
endfunc
-function inv_adst_4x8_neon
+function inv_adst_4h_x8_neon, export=1
iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h
ret
endfunc
-function inv_flipadst_4x8_neon
+function inv_flipadst_4h_x8_neon, export=1
iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h
ret
endfunc
-function inv_identity_8x8_neon
+function inv_identity_8h_x8_neon, export=1
sqshl v16.8h, v16.8h, #1
sqshl v17.8h, v17.8h, #1
sqshl v18.8h, v18.8h, #1
@@ -862,7 +854,7 @@ function inv_identity_8x8_neon
ret
endfunc
-function inv_identity_4x8_neon
+function inv_identity_4h_x8_neon, export=1
sqshl v16.4h, v16.4h, #1
sqshl v17.4h, v17.4h, #1
sqshl v18.4h, v18.4h, #1
@@ -913,17 +905,17 @@ def_fn_8x8_base
def_fn_8x8_base identity_
.macro def_fn_8x8 txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
mov x15, x30
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 8, 8, 1
.endif
- adr x5, inv_\txfm2\()_8x8_neon
+ adr x5, inv_\txfm2\()_8h_x8_neon
.ifc \txfm1, identity
b inv_txfm_identity_add_8x8_neon
.else
- adr x4, inv_\txfm1\()_8x8_neon
+ adr x4, inv_\txfm1\()_8h_x8_neon
b inv_txfm_add_8x8_neon
.endif
endfunc
@@ -1000,14 +992,14 @@ function inv_txfm_add_4x8_neon
endfunc
.macro def_fn_48 w, h, txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
mov x15, x30
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 0
.endif
- adr x4, inv_\txfm1\()_\h\()x\w\()_neon
- adr x5, inv_\txfm2\()_\w\()x\h\()_neon
+ adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon
b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm
@@ -1118,14 +1110,14 @@ def_fns_48 8, 4
mov v22\szb, v3\szb
.endm
-function inv_dct_8x16_neon
+function inv_dct_8h_x16_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h, v1.8h}, [x16]
idct_16 .8h, .16b
ret
endfunc
-function inv_dct_4x16_neon
+function inv_dct_4h_x16_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h, v1.8h}, [x16]
idct_16 .4h, .8b
@@ -1302,27 +1294,27 @@ endfunc
sqneg \o9\sz, v7\sz // out9
.endm
-function inv_adst_8x16_neon
+function inv_adst_8h_x16_neon, export=1
iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
ret
endfunc
-function inv_flipadst_8x16_neon
+function inv_flipadst_8h_x16_neon, export=1
iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
ret
endfunc
-function inv_adst_4x16_neon
+function inv_adst_4h_x16_neon, export=1
iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
ret
endfunc
-function inv_flipadst_4x16_neon
+function inv_flipadst_4h_x16_neon, export=1
iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
ret
endfunc
-function inv_identity_8x16_neon
+function inv_identity_8h_x16_neon, export=1
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1333,7 +1325,7 @@ function inv_identity_8x16_neon
ret
endfunc
-function inv_identity_4x16_neon
+function inv_identity_4h_x16_neon, export=1
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1376,71 +1368,49 @@ endfunc
.endr
.endm
-function inv_txfm_horz_16x8_neon
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x8_neon
mov x14, x30
movi v7.8h, #0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x7]
- st1 {v7.8h}, [x7], x8
-.endr
- blr x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- srshr v\i\().8h, v\i\().8h, #2
-.endr
- transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
- transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
-
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
- st1 {v\i\().8h}, [x6], #16
-.endr
-
- br x14
-endfunc
-
-function inv_txfm_horz_identity_16x8_neon
- mov x14, x30
- movi v7.8h, #0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x7]
- st1 {v7.8h}, [x7], x8
-.endr
+.if \identity
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
- identity_8x16_shift2 v0.h[0]
- transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
- transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
-
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
- st1 {v\i\().8h}, [x6], #16
-.endr
-
- br x14
-endfunc
-
-function inv_txfm_horz_scale_16x8_neon
- mov x14, x30
- movi v7.8h, #0
+.elseif \scale
mov w16, #2896*8
dup v0.4h, w16
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x7]
+.endif
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
st1 {v7.8h}, [x7], x8
.endr
+.if \scale
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+ identity_8x16_shift2 v0.h[0]
+.else
blr x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- srshr v\i\().8h, v\i\().8h, #1
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ srshr \i, \i, #\shift
.endr
+.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
- st1 {v\i\().8h}, [x6], #16
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+ st1 {\i}, [x6], #16
.endr
br x14
endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
function inv_txfm_add_vert_8x16_neon
mov x14, x30
@@ -1487,7 +1457,7 @@ function inv_txfm_add_16x16_neon
endfunc
.macro def_fn_16x16 txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 16, 16, 2
.endif
@@ -1495,9 +1465,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
adr x9, inv_txfm_horz_identity_16x8_neon
.else
adr x9, inv_txfm_horz_16x8_neon
- adr x4, inv_\txfm1\()_8x16_neon
+ adr x4, inv_\txfm1\()_8h_x16_neon
.endif
- adr x5, inv_\txfm2\()_8x16_neon
+ adr x5, inv_\txfm2\()_8h_x16_neon
mov x13, #\eob_half
b inv_txfm_add_16x16_neon
endfunc
@@ -1659,17 +1629,17 @@ def_fn_416_base
def_fn_416_base identity_
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
.if \w == 4
- adr x4, inv_\txfm1\()_8x\w\()_neon
- adr x5, inv_\txfm2\()_4x\h\()_neon
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_4h_x\h\()_neon
mov w13, #\eob_half
.else
- adr x4, inv_\txfm1\()_4x\w\()_neon
- adr x5, inv_\txfm2\()_8x\h\()_neon
+ adr x4, inv_\txfm1\()_4h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
@@ -1842,12 +1812,12 @@ def_fn_816_base
def_fn_816_base identity_
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
- adr x4, inv_\txfm1\()_8x\w\()_neon
- adr x5, inv_\txfm2\()_8x\h\()_neon
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
mov x13, #\eob_half
.endif
@@ -1881,7 +1851,7 @@ def_fn_816 \w, \h, identity, flipadst, 64
def_fns_816 8, 16
def_fns_816 16, 8
-function inv_dct32_odd_8x16_neon
+function inv_dct32_odd_8h_x16_neon, export=1
movrel x16, idct_coeffs, 2*16
ld1 {v0.8h, v1.8h}, [x16]
sub x16, x16, #2*16
@@ -2059,7 +2029,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
@@ -2089,15 +2059,13 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
.macro store2 r0, r1, shift
- ld1 {v4.8h}, [x6], #16
- ld1 {v5.8h}, [x6]
+ ld1 {v4.8h, v5.8h}, [x6]
sqsub v7.8h, v4.8h, \r0
sqsub v6.8h, v5.8h, \r1
- sub x6, x6, #16
sqadd v4.8h, v4.8h, \r0
sqadd v5.8h, v5.8h, \r1
rev64 v6.8h, v6.8h
@@ -2106,12 +2074,10 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
srshr v5.8h, v5.8h, #\shift
srshr v6.8h, v6.8h, #\shift
srshr v7.8h, v7.8h, #\shift
- st1 {v4.8h}, [x6], #16
ext v6.16b, v6.16b, v6.16b, #8
- st1 {v5.8h}, [x6], #16
+ st1 {v4.8h, v5.8h}, [x6], #32
ext v7.16b, v7.16b, v7.16b, #8
- st1 {v6.8h}, [x6], #16
- st1 {v7.8h}, [x6], #16
+ st1 {v6.8h, v7.8h}, [x6], #32
.endm
store2 v31.8h, v23.8h, \shift
@@ -2139,7 +2105,7 @@ function inv_txfm_add_vert_dct_8x32_neon
.endr
sub x7, x7, x8, lsl #4
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
st1 {v\i\().8h}, [x7], x8
@@ -2152,7 +2118,7 @@ function inv_txfm_add_vert_dct_8x32_neon
.endr
sub x7, x7, x8, lsl #4
sub x7, x7, x8, lsr #1
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
neg x9, x8
mov x10, x6
@@ -2216,7 +2182,7 @@ const eob_8x32
.short 43, 107, 171, 256
endconst
-function inv_txfm_add_identity_identity_32x32_neon, export=1
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
movi v0.8h, #0
movrel x13, eob_32x32
@@ -2259,7 +2225,7 @@ endfunc
.endm
.macro def_identity_1632 w, h, wshort, hshort
-function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
mov w16, #2896*8
mov w17, #2*(5793-4096)*8
dup v1.4h, w16
@@ -2285,7 +2251,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
.else
// 32x16
shift_8_regs sqshl, 1
- identity_8x8 v1.h[1]
+ identity_8x8 v1.h[1]
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
@@ -2319,12 +2285,13 @@ def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside
.macro def_identity_832 w, h
-function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
movi v0.8h, #0
movrel x13, eob_8x32
mov w8, #2*\h
1:
+ ldrh w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v0.8h}, [x2], x8
@@ -2337,14 +2304,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ cmp w3, w12
.if \w == 8
load_add_store_8x8 x0, x7, shiftbits=2
.else
load_add_store_8x8 x0, x7, shiftbits=3
.endif
- ldrh w12, [x13], #2
- cmp w3, w12
b.lt 9f
.if \w == 8
sub x2, x2, x8, lsl #3
@@ -2363,7 +2329,7 @@ endfunc
def_identity_832 8, 32
def_identity_832 32, 8
-function inv_txfm_add_dct_dct_32x32_neon, export=1
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
idct_dc 32, 32, 2
mov x15, x30
@@ -2411,14 +2377,14 @@ function inv_txfm_add_dct_dct_32x32_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_16x32_neon, export=1
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
idct_dc 16, 32, 1
mov x15, x30
sub sp, sp, #1024
movrel x13, eob_16x32
ldrh w12, [x13], #2
- adr x4, inv_dct_8x16_neon
+ adr x4, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24
add x6, sp, #(\i*16*2)
@@ -2460,13 +2426,13 @@ function inv_txfm_add_dct_dct_16x32_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_32x16_neon, export=1
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
idct_dc 32, 16, 1
mov x15, x30
sub sp, sp, #1024
- adr x5, inv_dct_8x16_neon
+ adr x5, inv_dct_8h_x16_neon
.irp i, 0, 8
add x6, sp, #(\i*32*2)
@@ -2505,7 +2471,7 @@ function inv_txfm_add_dct_dct_32x16_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_8x32_neon, export=1
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
idct_dc 8, 32, 2
mov x15, x30
@@ -2517,18 +2483,17 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
mov x8, #2*32
mov w9, #32
mov x6, sp
- mov x7, x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x7]
- st1 {v28.8h}, [x7], x8
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
.endr
ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
sub w9, w9, #8
- sub x7, x7, x8, lsl #3
- add x7, x7, #2*8
+ add x2, x2, #2*8
- bl inv_dct_8x8_neon
+ bl inv_dct_8h_x8_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #2
@@ -2536,10 +2501,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
cmp w3, w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- st1 {v\i\().8h}, [x6], #16
-.endr
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
b.ge 1b
cbz w9, 3f
@@ -2564,7 +2528,7 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_32x8_neon, export=1
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
idct_dc 32, 8, 2
mov x15, x30
@@ -2586,7 +2550,7 @@ function inv_txfm_add_dct_dct_32x8_neon, export=1
.endr
add w9, w9, #8
- bl inv_dct_8x8_neon
+ bl inv_dct_8h_x8_neon
cmp w9, #32
@@ -2791,7 +2755,7 @@ endfunc
.endm
.macro def_dct64_func suffix, clear=0, scale=0
-function inv_txfm_dct\suffix\()_8x64_neon
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
mov x14, x30
mov x6, sp
lsl x8, x8, #2
@@ -2804,7 +2768,7 @@ function inv_txfm_dct\suffix\()_8x64_neon
add x7, x7, x8, lsr #1
scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
store16 x6
@@ -2817,7 +2781,7 @@ function inv_txfm_dct\suffix\()_8x64_neon
sub x7, x7, x8, lsr #1
scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
add x10, x6, #16*15
sub x6, x6, #16*16
@@ -3040,7 +3004,11 @@ endfunc
.macro sub_sp space
#ifdef _WIN32
-.if \space > 4096
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp doesn't support values over 8K at the moment"
+.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
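+ // For instance (hypothetical value), sub_sp 6000 would expand on _WIN32 to
+ // the branch above, so the single probe stays within one page of the new sp:
+ //   sub x16, sp, #4096
+ //   ldr xzr, [x16]        // touch the intermediate guard page
+ //   sub sp, x16, #1904    // 6000 - 4096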
@@ -3050,16 +3018,14 @@ endfunc
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
+.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
-.else
- sub sp, sp, #\space
-.endif
#endif
.endm
-function inv_txfm_add_dct_dct_64x64_neon, export=1
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
idct_dc 64, 64, 2
mov x15, x30
@@ -3079,7 +3045,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
add x7, x2, #(\i*2)
mov x8, #32*2
mov x12, #-2 // shift
- bl inv_txfm_dct_clear_8x64_neon
+ bl inv_txfm_dct_clear_8h_x64_neon
add x6, x5, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
.if \i < 24
@@ -3104,7 +3070,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
add x7, x5, #(\i*2)
mov x8, #64*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
@@ -3113,7 +3079,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_64x32_neon, export=1
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
idct_dc 64, 32, 1
mov x15, x30
@@ -3133,7 +3099,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1
add x7, x2, #(\i*2)
mov x8, #32*2
mov x12, #-1 // shift
- bl inv_txfm_dct_clear_scale_8x64_neon
+ bl inv_txfm_dct_clear_scale_8h_x64_neon
add x6, x5, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
.if \i < 24
@@ -3166,7 +3132,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_32x64_neon, export=1
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
idct_dc 32, 64, 1
mov x15, x30
@@ -3207,7 +3173,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1
.irp i, 0, 8, 16, 24
add x7, x5, #(\i*2)
mov x8, #32*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
@@ -3216,7 +3182,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_64x16_neon, export=1
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
idct_dc 64, 16, 2
mov x15, x30
@@ -3232,14 +3198,16 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
mov w8, #(16 - \i)
cmp w3, w12
b.lt 1f
- ldrh w12, [x13], #2
.endif
add x7, x2, #(\i*2)
mov x8, #16*2
mov x12, #-2 // shift
- bl inv_txfm_dct_clear_8x64_neon
+ bl inv_txfm_dct_clear_8h_x64_neon
add x6, x4, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+ ldrh w12, [x13], #2
+.endif
.endr
b 3f
@@ -3256,7 +3224,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
b.gt 2b
3:
- adr x5, inv_dct_8x16_neon
+ adr x5, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
add x6, x0, #(\i)
add x7, x4, #(\i*2)
@@ -3268,7 +3236,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
br x15
endfunc
-function inv_txfm_add_dct_dct_16x64_neon, export=1
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
idct_dc 16, 64, 2
mov x15, x30
@@ -3279,7 +3247,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1
movrel x13, eob_16x32
ldrh w12, [x13], #2
- adr x4, inv_dct_8x16_neon
+ adr x4, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24
add x6, x5, #(\i*16*2)
.if \i > 0
@@ -3310,7 +3278,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1
.irp i, 0, 8
add x7, x5, #(\i*2)
mov x8, #16*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
new file mode 100644
index 00000000000..266f57e36ee
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
@@ -0,0 +1,3526 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+// int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
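+//
+// As a usage illustration (hedged: the dav1d_ prefix comes from the
+// function/export macros in asm.S, and 1023 assumes the 10 bpc case that the
+// hard-coded 0x3ff clamps below target), a C caller would invoke one of the
+// entry points defined in this file roughly as
+//
+//   dav1d_inv_txfm_add_dct_dct_8x8_16bpc_neon(dst /* uint16_t * */,
+//                                             dst_stride /* in bytes */,
+//                                             coeff /* int32_t * */,
+//                                             eob, 1023);
+//
+// adding the clipped inverse-transform residual onto dst and zeroing the
+// coefficients it consumed.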
+
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+.macro mul_mla d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mla \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mls \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ sqadd \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ smax \max, \max, v6.8h
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
+ load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
+ load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+ load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+ load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+ load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+ load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+ load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+ load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+ load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+ load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+ load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+ load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
+ load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
+ load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
+ load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
+ load_add_store , , , , , v31.8h, v30.8h, \dst, \src
+ load_add_store , , , , , , v31.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
+ load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
+ load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+ load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
+ load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
+ load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
+ load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0]
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1
+.endif
+.ifnb \addsrc
+ sqadd \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1
+.endif
+.ifnb \max
+ smax \max, \max, v6.8h
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
+ load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
+ load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
+ load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+ load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
+ load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
+ load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
+ load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
+ load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
+ load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
+ load_add_store4 , , , , , , , , v30.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
+ load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
+ load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
+ load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+ load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
+ load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
+ load_add_store4 , , , , , , , , v22.d, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v0.2s, w16
+ sqrdmulh v20.4s, v16.4s, v0.s[0]
+ str wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v20.4s, v20.4s, v0.s[0]
+.endif
+.if \shift > 0
+ sqrshrn v16.4h, v20.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+.else
+ sqxtn v16.4h, v20.4s
+ sqxtn2 v16.8h, v20.4s
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[1]
+ srshr v16.8h, v16.8h, #4
+ mov w4, #\h
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+ movi v30.8h, #0
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.d}[1], [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ sqadd v1.8h, v1.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon
+ movi v30.8h, #0
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h}, [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.8h}, [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ ld1 {v2.8h}, [x0], x1
+ sqadd v1.8h, v1.8h, v16.8h
+ ld1 {v3.8h}, [x0], x1
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.8h}, [x0], x1
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon
+ movi v30.8h, #0
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h}, [x0], x1
+ subs w4, w4, #2
+ ld1 {v2.8h, v3.8h}, [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v16.8h
+ sub x0, x0, x1, lsl #1
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon
+ movi v30.8h, #0
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w4, w4, #1
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v16.8h
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon
+ movi v30.8h, #0
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x1, x1, #64
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ subs w4, w4, #1
+ sqadd v0.8h, v0.8h, v16.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
+ sqadd v1.8h, v1.8h, v16.8h
+ sub x0, x0, #64
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v16.8h
+ sqadd v6.8h, v6.8h, v16.8h
+ sqadd v7.8h, v7.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smax v6.8h, v6.8h, v30.8h
+ smax v7.8h, v7.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ smin v6.8h, v6.8h, v31.8h
+ smin v7.8h, v7.8h, v31.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+.macro iwht4
+ add v16.4s, v16.4s, v17.4s
+ sub v21.4s, v18.4s, v19.4s
+ sub v20.4s, v16.4s, v21.4s
+ sshr v20.4s, v20.4s, #1
+ sub v18.4s, v20.4s, v17.4s
+ sub v17.4s, v20.4s, v19.4s
+ add v19.4s, v21.4s, v18.4s
+ sub v16.4s, v16.4s, v17.4s
+.endm
+
+.macro idct_4 r0, r1, r2, r3
+ mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
+ mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
+ mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
+ mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
+ srshr v6.4s, v6.4s, #12
+ srshr v7.4s, v4.4s, #12
+ srshr v2.4s, v2.4s, #12
+ srshr v3.4s, v3.4s, #12
+ sqadd \r0\().4s, v2.4s, v6.4s
+ sqsub \r3\().4s, v2.4s, v6.4s
+ sqadd \r1\().4s, v3.4s, v7.4s
+ sqsub \r2\().4s, v3.4s, v7.4s
+.endm
+
+function inv_dct_4s_x4_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+ idct_4 v16, v17, v18, v19
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.4s}, [x16]
+
+ sub v3.4s, v16.4s, v18.4s
+ mul v4.4s, v16.4s, v0.s[0]
+ mla v4.4s, v18.4s, v0.s[1]
+ mla v4.4s, v19.4s, v0.s[2]
+ mul v7.4s, v17.4s, v0.s[3]
+ add v3.4s, v3.4s, v19.4s
+ mul v5.4s, v16.4s, v0.s[2]
+ mls v5.4s, v18.4s, v0.s[0]
+ mls v5.4s, v19.4s, v0.s[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[3]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ srshr \o0\().4s, \o0\().4s, #12
+ srshr \o2\().4s, \o2\().4s, #12
+ srshr \o1\().4s, \o1\().4s, #12
+ srshr \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4s_x4_neon
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x4_neon
+ movz w16, #(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+ sqrdmulh v4.4s, v16.4s, v0.s[0]
+ sqrdmulh v5.4s, v17.4s, v0.s[0]
+ sqrdmulh v6.4s, v18.4s, v0.s[0]
+ sqrdmulh v7.4s, v19.4s, v0.s[0]
+ sqadd v16.4s, v16.4s, v4.4s
+ sqadd v17.4s, v17.4s, v5.4s
+ sqadd v18.4s, v18.4s, v6.4s
+ sqadd v19.4s, v19.4s, v7.4s
+ ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ mov x15, x30
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ sshr v16.4s, v16.4s, #2
+ sshr v17.4s, v17.4s, #2
+ sshr v18.4s, v18.4s, #2
+ sshr v19.4s, v19.4s, #2
+
+ iwht4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v16.4h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqxtn2 v16.8h, v17.4s
+ ld1 {v1.d}[0], [x0], x1
+ sqxtn v18.4h, v18.4s
+ ld1 {v1.d}[1], [x0], x1
+ sqxtn2 v18.8h, v19.4s
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ blr x4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5
+
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x0, x0, x1, lsl #2
+ sqadd v16.8h, v16.8h, v0.8h
+ sqadd v18.8h, v18.8h, v1.8h
+ smax v16.8h, v16.8h, v30.8h
+ smax v18.8h, v18.8h, v30.8h
+ smin v16.8h, v16.8h, v31.8h
+ st1 {v16.d}[0], [x0], x1
+ smin v18.8h, v18.8h, v31.8h
+ st1 {v16.d}[1], [x0], x1
+ st1 {v18.d}[0], [x0], x1
+ st1 {v18.d}[1], [x0], x1
+
+ br x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v4.2s, w16
+ str wzr, [x2]
+ sqrdmulh v16.4s, v16.4s, v4.s[0]
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v20.4h, v16.4s
+ sqxtn2 v20.8h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqrdmulh v20.8h, v20.8h, v4.h[1]
+ ld1 {v1.d}[0], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.d}[1], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ movi v30.8h, #0
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4s_x4_neon
+ movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4 \r0, \r2, \r4, \r6
+
+ mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
+ mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
+ mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
+ srshr \r1\().4s, v2.4s, #12 // t4a
+ srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r3\().4s, v6.4s, #12 // t5a
+ srshr \r5\().4s, v7.4s, #12 // t6a
+
+ sqadd v2.4s, \r1\().4s, \r3\().4s // t4
+ sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
+ sqadd v3.4s, \r7\().4s, \r5\().4s // t7
+ sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
+
+ mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+ mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
+ srshr v4.4s, v4.4s, #12 // t5
+ srshr v5.4s, v6.4s, #12 // t6
+
+ sqsub \r7\().4s, \r0\().4s, v3.4s // out7
+ sqadd \r0\().4s, \r0\().4s, v3.4s // out0
+ sqadd \r1\().4s, \r2\().4s, v5.4s // out1
+ sqsub v6.4s, \r2\().4s, v5.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v4.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r3\().4s, \r6\().4s, v2.4s // out3
+ sqsub \r4\().4s, \r6\().4s, v2.4s // out4
+ mov \r6\().16b, v6.16b // out6
+.endm
+
+function inv_dct_4s_x8_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel x16, iadst8_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v23, v16, v0.s[0], v0.s[1]
+ mul_mls v4, v23, v16, v0.s[1], v0.s[0]
+ mul_mla v6, v21, v18, v0.s[2], v0.s[3]
+ srshr v16.4s, v2.4s, #12 // t0a
+ srshr v23.4s, v4.4s, #12 // t1a
+ mul_mls v2, v21, v18, v0.s[3], v0.s[2]
+ mul_mla v4, v19, v20, v1.s[0], v1.s[1]
+ srshr v18.4s, v6.4s, #12 // t2a
+ srshr v21.4s, v2.4s, #12 // t3a
+ mul_mls v6, v19, v20, v1.s[1], v1.s[0]
+ mul_mla v2, v17, v22, v1.s[2], v1.s[3]
+ srshr v20.4s, v4.4s, #12 // t4a
+ srshr v19.4s, v6.4s, #12 // t5a
+ mul_mls v4, v17, v22, v1.s[3], v1.s[2]
+ srshr v22.4s, v2.4s, #12 // t6a
+ srshr v17.4s, v4.4s, #12 // t7a
+
+ ld1 {v0.4s}, [x16]
+
+ sqadd v2.4s, v16.4s, v20.4s // t0
+ sqsub v3.4s, v16.4s, v20.4s // t4
+ sqadd v4.4s, v23.4s, v19.4s // t1
+ sqsub v5.4s, v23.4s, v19.4s // t5
+ sqadd v6.4s, v18.4s, v22.4s // t2
+ sqsub v7.4s, v18.4s, v22.4s // t6
+ sqadd v18.4s, v21.4s, v17.4s // t3
+ sqsub v19.4s, v21.4s, v17.4s // t7
+
+ mul_mla v16, v3, v5, v0.s[3], v0.s[2]
+ mul_mls v20, v3, v5, v0.s[2], v0.s[3]
+ mul_mls v22, v19, v7, v0.s[3], v0.s[2]
+
+ srshr v3.4s, v16.4s, #12 // t4a
+ srshr v5.4s, v20.4s, #12 // t5a
+
+ mul_mla v16, v19, v7, v0.s[2], v0.s[3]
+
+ srshr v7.4s, v22.4s, #12 // t6a
+ srshr v19.4s, v16.4s, #12 // t7a
+
+ sqadd \o0\().4s, v2.4s, v6.4s // out0
+ sqsub v2.4s, v2.4s, v6.4s // t2
+ sqadd \o7\().4s, v4.4s, v18.4s // out7
+ sqsub v4.4s, v4.4s, v18.4s // t3
+ sqneg \o7\().4s, \o7\().4s // out7
+
+ sqadd \o1\().4s, v3.4s, v7.4s // out1
+ sqsub v3.4s, v3.4s, v7.4s // t6
+ sqadd \o6\().4s, v5.4s, v19.4s // out6
+ sqsub v5.4s, v5.4s, v19.4s // t7
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+ mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+ mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+ srshr v2.4s, v18.4s, #12 // out3
+ mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+ srshr v3.4s, v20.4s, #12 // out5
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+ srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19)
+
+ sqneg \o3\().4s, v2.4s // out3
+ sqneg \o5\().4s, v3.4s // out5
+.endm
+
+function inv_adst_4s_x8_neon
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+function inv_flipadst_4s_x8_neon
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x8_neon
+ sqshl v16.4s, v16.4s, #1
+ sqshl v17.4s, v17.4s, #1
+ sqshl v18.4s, v18.4s, #1
+ sqshl v19.4s, v19.4s, #1
+ sqshl v20.4s, v20.4s, #1
+ sqshl v21.4s, v21.4s, #1
+ sqshl v22.4s, v22.4s, #1
+ sqshl v23.4s, v23.4s, #1
+ ret
+endfunc
+
+function inv_txfm_add_8x8_neon
+ movi v31.4s, #0
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23
+
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v27.16b
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ br x15
+endfunc
+
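+// Emit the exported 8x8 16bpc entry points: each selects the 32 bit first
+// pass function (x4), the 16 bit second pass function (x5) and the eob_half
+// threshold (w13), with dct_dct additionally taking the dc-only shortcut,
+// before tail-calling the shared inv_txfm_add_8x8_neon.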
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ movrel x5, X(inv_\txfm2\()_8h_x8_neon)
+ mov w13, #\eob_half
+ adr x4, inv_\txfm1\()_4s_x8_neon
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
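+// For the rectangular 8x4 and 4x8 transforms all coefficients are pre-scaled
+// by 2896/4096 (1/sqrt(2)), as needed for 2:1 aspect ratio transforms.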
+function inv_txfm_add_8x4_neon
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn v22.4h, v22.4s
+ sqxtn v23.4h, v23.4s
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ br x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+ movz w16, #2896*8, lsl #16
+ movi v31.4s, #0
+ dup v30.2s, w16
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v20.4h, v16.4s
+ sqxtn v21.4h, v17.4s
+ sqxtn v22.4h, v18.4s
+ sqxtn v23.4h, v19.4s
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+
+1:
+.irp i, v20, v21, v22, v23
+ movi \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ br x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov w13, #\eob_half
+.endif
+ movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
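+// 16-point inverse DCT on four parallel lanes of 32 bit coefficients in
+// v16-v31: the even inputs go through idct_8, the odd inputs are handled
+// explicitly below, and the two halves are combined into out0-out15.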
+function inv_dct_4s_x16_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #32
+
+ mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
+ mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
+ srshr v17.4s, v2.4s, #12 // t8a
+ srshr v31.4s, v4.4s, #12 // t15a
+ mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
+ mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ srshr v23.4s, v6.4s, #12 // t9a
+ srshr v25.4s, v2.4s, #12 // t14a
+ mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
+ mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
+ srshr v21.4s, v4.4s, #12 // t10a
+ srshr v27.4s, v6.4s, #12 // t13a
+ mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ srshr v19.4s, v2.4s, #12 // t11a
+ srshr v29.4s, v4.4s, #12 // t12a
+
+ ld1 {v0.4s}, [x16]
+
+ sqsub v2.4s, v17.4s, v23.4s // t9
+ sqadd v17.4s, v17.4s, v23.4s // t8
+ sqsub v3.4s, v31.4s, v25.4s // t14
+ sqadd v31.4s, v31.4s, v25.4s // t15
+ sqsub v23.4s, v19.4s, v21.4s // t10
+ sqadd v19.4s, v19.4s, v21.4s // t11
+ sqadd v25.4s, v29.4s, v27.4s // t12
+ sqsub v29.4s, v29.4s, v27.4s // t13
+
+ mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+ mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
+ srshr v21.4s, v4.4s, #12 // t9a
+ srshr v27.4s, v6.4s, #12 // t14a
+
+ mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
+ srshr v29.4s, v4.4s, #12 // t13a
+ neg v6.4s, v6.4s
+ srshr v23.4s, v6.4s, #12 // t10a
+
+ sqsub v2.4s, v17.4s, v19.4s // t11a
+ sqadd v17.4s, v17.4s, v19.4s // t8a
+ sqsub v3.4s, v31.4s, v25.4s // t12a
+ sqadd v31.4s, v31.4s, v25.4s // t15a
+ sqadd v19.4s, v21.4s, v23.4s // t9
+ sqsub v21.4s, v21.4s, v23.4s // t10
+ sqsub v25.4s, v27.4s, v29.4s // t13
+ sqadd v27.4s, v27.4s, v29.4s // t14
+
+ mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
+ mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+ srshr v4.4s, v4.4s, #12 // t11
+ srshr v5.4s, v6.4s, #12 // t12
+ mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a
+ srshr v2.4s, v2.4s, #12 // t10a
+ srshr v3.4s, v6.4s, #12 // t13a
+
+ sqadd v6.4s, v16.4s, v31.4s // out0
+ sqsub v31.4s, v16.4s, v31.4s // out15
+ mov v16.16b, v6.16b
+ sqadd v23.4s, v30.4s, v17.4s // out7
+ sqsub v7.4s, v30.4s, v17.4s // out8
+ sqadd v17.4s, v18.4s, v27.4s // out1
+ sqsub v30.4s, v18.4s, v27.4s // out14
+ sqadd v18.4s, v20.4s, v3.4s // out2
+ sqsub v29.4s, v20.4s, v3.4s // out13
+ sqadd v3.4s, v28.4s, v19.4s // out6
+ sqsub v25.4s, v28.4s, v19.4s // out9
+ sqadd v19.4s, v22.4s, v5.4s // out3
+ sqsub v28.4s, v22.4s, v5.4s // out12
+ sqadd v20.4s, v24.4s, v4.4s // out4
+ sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v21.4s, v26.4s, v2.4s // out5
+ sqsub v26.4s, v26.4s, v2.4s // out10
+ mov v24.16b, v7.16b
+ mov v22.16b, v3.16b
+
+ ret
+endfunc
+
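+// 16-point inverse ADST. The o0-o15 arguments select the output register
+// order, so inv_flipadst_4s_x16_neon reuses the same macro with the outputs
+// reversed.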
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel x16, iadst16_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0
+ mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2
+ srshr v16.4s, v2.4s, #12 // t0
+ srshr v31.4s, v4.4s, #12 // t1
+ mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3
+ mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4
+ srshr v18.4s, v6.4s, #12 // t2
+ srshr v29.4s, v2.4s, #12 // t3
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5
+ mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6
+ srshr v20.4s, v4.4s, #12 // t4
+ srshr v27.4s, v6.4s, #12 // t5
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7
+ ld1 {v0.4s, v1.4s}, [x16]
+ movrel x16, idct_coeffs
+ mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8
+ srshr v22.4s, v2.4s, #12 // t6
+ srshr v25.4s, v4.4s, #12 // t7
+ mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9
+ mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10
+ srshr v23.4s, v6.4s, #12 // t8
+ srshr v24.4s, v2.4s, #12 // t9
+ mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11
+ mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12
+ srshr v21.4s, v4.4s, #12 // t10
+ srshr v26.4s, v6.4s, #12 // t11
+ mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13
+ mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14
+ srshr v19.4s, v2.4s, #12 // t12
+ srshr v28.4s, v4.4s, #12 // t13
+ mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15
+ srshr v17.4s, v6.4s, #12 // t14
+ srshr v30.4s, v2.4s, #12 // t15
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ sqsub v2.4s, v16.4s, v23.4s // t8a
+ sqadd v16.4s, v16.4s, v23.4s // t0a
+ sqsub v3.4s, v31.4s, v24.4s // t9a
+ sqadd v31.4s, v31.4s, v24.4s // t1a
+ sqadd v23.4s, v18.4s, v21.4s // t2a
+ sqsub v18.4s, v18.4s, v21.4s // t10a
+ sqadd v24.4s, v29.4s, v26.4s // t3a
+ sqsub v29.4s, v29.4s, v26.4s // t11a
+ sqadd v21.4s, v20.4s, v19.4s // t4a
+ sqsub v20.4s, v20.4s, v19.4s // t12a
+ sqadd v26.4s, v27.4s, v28.4s // t5a
+ sqsub v27.4s, v27.4s, v28.4s // t13a
+ sqadd v19.4s, v22.4s, v17.4s // t6a
+ sqsub v22.4s, v22.4s, v17.4s // t14a
+ sqadd v28.4s, v25.4s, v30.4s // t7a
+ sqsub v25.4s, v25.4s, v30.4s // t15a
+
+ mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
+ mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
+ mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
+ srshr v17.4s, v4.4s, #12 // t8
+ srshr v30.4s, v6.4s, #12 // t9
+ mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12
+ srshr v18.4s, v2.4s, #12 // t10
+ srshr v29.4s, v4.4s, #12 // t11
+ mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14
+ srshr v27.4s, v6.4s, #12 // t12
+ srshr v20.4s, v2.4s, #12 // t13
+ mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15
+ srshr v25.4s, v4.4s, #12 // t14
+ srshr v22.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t4
+ sqadd v16.4s, v16.4s, v21.4s // t0
+ sqsub v3.4s, v31.4s, v26.4s // t5
+ sqadd v31.4s, v31.4s, v26.4s // t1
+ sqadd v21.4s, v23.4s, v19.4s // t2
+ sqsub v23.4s, v23.4s, v19.4s // t6
+ sqadd v26.4s, v24.4s, v28.4s // t3
+ sqsub v24.4s, v24.4s, v28.4s // t7
+ sqadd v19.4s, v17.4s, v27.4s // t8a
+ sqsub v17.4s, v17.4s, v27.4s // t12a
+ sqadd v28.4s, v30.4s, v20.4s // t9a
+ sqsub v30.4s, v30.4s, v20.4s // t13a
+ sqadd v27.4s, v18.4s, v25.4s // t10a
+ sqsub v18.4s, v18.4s, v25.4s // t14a
+ sqadd v20.4s, v29.4s, v22.4s // t11a
+ sqsub v29.4s, v29.4s, v22.4s // t15a
+
+ mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
+ mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
+ mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
+ srshr v22.4s, v4.4s, #12 // t4a
+ srshr v25.4s, v6.4s, #12 // t5a
+ mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
+ mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12
+ srshr v24.4s, v2.4s, #12 // t6a
+ srshr v23.4s, v4.4s, #12 // t7a
+ mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13
+ mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14
+ srshr v17.4s, v6.4s, #12 // t12
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15
+ srshr v29.4s, v2.4s, #12 // t13
+ srshr v30.4s, v4.4s, #12 // t14
+ srshr v18.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t2a
+.ifc \o0, v16
+ sqadd \o0\().4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+.else
+ sqadd v4.4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+ mov \o0\().16b, v4.16b
+.endif
+ sqneg \o15\().4s, \o15\().4s // out15
+
+ sqsub v3.4s, v29.4s, v18.4s // t15a
+ sqadd \o13\().4s, v29.4s, v18.4s // out13
+ sqadd \o2\().4s, v17.4s, v30.4s // out2
+ sqsub v26.4s, v17.4s, v30.4s // t14a
+ sqneg \o13\().4s, \o13\().4s // out13
+
+ sqadd \o1\().4s, v19.4s, v27.4s // out1
+ sqsub v27.4s, v19.4s, v27.4s // t10
+ sqadd \o14\().4s, v28.4s, v20.4s // out14
+ sqsub v20.4s, v28.4s, v20.4s // t11
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ sqadd \o3\().4s, v22.4s, v24.4s // out3
+ sqsub v22.4s, v22.4s, v24.4s // t6
+ sqadd \o12\().4s, v25.4s, v23.4s // out12
+ sqsub v23.4s, v25.4s, v23.4s // t7
+ sqneg \o3\().4s, \o3\().4s // out3
+
+ mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+ mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+ mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+ srshr v24.4s, v24.4s, #12 // out8
+ srshr v4.4s, v4.4s, #12 // out7
+ srshr v5.4s, v6.4s, #12 // out5
+ mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+ mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+ srshr v26.4s, v6.4s, #12 // out10
+
+ mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+ mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+ mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+ srshr \o4\().4s, v2.4s, #12 // out4
+ srshr v6.4s, v6.4s, #12 // out11
+ srshr v7.4s, v21.4s, #12 // out9
+ srshr \o6\().4s, v22.4s, #12 // out6
+
+.ifc \o8, v23
+ mov \o8\().16b, v24.16b
+ mov \o10\().16b, v26.16b
+.endif
+
+ sqneg \o7\().4s, v4.4s // out7
+ sqneg \o5\().4s, v5.4s // out5
+ sqneg \o11\().4s, v6.4s // out11
+ sqneg \o9\().4s, v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ ret
+endfunc
+
+function inv_flipadst_4s_x16_neon
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x16_neon
+ movz w16, #2*(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4s, v\i\().4s, v0.s[0]
+ sqadd v\i\().4s, v\i\().4s, v\i\().4s
+ sqadd v\i\().4s, v\i\().4s, v2.4s
+.endr
+ ret
+endfunc
+
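+// Scaling for the 16-point identity transform: identity_4x16 multiplies each
+// element by 2*5793/4096, while identity_4x16_shift1 folds a >>1 into the
+// rounding and multiplies by 5793/4096.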
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ srshr v3.4s, v3.4s, #1
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
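+// First (horizontal) pass over a 16x4 strip: load 16 vectors from [x7]
+// (clearing the coefficients as they are read), optionally pre-scale by
+// 2896/4096, run the transform at x4, narrow to 16 bit with the given shift,
+// transpose and store to the intermediate buffer at [x6].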
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ blr x4
+ sqrshrn v16.4h, v16.4s, #\shift
+ sqrshrn v17.4h, v17.4s, #\shift
+ sqrshrn v18.4h, v18.4s, #\shift
+ sqrshrn v19.4h, v19.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+ sqrshrn2 v17.8h, v21.4s, #\shift
+ sqrshrn2 v18.8h, v22.4s, #\shift
+ sqrshrn2 v19.8h, v23.4s, #\shift
+ sqrshrn v20.4h, v24.4s, #\shift
+ sqrshrn v21.4h, v25.4s, #\shift
+ sqrshrn v22.4h, v26.4s, #\shift
+ sqrshrn v23.4h, v27.4s, #\shift
+ sqrshrn2 v20.8h, v28.4s, #\shift
+ sqrshrn2 v21.8h, v29.4s, #\shift
+ sqrshrn2 v22.8h, v30.4s, #\shift
+ sqrshrn2 v23.8h, v31.4s, #\shift
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ br x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ br x14
+endfunc
+
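+// 16x16: run the first pass in 4-row strips into a 512 byte stack buffer,
+// zero-filling strips that the eob thresholds from [x13] show to be empty,
+// then run the second pass as two vertical 8x16 passes adding into the
+// destination.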
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+ ldrh w12, [x13], #2
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*16*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ br x15
+endfunc
+
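+// The eob_* tables give per-strip eob thresholds: if the block's eob (w3) is
+// below an entry, that first-pass strip and all following ones contain only
+// zero coefficients and are skipped.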
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ adr x4, inv_\txfm1\()_4s_x16_neon
+ movrel x5, X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_16x16
+.else
+ movrel x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_16x16_identity
+.else
+ movrel x13, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon
+ mov x15, x30
+ movi v4.4s, #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], #16
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ sqrshrn v16.4h, v24.4s, #1
+ sqrshrn v17.4h, v25.4s, #1
+ sqrshrn v18.4h, v26.4s, #1
+ sqrshrn v19.4h, v27.4s, #1
+ sqrshrn2 v16.8h, v28.4s, #1
+ sqrshrn2 v17.8h, v29.4s, #1
+ sqrshrn2 v18.8h, v30.4s, #1
+ sqrshrn2 v19.8h, v31.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #16
+ load_add_store_8x4 x6, x7
+
+ br x15
+endfunc
+
+function inv_txfm_add_4x16_neon
+ ldrh w12, [x13, #4]
+ mov x15, x30
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ rshrn v28.4h, v16.4s, #1
+ rshrn v29.4h, v17.4s, #1
+ rshrn v30.4h, v18.4s, #1
+ rshrn v31.4h, v19.4s, #1
+ transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ rshrn v24.4h, v16.4s, #1
+ rshrn v25.4h, v17.4s, #1
+ rshrn v26.4h, v18.4s, #1
+ rshrn v27.4h, v19.4s, #1
+ transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ rshrn v20.4h, v16.4s, #1
+ rshrn v21.4h, v17.4s, #1
+ rshrn v22.4h, v18.4s, #1
+ rshrn v23.4h, v19.4s, #1
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ movi \i, #0
+.endr
+2:
+
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v2.4s}, [x2], x11
+.endr
+ blr x4
+ rshrn v16.4h, v16.4s, #1
+ rshrn v17.4h, v17.4s, #1
+ rshrn v18.4h, v18.4s, #1
+ rshrn v19.4h, v19.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ br x15
+endfunc
+
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_4x16
+.else
+ movrel x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_4x16_identity2
+.else
+ movrel x13, eob_4x16
+.endif
+.endif
+.else
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
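+// 16x8 keeps half of the first-pass output live in v8-v15 while the other
+// half is computed, so the callee-saved d8-d15 registers are spilled; the
+// 2896/4096 rectangular scale is applied while loading.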
+function inv_txfm_add_16x8_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ sqrshrn v12.4h, v24.4s, #1
+ sqrshrn v13.4h, v25.4s, #1
+ sqrshrn v14.4h, v26.4s, #1
+ sqrshrn v15.4h, v27.4s, #1
+ sqrshrn2 v12.8h, v28.4s, #1
+ sqrshrn2 v13.8h, v29.4s, #1
+ sqrshrn2 v14.8h, v30.4s, #1
+ sqrshrn2 v15.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+ transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+
+ b 2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+ movi \i, #0
+.endr
+2:
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ movi v4.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ sqrshrn v8.4h, v24.4s, #1
+ sqrshrn v9.4h, v25.4s, #1
+ sqrshrn v10.4h, v26.4s, #1
+ sqrshrn v11.4h, v27.4s, #1
+ sqrshrn2 v8.8h, v28.4s, #1
+ sqrshrn2 v9.8h, v29.4s, #1
+ sqrshrn2 v10.8h, v30.4s, #1
+ sqrshrn2 v11.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+
+ blr x5
+
+ add x0, x0, #16
+ load_add_store_8x8 x0, x7
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ br x15
+endfunc
+
+function inv_txfm_add_8x16_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
+ ldrh w12, [x13, #4]
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ sqrshrn2 v28.8h, v20.4s, #1
+ sqrshrn2 v29.8h, v21.4s, #1
+ sqrshrn2 v30.8h, v22.4s, #1
+ sqrshrn2 v31.8h, v23.4s, #1
+ transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+ movi \i, #0
+.endr
+
+2:
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x20
+
+ br x15
+endfunc
+
+const eob_8x16
+ .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+ .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+ .short 4, 8, 12, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_8x16
+.else
+ movrel x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_8x16_identity2
+.else
+ movrel x13, eob_8x16
+.endif
+.endif
+.if \h == 8
+ ldrh w13, [x13]
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
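+// Handle the 16 odd-numbered inputs of a 32-point inverse DCT, producing the
+// odd-half outputs in v16-v31; the even half is computed with
+// inv_dct_4s_x16_neon and the two halves are combined by the callers below.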
+function inv_dct32_odd_4s_x16_neon
+ movrel x16, idct_coeffs, 4*16
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a
+ mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a
+ mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a
+ srshr v16.4s, v2.4s, #12 // t16a
+ srshr v31.4s, v4.4s, #12 // t31a
+ mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a
+ mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a
+ srshr v24.4s, v6.4s, #12 // t17a
+ srshr v23.4s, v2.4s, #12 // t30a
+ mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a
+ mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a
+ srshr v20.4s, v4.4s, #12 // t18a
+ srshr v27.4s, v6.4s, #12 // t29a
+ mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #4*24
+ mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a
+ srshr v28.4s, v2.4s, #12 // t19a
+ srshr v19.4s, v4.4s, #12 // t28a
+ mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a
+ mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a
+ srshr v18.4s, v6.4s, #12 // t20a
+ srshr v29.4s, v2.4s, #12 // t27a
+ mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a
+ mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a
+ srshr v26.4s, v4.4s, #12 // t21a
+ srshr v21.4s, v6.4s, #12 // t26a
+ mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a
+ mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a
+ srshr v22.4s, v2.4s, #12 // t22a
+ srshr v25.4s, v4.4s, #12 // t25a
+ mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a
+ srshr v30.4s, v6.4s, #12 // t23a
+ srshr v17.4s, v2.4s, #12 // t24a
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ sqsub v2.4s, v16.4s, v24.4s // t17
+ sqadd v16.4s, v16.4s, v24.4s // t16
+ sqsub v3.4s, v31.4s, v23.4s // t30
+ sqadd v31.4s, v31.4s, v23.4s // t31
+ sqsub v24.4s, v28.4s, v20.4s // t18
+ sqadd v28.4s, v28.4s, v20.4s // t19
+ sqadd v23.4s, v18.4s, v26.4s // t20
+ sqsub v18.4s, v18.4s, v26.4s // t21
+ sqsub v20.4s, v30.4s, v22.4s // t22
+ sqadd v30.4s, v30.4s, v22.4s // t23
+ sqadd v26.4s, v17.4s, v25.4s // t24
+ sqsub v17.4s, v17.4s, v25.4s // t25
+ sqsub v22.4s, v29.4s, v21.4s // t26
+ sqadd v29.4s, v29.4s, v21.4s // t27
+ sqadd v25.4s, v19.4s, v27.4s // t28
+ sqsub v19.4s, v19.4s, v27.4s // t29
+
+ mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+ mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
+ mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
+ srshr v21.4s, v4.4s, #12 // t17a
+ srshr v27.4s, v6.4s, #12 // t30a
+ neg v2.4s, v2.4s // -> t18a
+ mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
+ srshr v19.4s, v2.4s, #12 // t18a
+ srshr v24.4s, v4.4s, #12 // t29a
+ mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
+ mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ srshr v22.4s, v6.4s, #12 // t21a
+ srshr v18.4s, v2.4s, #12 // t26a
+ neg v4.4s, v4.4s // -> t22a
+ mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
+ srshr v17.4s, v4.4s, #12 // t22a
+ srshr v20.4s, v6.4s, #12 // t25a
+
+ sqsub v2.4s, v27.4s, v24.4s // t29
+ sqadd v27.4s, v27.4s, v24.4s // t30
+ sqsub v3.4s, v21.4s, v19.4s // t18
+ sqadd v21.4s, v21.4s, v19.4s // t17
+ sqsub v24.4s, v16.4s, v28.4s // t19a
+ sqadd v16.4s, v16.4s, v28.4s // t16a
+ sqsub v19.4s, v30.4s, v23.4s // t20a
+ sqadd v30.4s, v30.4s, v23.4s // t23a
+ sqsub v28.4s, v17.4s, v22.4s // t21
+ sqadd v17.4s, v17.4s, v22.4s // t22
+ sqadd v23.4s, v26.4s, v29.4s // t24a
+ sqsub v26.4s, v26.4s, v29.4s // t27a
+ sqadd v22.4s, v20.4s, v18.4s // t25
+ sqsub v20.4s, v20.4s, v18.4s // t26
+ sqsub v29.4s, v31.4s, v25.4s // t28a
+ sqadd v31.4s, v31.4s, v25.4s // t31a
+
+ mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+ mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
+ mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
+ srshr v18.4s, v4.4s, #12 // t18a
+ srshr v25.4s, v6.4s, #12 // t29a
+ mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
+ srshr v29.4s, v2.4s, #12 // t19
+ srshr v24.4s, v4.4s, #12 // t28
+ neg v6.4s, v6.4s // -> t20
+ mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
+ mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ srshr v26.4s, v6.4s, #12 // t20
+ srshr v19.4s, v2.4s, #12 // t27
+ neg v4.4s, v4.4s // -> t21a
+ mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
+ srshr v20.4s, v4.4s, #12 // t21a
+ srshr v28.4s, v6.4s, #12 // t26a
+
+ sqsub v2.4s, v16.4s, v30.4s // t23
+ sqadd v16.4s, v16.4s, v30.4s // t16 = out16
+ sqsub v3.4s, v31.4s, v23.4s // t24
+ sqadd v31.4s, v31.4s, v23.4s // t31 = out31
+ sqsub v23.4s, v21.4s, v17.4s // t22a
+ sqadd v17.4s, v21.4s, v17.4s // t17a = out17
+ sqadd v30.4s, v27.4s, v22.4s // t30a = out30
+ sqsub v21.4s, v27.4s, v22.4s // t25a
+ sqsub v27.4s, v18.4s, v20.4s // t21
+ sqadd v18.4s, v18.4s, v20.4s // t18 = out18
+ sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqsub v26.4s, v29.4s, v26.4s // t20a
+ sqadd v29.4s, v25.4s, v28.4s // t29 = out29
+ sqsub v25.4s, v25.4s, v28.4s // t26
+ sqadd v28.4s, v24.4s, v19.4s // t28a = out28
+ sqsub v24.4s, v24.4s, v19.4s // t27a
+ mov v19.16b, v4.16b // out19
+
+ mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+ mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
+ srshr v20.4s, v4.4s, #12 // t20
+ srshr v22.4s, v6.4s, #12 // t27
+
+ mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
+ mov v27.16b, v22.16b // t27
+ srshr v26.4s, v4.4s, #12 // t26a
+
+ mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+ mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ srshr v21.4s, v6.4s, #12 // t21a
+ srshr v22.4s, v24.4s, #12 // t22
+ srshr v25.4s, v4.4s, #12 // t25
+
+ mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
+ srshr v23.4s, v4.4s, #12 // t23a
+ srshr v24.4s, v6.4s, #12 // t24a
+
+ ret
+endfunc
+
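+// First pass over a 32x4 strip: the even and odd inputs are transformed
+// separately (inv_dct_4s_x16_neon / inv_dct32_odd_4s_x16_neon) and combined
+// with a final add/sub butterfly while narrowing; the mirrored half is stored
+// with rev64 so the outputs end up in order.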
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+ lsl x8, x8, #1
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_4s_x16_neon
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
+ transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5
+
+.macro store1 r0, r1, r2, r3
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ st1 {\r2}, [x6], #16
+ st1 {\r3}, [x6], #16
+.endm
+ store1 v16.4s, v20.4s, v24.4s, v28.4s
+ store1 v17.4s, v21.4s, v25.4s, v29.4s
+ store1 v18.4s, v22.4s, v26.4s, v30.4s
+ store1 v19.4s, v23.4s, v27.4s, v31.4s
+.purgem store1
+ sub x6, x6, #64*4
+
+ movi v7.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the correct coefficient in v0.s[1]
+ scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_4s_x16_neon
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+ transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5
+ transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5
+.macro store2 r0, r1, r2, r3, shift
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+ sqsub v4.4s, v0.4s, \r0
+ sqadd v0.4s, v0.4s, \r0
+ sqsub v5.4s, v1.4s, \r1
+ sqadd v1.4s, v1.4s, \r1
+ sqsub v6.4s, v2.4s, \r2
+ sqadd v2.4s, v2.4s, \r2
+ sqsub v7.4s, v3.4s, \r3
+ sqadd v3.4s, v3.4s, \r3
+ sqrshrn v0.4h, v0.4s, #\shift
+ sqrshrn2 v0.8h, v1.4s, #\shift
+ sqrshrn v1.4h, v2.4s, #\shift
+ sqrshrn2 v1.8h, v3.4s, #\shift
+ sqrshrn v2.4h, v7.4s, #\shift
+ sqrshrn2 v2.8h, v6.4s, #\shift
+ sqrshrn v3.4h, v5.4s, #\shift
+ sqrshrn2 v3.8h, v4.4s, #\shift
+ st1 {v0.8h, v1.8h}, [x6], #32
+ rev64 v2.8h, v2.8h
+ rev64 v3.8h, v3.8h
+ st1 {v2.8h, v3.8h}, [x6], #32
+.endm
+
+ store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift
+ store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift
+ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift
+ store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift
+.purgem store2
+ br x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
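+// Second (vertical) 32-point DCT pass over an 8-pixel wide column strip,
+// adding the result to the destination and clamping pixels to 0..0x3ff.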
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl X(inv_dct_8h_x16_neon)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl X(inv_dct32_odd_8h_x16_neon)
+
+ neg x9, x8
+ mov x10, x6
+ movi v0.8h, #0
+ mvni v1.8h, #0xfc, lsl #8 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8h}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8h}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8h}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ sqadd v5.8h, v5.8h, v2.8h
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ smax v2.8h, v5.8h, v0.8h
+ ld1 {v5.8h}, [x7], \stride
+ sqadd v6.8h, v6.8h, v3.8h
+ smin v2.8h, v2.8h, v1.8h
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8h}, [x6], x1
+ ld1 {v2.8h}, [x10], x1
+ smax v3.8h, v6.8h, v0.8h
+ sqadd v7.8h, v7.8h, v4.8h
+ smin v3.8h, v3.8h, v1.8h
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8h}, [x6], x1
+ smax v4.8h, v7.8h, v0.8h
+ sqadd v5.8h, v5.8h, v2.8h
+ smin v4.8h, v4.8h, v1.8h
+ st1 {v4.8h}, [x6], x1
+ smax v2.8h, v5.8h, v0.8h
+ smin v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x6], x1
+.endm
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ br x14
+endfunc
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
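+// identity_identity 32x32 needs no intermediate buffer: each 8x8 sub-block is
+// loaded, narrowed, transposed and added straight to the destination, with
+// the eob thresholds deciding how many sub-blocks to process in each
+// direction.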
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ movi v0.8h, #0
+ movi v1.8h, #0
+ movrel x13, eob_32x32, 2
+
+ mov x8, #4*32
+1:
+ mov w9, #0
+ movrel x12, eob_32x32, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movz w16, #2896*8, lsl #16
+ movz w17, #2*(5793-4096)*8, lsl #16
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movrel x13, eob_16x32\hshort, 2
+
+ mov x8, #4*\h
+1:
+ mov w9, #0
+ movrel x12, eob_16x32\wshort, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ dup v2.2s, w16
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ mov v2.s[1], w17
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+ // 16x32
+ identity_4x16_shift1 v2.s[1]
+.else
+ // 32x16
+ shift_16_regs sqshl, 1
+ identity_4x16 v2.s[1]
+.endif
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #16
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movi v0.4s, #0
+ movi v1.4s, #0
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ movrel x13, eob_8x32, 2
+
+ mov w8, #4*\h
+1:
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ ldrh w12, [x13], #4
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+
+.if \w == 8
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn2 v16.8h, v17.4s, #1
+ sqrshrn v17.4h, v18.4s, #1
+ sqrshrn2 v17.8h, v19.4s, #1
+ sqrshrn v18.4h, v20.4s, #1
+ sqrshrn2 v18.8h, v21.4s, #1
+ sqrshrn v19.4h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ sqrshrn v20.4h, v24.4s, #1
+ sqrshrn2 v20.8h, v25.4s, #1
+ sqrshrn v21.4h, v26.4s, #1
+ sqrshrn2 v21.8h, v27.4s, #1
+ sqrshrn v22.4h, v28.4s, #1
+ sqrshrn2 v22.8h, v29.4s, #1
+ sqrshrn v23.4h, v30.4s, #1
+ sqrshrn2 v23.8h, v31.4s, #1
+.else
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #4*8
+.else
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ adr x4, inv_dct_4s_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ movrel x13, eob_16x32
+ movrel x5, X(inv_dct_8h_x16_neon)
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ mov x8, #4*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ movi v28.4s, #0
+ mov x8, #4*32
+ mov w9, #32
+ mov x6, sp
+ mov x7, x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().4s}, [x7]
+ st1 {v28.4s}, [x7], x8
+.endr
+ ldrh w12, [x13], #2
+ sub w9, w9, #4
+ sub x7, x7, x8, lsl #3
+ add x7, x7, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ sqrshrn v16.4h, v16.4s, #2
+ sqrshrn v17.4h, v17.4s, #2
+ sqrshrn v18.4h, v18.4s, #2
+ sqrshrn v19.4h, v19.4s, #2
+ sqrshrn2 v16.8h, v20.4s, #2
+ sqrshrn2 v17.8h, v21.4s, #2
+ sqrshrn2 v18.8h, v22.4s, #2
+ sqrshrn2 v19.8h, v23.4s, #2
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ cmp w3, w12
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #4
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+.irp i, 0, 4
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ cmp w3, #10
+ b.lt 1f
+.endif
+ mov x8, #8*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9, lsl #1
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ br x15
+endfunc
+
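+// First step of the 64-point inverse DCT odd half: expand one group of four
+// input coefficients (v16-v19) into eight intermediates of the t32..t63 range
+// and store them to the temporary buffer at [x6].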
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.4s, v1.4s}, [x17], #32
+
+ sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a
+ sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a
+ sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a
+ sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a
+ sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a
+ sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a
+ sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a
+ sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a
+
+ ld1 {v0.4s}, [x17], #16
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t33
+ sqsub v26.4s, v19.4s, v18.4s // t34
+ sqadd v27.4s, v19.4s, v18.4s // t35
+ sqadd v28.4s, v20.4s, v21.4s // t60
+ sqsub v29.4s, v20.4s, v21.4s // t61
+ sqsub v30.4s, v23.4s, v22.4s // t62
+ sqadd v31.4s, v23.4s, v22.4s // t63
+
+ mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
+ mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ neg v2.4s, v2.4s // t34a
+ mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
+ srshr v26.4s, v2.4s, #12 // t34a
+ mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
+ srshr v29.4s, v4.4s, #12 // t61a
+ srshr v25.4s, v6.4s, #12 // t33a
+ srshr v30.4s, v2.4s, #12 // t62a
+
+ sqadd v16.4s, v24.4s, v27.4s // t32a
+ sqsub v19.4s, v24.4s, v27.4s // t35a
+ sqadd v17.4s, v25.4s, v26.4s // t33
+ sqsub v18.4s, v25.4s, v26.4s // t34
+ sqsub v20.4s, v31.4s, v28.4s // t60a
+ sqadd v23.4s, v31.4s, v28.4s // t63a
+ sqsub v21.4s, v30.4s, v29.4s // t61
+ sqadd v22.4s, v30.4s, v29.4s // t62
+
+ mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
+ mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
+ srshr v21.4s, v2.4s, #12 // t61a
+ srshr v18.4s, v4.4s, #12 // t34a
+ mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
+ srshr v20.4s, v6.4s, #12 // t60
+ srshr v19.4s, v2.4s, #12 // t35
+
+ st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+ st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #4*4*0] // t32a
+ ldr q17, [x9, #4*4*8] // t39a
+ ldr q18, [x9, #4*4*0] // t63a
+ ldr q19, [x6, #4*4*8] // t56a
+ ldr q20, [x6, #4*4*16] // t40a
+ ldr q21, [x9, #4*4*24] // t47a
+ ldr q22, [x9, #4*4*16] // t55a
+ ldr q23, [x6, #4*4*24] // t48a
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t39
+ sqadd v26.4s, v18.4s, v19.4s // t63
+ sqsub v27.4s, v18.4s, v19.4s // t56
+ sqsub v28.4s, v21.4s, v20.4s // t40
+ sqadd v29.4s, v21.4s, v20.4s // t47
+ sqadd v30.4s, v23.4s, v22.4s // t48
+ sqsub v31.4s, v23.4s, v22.4s // t55
+
+ mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
+ mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
+ srshr v25.4s, v2.4s, #12 // t56a
+ srshr v27.4s, v4.4s, #12 // t39a
+ neg v6.4s, v6.4s // t40a
+ mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
+ srshr v31.4s, v6.4s, #12 // t40a
+ srshr v28.4s, v2.4s, #12 // t55a
+
+ sqadd v16.4s, v24.4s, v29.4s // t32a
+ sqsub v19.4s, v24.4s, v29.4s // t47a
+ sqadd v17.4s, v27.4s, v31.4s // t39
+ sqsub v18.4s, v27.4s, v31.4s // t40
+ sqsub v20.4s, v26.4s, v30.4s // t48a
+ sqadd v23.4s, v26.4s, v30.4s // t63a
+ sqsub v21.4s, v25.4s, v28.4s // t55
+ sqadd v22.4s, v25.4s, v28.4s // t56
+
+ mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
+ mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
+ srshr v18.4s, v2.4s, #12 // t40a
+ srshr v21.4s, v4.4s, #12 // t55a
+ mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
+ srshr v19.4s, v6.4s, #12 // t47
+ srshr v20.4s, v2.4s, #12 // t48
+
+ str q16, [x6, #4*4*0] // t32a
+ str q17, [x9, #4*4*0] // t39
+ str q18, [x6, #4*4*8] // t40a
+ str q19, [x9, #4*4*8] // t47
+ str q20, [x6, #4*4*16] // t48
+ str q21, [x9, #4*4*16] // t55a
+ str q22, [x6, #4*4*24] // t56
+ str q23, [x9, #4*4*24] // t63a
+
+ add x6, x6, #4*4
+ sub x9, x9, #4*4
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+ movz \gpr, \val, lsl #16
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
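+// Build the 64-point inverse DCT from smaller pieces: the even-numbered
+// inputs form a 32-point DCT computed with inv_dct_4s_x16_neon +
+// inv_dct32_odd_4s_x16_neon (combined into the temp buffer), while the
+// odd-numbered inputs are handled by four inv_dct64_step1_neon calls followed
+// by inv_dct64_step2_neon. The _clear and _scale variants also zero the
+// coefficients as they are read and pre-scale by 2896/4096.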
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+ mov x14, x30
+ mov x6, sp
+ lsl x8, x8, #2
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_4s_x16_neon
+
+ store16 x6
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_4s_x16_neon
+
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.4s}, [x6], #16
+ ld1 {v3.4s}, [x6], #16
+ sqadd v6.4s, v2.4s, \r0
+ sqsub \r0, v2.4s, \r0
+ ld1 {v4.4s}, [x6], #16
+ sqadd v7.4s, v3.4s, \r1
+ sqsub \r1, v3.4s, \r1
+ ld1 {v5.4s}, [x6], #16
+ sqadd v2.4s, v4.4s, \r2
+ sub x6, x6, #16*4
+ sqsub \r2, v4.4s, \r2
+ st1 {v6.4s}, [x6], #16
+ st1 {\r0}, [x10], x9
+ sqadd v3.4s, v5.4s, \r3
+ sqsub \r3, v5.4s, \r3
+ st1 {v7.4s}, [x6], #16
+ st1 {\r1}, [x10], x9
+ st1 {v2.4s}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.4s}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.4s, v30.4s, v29.4s, v28.4s
+ store_addsub v27.4s, v26.4s, v25.4s, v24.4s
+ store_addsub v23.4s, v22.4s, v21.4s, v20.4s
+ store_addsub v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+ add x6, x6, #4*4*16
+
+ movrel x17, idct64_coeffs
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.4s}, [x7] // in1 (offset 0)
+ ld1 {v17.4s}, [x9] // in31 (offset 15)
+ ld1 {v18.4s}, [x10] // in17 (offset 8)
+ ld1 {v19.4s}, [x11] // in15 (offset 7)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.4s}, [x10] // in7 (offset 3)
+ ld1 {v17.4s}, [x11] // in25 (offset 12)
+ ld1 {v18.4s}, [x9] // in23 (offset 11)
+ ld1 {v19.4s}, [x7] // in9 (offset 4)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #4*4*32
+ add x9, x6, #4*4*7
+
+ bl inv_dct64_step2_neon
+
+ br x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
+function inv_txfm_horz_dct_64x4_neon
+ mov x14, x30
+
+ mov x7, sp
+ add x8, sp, #4*4*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-4*4*4
+
+ dup v7.4s, w12
+1:
+ ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+ ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.4s, \src0, \src1
+ sqadd v0.4s, \src0, \src1
+ sqsub v3.4s, \src2, \src3
+ srshl v1.4s, v1.4s, v7.4s
+ sqadd v2.4s, \src2, \src3
+ srshl v3.4s, v3.4s, v7.4s
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v2.4s, v2.4s, v7.4s
+ sqxtn v3.4h, v3.4s
+ sqxtn2 v3.8h, v1.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v2.4s
+ rev64 v3.8h, v3.8h
+ st1 {v0.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.4s, v31.4s, v20.4s, v27.4s
+ store_addsub v17.4s, v30.4s, v21.4s, v26.4s
+ store_addsub v18.4s, v29.4s, v22.4s, v25.4s
+ store_addsub v19.4s, v28.4s, v23.4s, v24.4s
+.purgem store_addsub
+ sub x6, x6, x10, lsl #2
+ sub x9, x9, x10, lsl #2
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ br x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1
+ neg x10, x1
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+ movi v6.8h, #0
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+ ld1 {v0.8h}, [x6], x1
+ ld1 {v1.8h}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8h}, [x6]
+ sqsub \src0, \src0, \src1
+ ld1 {v3.8h}, [x9]
+ sqadd v5.8h, \src2, \src3
+ sqsub \src2, \src2, \src3
+ sub x6, x6, x1
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr \src0, \src0, #4
+ sqadd v0.8h, v0.8h, v4.8h
+ srshr \src2, \src2, #4
+ sqadd v1.8h, v1.8h, \src0
+ sqadd v2.8h, v2.8h, v5.8h
+ smax v0.8h, v0.8h, v6.8h
+ sqadd v3.8h, v3.8h, \src2
+ smax v1.8h, v1.8h, v6.8h
+ smin v0.8h, v0.8h, v7.8h
+ smax v2.8h, v2.8h, v6.8h
+ smin v1.8h, v1.8h, v7.8h
+ st1 {v0.8h}, [x6], x1
+ smax v3.8h, v3.8h, v6.8h
+ smin v2.8h, v2.8h, v7.8h
+ st1 {v1.8h}, [x9], x10
+ smin v3.8h, v3.8h, v7.8h
+ st1 {v2.8h}, [x6], x1
+ st1 {v3.8h}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b
+
+ br x14
+endfunc
+
+.macro sub_sp space
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub x16, sp, #4096
+ ldr xzr, [x16]
+ sub sp, x16, #(\space - 4096)
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
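
The two branches of sub_sp solve different problems: the generic branch splits the decrement because an AArch64 SUB (immediate) only encodes a 12-bit value, optionally shifted left by 12 bits, while the _WIN32 branch additionally loads from the intermediate page so the stack guard page is committed in order before sp moves past it. A small standalone illustration of the immediate split, using the frame size the 64x64 function below allocates:

    #include <stdio.h>

    /* Illustrative only: the two immediates the generic branch of sub_sp emits
     * for the frame used by the 64x64 path below. An AArch64 SUB (immediate)
     * encodes a 12-bit value, optionally shifted left by 12, hence the split
     * into a multiple of 4096 plus a remainder. */
    int main(void) {
        const unsigned space = 64*32*2 + 64*4*4;  /* 5120 bytes */
        const unsigned hi = space / 4096 * 4096;  /* encodable with lsl #12 */
        const unsigned lo = space % 4096;         /* encodable as-is */
        printf("sub sp, sp, #%u ; sub sp, sp, #%u\n", hi, lo);
        return 0;
    }
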
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ mov x15, x30
+
+ sub_sp 32*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ mov x15, x30
+
+ sub_sp 64*16*2+64*4*4
+ add x4, sp, #64*4*4
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ movrel x5, X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2
+ br x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ mov x15, x30
+
+ sub_sp 16*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+
+ adr x4, inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2
+ br x15
+endfunc
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S
index 31cc46f8971..3a6cf900a97 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S
@@ -118,9 +118,9 @@ endconst
.endm
.macro str_n idx0, idx1, dstreg, dstoff, n
- str q\idx0, [\dstreg, \dstoff]
+ str \idx0, [\dstreg, \dstoff]
.if \n == 16
- str q\idx1, [\dstreg, \dstoff + 16]
+ str \idx1, [\dstreg, \dstoff + 16]
.endif
.endm
@@ -150,7 +150,7 @@ function msac_decode_symbol_adapt4_neon, export=1
ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits
- str_n 4, 5, sp, #16, \n // store v values to allow indexed access
+ str_n q4, q5, sp, #16, \n // store v values to allow indexed access
ld1_n v16, v17, x8, .8h, \n
@@ -185,7 +185,7 @@ function msac_decode_symbol_adapt4_neon, export=1
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
- dup v6.8h, w4 // -rate
+ dup v6\sz, w4 // -rate
sub w3, w3, w3, lsr #5 // count - (count == 32)
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
@@ -216,7 +216,7 @@ L(renorm2):
lsl x7, x7, x5 // (~dif + (v << 48)) << d
str w4, [x0, #RNG]
mvn x7, x7 // ~dif
- b.ge 9f
+ b.hs 9f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
@@ -274,6 +274,128 @@ function msac_decode_symbol_adapt16_neon, export=1
b L(renorm)
endfunc
+function msac_decode_hi_tok_neon, export=1
+ ld1 {v0.4h}, [x1] // cdf
+ add x16, x0, #RNG
+ movi v31.4h, #0x7f, lsl #8 // 0x7f00
+ movrel x17, coeffs, 30-2*3
+ mvni v30.4h, #0x3f // 0xffc0
+ ldrh w9, [x1, #6] // count = cdf[n_symbols]
+ ld1r {v3.4h}, [x16] // rng
+ movrel x16, bits
+ ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
+ add x17, x0, #DIF + 6
+ ld1 {v16.8h}, [x16]
+ mov w13, #-24
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+ ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+1:
+ and v7.8b, v3.8b, v31.8b // rng & 0x7f00
+ sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ str h3, [sp, #14] // store original u = s->rng
+ cmhs v2.8h, v1.8h, v4.8h // c >= v
+ str q4, [sp, #16] // store v values to allow indexed access
+ and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
+ addv h6, v6.8h // Aggregate mask bits
+ umov w3, v6.h[0]
+ add w13, w13, #5
+ rbit w3, w3
+ add x8, sp, #16
+ clz w15, w3 // ret
+
+ cbz w10, 2f
+ // update_cdf
+ movi v5.8b, #0xff
+ mov w4, #-5
+ urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
+ sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
+ sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6.4h, w4 // -rate
+
+ sub w9, w9, w9, lsr #5 // count - (count == 32)
+ sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
+ sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
+ add w9, w9, #1 // count + (count < 32)
+ add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
+ st1 {v0.4h}, [x1]
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ strh w9, [x1, #6]
+
+2:
+ add x8, x8, w15, uxtw #1
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ dup v3.4h, w4
+ mvn x7, x7 // ~dif
+ b.hs 9f
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f
+ ldrb w8, [x3], #1
+ lsl x8, x8, x5
+ eor x7, x7, x8
+ subs w5, w5, #8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ lsl w15, w15, #1
+ sub w15, w15, #5
+ lsr x12, x7, #48
+ adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
+ dup v1.8h, w12
+ b.cc 1b // loop if !carry
+ add w13, w13, #30
+ str w6, [x0, #CNT]
+ add sp, sp, #48
+ str x7, [x0, #DIF]
+ lsr w0, w13, #1
+ ret
+endfunc
+
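
msac_decode_hi_tok_neon folds what is otherwise up to four 3-symbol adapt4 decodes into one loop; the running counter in w13 (note the "carry = tok_br < 3 || tok == 15" comment) is turned into the token by the final "lsr w0, w13, #1". A rough scalar model of the value it returns, illustrative only and not dav1d's actual C fallback:

    #include <stdio.h>

    /* Rough scalar model: up to four 3-symbol stages are decoded from the same
     * cdf, stopping at the first stage whose result is below 3. The per-stage
     * results come from a fixed array here instead of the real
     * dav1d_msac_decode_symbol_adapt4(). */
    static unsigned hi_tok_model(const unsigned stage_ret[4]) {
        unsigned base = 3;
        for (int stage = 0;; stage++) {
            const unsigned tok_br = stage_ret[stage];
            if (tok_br < 3 || stage == 3)   /* at most four stages, token <= 15 */
                return base + tok_br;
            base += 3;
        }
    }

    int main(void) {
        const unsigned a[4] = { 1, 0, 0, 0 };  /* first stage ends it: token 4  */
        const unsigned b[4] = { 3, 3, 3, 3 };  /* all stages taken:    token 15 */
        printf("%u %u\n", hi_tok_model(a), hi_tok_model(b));
        return 0;
    }
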
function msac_decode_bool_equi_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S
index 3332c85223d..fc0e0d04f1c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S
@@ -170,6 +170,18 @@
trn2 \r3\().2s, \t5\().2s, \t7\().2s
.endm
+.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4s, \r0\().4s, \r1\().4s
+ trn2 \t5\().4s, \r0\().4s, \r1\().4s
+ trn1 \t6\().4s, \r2\().4s, \r3\().4s
+ trn2 \t7\().4s, \r2\().4s, \r3\().4s
+
+ trn1 \r0\().2d, \t4\().2d, \t6\().2d
+ trn2 \r2\().2d, \t4\().2d, \t6\().2d
+ trn1 \r1\().2d, \t5\().2d, \t7\().2d
+ trn2 \r3\().2d, \t5\().2d, \t7\().2d
+.endm
+
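
transpose_4x4s extends the existing transpose helpers to 32-bit lanes: the trn1/trn2 pass on .4s lanes followed by the trn1/trn2 pass on .2d lanes forms a full 4x4 transpose of the four input registers. For reference, the scalar effect is simply an in-place matrix transpose:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar reference for what transpose_4x4s computes: column j of the
     * input registers becomes row j of the output. */
    static void transpose_4x4s_ref(int32_t m[4][4]) {
        for (int i = 0; i < 4; i++)
            for (int j = i + 1; j < 4; j++) {
                const int32_t t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }

    int main(void) {
        int32_t m[4][4];
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                m[i][j] = i * 4 + j;
        transpose_4x4s_ref(m);
        for (int i = 0; i < 4; i++)
            printf("%2d %2d %2d %2d\n", m[i][0], m[i][1], m[i][2], m[i][3]);
        return 0;
    }
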
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
index 6b1d46fcd81..1cd0955d4e9 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
@@ -93,6 +93,7 @@
.global EXTERN\name
#ifdef __ELF__
.type EXTERN\name, %function
+ .hidden EXTERN\name
#endif
#if HAVE_AS_FUNC
.func EXTERN\name
@@ -109,7 +110,7 @@ EXTERN\name:
\name:
.endm
-.macro const name, align=2
+.macro const name, export=0, align=2
.macro endconst
#ifdef __ELF__
.size \name, . - \name
@@ -124,6 +125,13 @@ EXTERN\name:
.const_data
#endif
.align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#endif
+EXTERN\name:
+ .endif
\name:
.endm
@@ -135,4 +143,9 @@ EXTERN\name:
#define X(x) CONCAT(EXTERN, x)
+#if ARCH_AARCH64
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+#endif
+
#endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c
index f9c68e9eb75..ad418f2db59 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c
@@ -29,32 +29,32 @@
#include "src/itx.h"
#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
#define decl_itx12_fns(w, h, opt) \
decl_itx2_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
#define decl_itx16_fns(w, h, opt) \
decl_itx12_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
#define decl_itx17_fns(w, h, opt) \
decl_itx16_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
decl_itx17_fns( 4, 4, neon);
decl_itx16_fns( 4, 8, neon);
@@ -71,16 +71,16 @@ decl_itx2_fns (32, 8, neon);
decl_itx2_fns (32, 16, neon);
decl_itx2_fns (32, 32, neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_neon);
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
-COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) {
+COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
- dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
#define assign_itx1_fn(pfx, w, h, ext) \
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
@@ -117,7 +117,9 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 && ARCH_AARCH64
+ if (bpc > 10) return;
+
+#if ARCH_AARCH64 || BITDEPTH == 8
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
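
The renames above route every symbol through BF(), which pastes the bitdepth suffix onto the base name so the same init file serves both template instantiations, while the new "if (bpc > 10) return;" keeps these NEON paths (whose 16 bpc variants clamp to 10-bit, see the 0x3ff constants in itx16.S above) off 12-bit streams. A toy version of the pasting pattern; dav1d's real BF macro may be defined differently:

    #include <stdio.h>

    /* Toy illustration only; the suffix handling and helper names are made up. */
    #define BITDEPTH 16

    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #else
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif

    static void BF(demo_inv_txfm, neon)(void) { puts("demo_inv_txfm_16bpc_neon"); }

    int main(void) {
        BF(demo_inv_txfm, neon)();  /* expands to demo_inv_txfm_16bpc_neon() */
        return 0;
    }
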
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/msac.h b/chromium/third_party/dav1d/libdav1d/src/arm/msac.h
index a243a06295d..9db0bf86aea 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/msac.h
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/msac.h
@@ -34,14 +34,16 @@ unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#if ARCH_AARCH64
+#if ARCH_AARCH64 || defined(__ARM_NEON)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
diff --git a/chromium/third_party/dav1d/libdav1d/src/decode.c b/chromium/third_party/dav1d/libdav1d/src/decode.c
index a5646c648e6..f6782153c14 100644
--- a/chromium/third_party/dav1d/libdav1d/src/decode.c
+++ b/chromium/third_party/dav1d/libdav1d/src/decode.c
@@ -3302,7 +3302,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
#define assign_bitdepth_case(bd) \
dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
- dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
+ dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
diff --git a/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm b/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm
index a6a8fb7c6b8..c252e5451be 100644
--- a/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm
@@ -358,7 +358,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
-%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
%assign %%pad 0
@@ -403,7 +403,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endif
%endmacro
-%macro SETUP_STACK_POINTER 1
+%macro SETUP_STACK_POINTER 0-1 0
%ifnum %1
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.c b/chromium/third_party/dav1d/libdav1d/src/getbits.c
index c185053bd98..7bb20140e41 100644
--- a/chromium/third_party/dav1d/libdav1d/src/getbits.c
+++ b/chromium/third_party/dav1d/libdav1d/src/getbits.c
@@ -27,6 +27,8 @@
#include "config.h"
+#include <limits.h>
+
#include "common/intops.h"
#include "src/getbits.h"
@@ -34,6 +36,8 @@
void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data,
const size_t sz)
{
+ // If sz were 0, c->eof would need to be initialized to 1.
+ assert(sz);
c->ptr = c->ptr_start = data;
c->ptr_end = &c->ptr_start[sz];
c->bits_left = 0;
@@ -77,25 +81,23 @@ int dav1d_get_sbits(GetBits *const c, const unsigned n) {
return res >> shift;
}
-unsigned dav1d_get_uleb128(GetBits *c) {
- unsigned val = 0, more, i = 0;
+unsigned dav1d_get_uleb128(GetBits *const c) {
+ uint64_t val = 0;
+ unsigned i = 0, more;
do {
- more = dav1d_get_bits(c, 1);
- unsigned bits = dav1d_get_bits(c, 7);
- if (i <= 3 || (i == 4 && bits < (1 << 4)))
- val |= bits << (i * 7);
- else if (bits) {
- c->error = 1;
- return 0;
- }
- if (more && ++i == 8) {
- c->error = 1;
- return 0;
- }
- } while (more);
+ const int v = dav1d_get_bits(c, 8);
+ more = v & 0x80;
+ val |= ((uint64_t) (v & 0x7F)) << i;
+ i += 7;
+ } while (more && i < 56);
+
+ if (val > UINT_MAX || more) {
+ c->error = 1;
+ return 0;
+ }
- return val;
+ return (unsigned) val;
}
unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
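
The rewritten dav1d_get_uleb128 consumes whole bytes, accumulates 7 payload bits per byte into a 64-bit value, stops after at most 8 bytes, and reports an error when the value exceeds UINT_MAX or the continuation bit is still set. The same decoding rule over a plain byte array, as a self-contained sketch (the byte reader below is a stand-in, not the real GetBits):

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the decoding rule used above: 7 payload bits per byte, at most
     * 8 bytes, error if the result does not fit in an unsigned int or the
     * continuation bit never clears. */
    static unsigned uleb128(const uint8_t *p, size_t n, int *err) {
        uint64_t val = 0;
        unsigned i = 0, more;
        size_t pos = 0;
        do {
            const int v = pos < n ? p[pos++] : 0;
            more = v & 0x80;
            val |= ((uint64_t) (v & 0x7F)) << i;
            i += 7;
        } while (more && i < 56);
        *err = (val > UINT_MAX || more);
        return *err ? 0 : (unsigned) val;
    }

    int main(void) {
        const uint8_t buf[] = { 0xE5, 0x8E, 0x26 };  /* LEB128 for 624485 */
        int err;
        printf("%u err=%d\n", uleb128(buf, sizeof(buf), &err), err);
        return 0;
    }
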
diff --git a/chromium/third_party/dav1d/libdav1d/src/itx.h b/chromium/third_party/dav1d/libdav1d/src/itx.h
index 3befc420994..a299629c5cd 100644
--- a/chromium/third_party/dav1d/libdav1d/src/itx.h
+++ b/chromium/third_party/dav1d/libdav1d/src/itx.h
@@ -43,8 +43,8 @@ typedef struct Dav1dInvTxfmDSPContext {
itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
} Dav1dInvTxfmDSPContext;
-bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c);
-bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c);
+bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc);
bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c);
#endif /* DAV1D_SRC_ITX_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
index 02f34e85c92..a0e807f9550 100644
--- a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
@@ -180,7 +180,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
dst[x] = iclip_pixel(dst[x] + *c++);
}
-COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#define assign_itx_all_fn64(w, h, pfx) \
c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
inv_txfm_add_dct_dct_##w##x##h##_c
@@ -224,8 +224,6 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
inv_txfm_add_identity_adst_##w##x##h##_c; \
- memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
-
c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
assign_itx_all_fn84( 4, 4, );
assign_itx_all_fn84( 4, 8, R);
@@ -249,7 +247,7 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_itx_dsp_init_arm)(c);
+ bitfn(dav1d_itx_dsp_init_arm)(c, bpc);
#endif
#if ARCH_X86
bitfn(dav1d_itx_dsp_init_x86)(c);
diff --git a/chromium/third_party/dav1d/libdav1d/src/log.c b/chromium/third_party/dav1d/libdav1d/src/log.c
index 999e3a2e8a0..de6776a617e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/log.c
+++ b/chromium/third_party/dav1d/libdav1d/src/log.c
@@ -36,13 +36,13 @@
#include "src/internal.h"
#include "src/log.h"
+#if CONFIG_LOG
COLD void dav1d_log_default_callback(void *const cookie,
const char *const format, va_list ap)
{
vfprintf(stderr, format, ap);
}
-#if CONFIG_LOG
COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
validate_input(c != NULL);
diff --git a/chromium/third_party/dav1d/libdav1d/src/log.h b/chromium/third_party/dav1d/libdav1d/src/log.h
index 8f6357cb660..df32de7f253 100644
--- a/chromium/third_party/dav1d/libdav1d/src/log.h
+++ b/chromium/third_party/dav1d/libdav1d/src/log.h
@@ -35,12 +35,12 @@
#include "common/attributes.h"
-void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
-
#if CONFIG_LOG
#define dav1d_log dav1d_log
+void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3);
#else
+#define dav1d_log_default_callback NULL
#define dav1d_log(...) do { } while(0)
#endif
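
With CONFIG_LOG disabled, dav1d_log_default_callback now expands to NULL, so anything that stores or invokes it has to treat a NULL callback as "logging off". A minimal sketch of that calling pattern (the struct below is a simplified stand-in, not the real Dav1dLogger):

    #include <stdarg.h>
    #include <stdio.h>

    typedef struct {
        void *cookie;
        void (*callback)(void *cookie, const char *format, va_list ap);
    } Logger;

    static void default_cb(void *cookie, const char *format, va_list ap) {
        (void) cookie;
        vfprintf(stderr, format, ap);
    }

    static void log_msg(const Logger *l, const char *format, ...) {
        if (!l->callback) return;   /* NULL means logging is compiled out */
        va_list ap;
        va_start(ap, format);
        l->callback(l->cookie, format, ap);
        va_end(ap);
    }

    int main(void) {
        Logger on  = { NULL, default_cb };
        Logger off = { NULL, NULL };     /* what the macro now expands to */
        log_msg(&on,  "logging enabled: %d\n", 1);
        log_msg(&off, "never printed\n");
        return 0;
    }
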
diff --git a/chromium/third_party/dav1d/libdav1d/src/meson.build b/chromium/third_party/dav1d/libdav1d/src/meson.build
index 1a7114a870e..fd8ad0269c1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/meson.build
+++ b/chromium/third_party/dav1d/libdav1d/src/meson.build
@@ -102,6 +102,8 @@ if is_asm_enabled
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/64/itx.S',
'arm/64/looprestoration_common.S',
'arm/64/msac.S',
)
@@ -110,7 +112,6 @@ if is_asm_enabled
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
- 'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
@@ -121,6 +122,7 @@ if is_asm_enabled
libdav1d_sources += files(
'arm/64/cdef16.S',
'arm/64/ipred16.S',
+ 'arm/64/itx16.S',
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',
@@ -128,12 +130,14 @@ if is_asm_enabled
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
+ 'arm/32/msac.S',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
+ 'arm/32/itx.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
@@ -149,14 +153,9 @@ if is_asm_enabled
libdav1d_sources += files(
'x86/cpu.c',
+ 'x86/msac_init.c',
)
- if host_machine.cpu_family() == 'x86_64'
- libdav1d_sources += files(
- 'x86/msac_init.c',
- )
- endif
-
libdav1d_tmpl_sources += files(
'x86/cdef_init_tmpl.c',
'x86/film_grain_init_tmpl.c',
@@ -189,7 +188,7 @@ if is_asm_enabled
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
'x86/looprestoration_ssse3.asm',
- 'x86/mc_ssse3.asm',
+ 'x86/mc_sse.asm',
)
endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/msac.c b/chromium/third_party/dav1d/libdav1d/src/msac.c
index afd42543081..8195977d578 100644
--- a/chromium/third_party/dav1d/libdav1d/src/msac.c
+++ b/chromium/third_party/dav1d/libdav1d/src/msac.c
@@ -198,12 +198,11 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;
+ ctx_refill(s);
#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
dav1d_msac_init_x86(s);
#endif
-
- ctx_refill(s);
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
index 457d9712497..8e96f8e16ad 100644
--- a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
@@ -1071,15 +1071,15 @@ static int warp_affine(Dav1dTileContext *const t,
const int height = (refp->p.p.h + ss_ver) >> ss_ver;
for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
// calculate transformation relative to center of 8x8 block in
// luma pixel units
const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
- const int src_y = t->by * 4 + ((y + 4) << ss_ver);
- const int64_t mvx = ((int64_t) mat[2] * src_x +
- (int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
- const int64_t mvy = ((int64_t) mat[4] * src_x +
- (int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
+ const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
const int dx = (int) (mvx >> 16) - 4;
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
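
The warp_affine change is loop-invariant code motion: src_y and the two mat[]*src_y products depend only on the row, so they are now computed once per 8-row strip instead of once per 8x8 block. Reduced to a standalone sketch with simplified stand-in names:

    #include <stdint.h>

    /* Illustrative shape of the hoisted loop; names and context are simplified
     * stand-ins for the real warp_affine. Writes mvx/mvy per 8x8 block. */
    void warp_block_mvs(const int32_t mat[6], int bx, int by, int bw, int bh,
                        int ss_hor, int ss_ver, int64_t *mvs)
    {
        for (int y = 0; y < bh; y += 8) {
            const int src_y = by * 4 + ((y + 4) << ss_ver);
            const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; /* x-invariant */
            const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; /* x-invariant */
            for (int x = 0; x < bw; x += 8) {
                const int src_x = bx * 4 + ((x + 4) << ss_hor);
                *mvs++ = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; /* mvx */
                *mvs++ = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; /* mvy */
            }
        }
    }
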
diff --git a/chromium/third_party/dav1d/libdav1d/src/refmvs.c b/chromium/third_party/dav1d/libdav1d/src/refmvs.c
index 2039bed4fe4..1e113b4eacf 100644
--- a/chromium/third_party/dav1d/libdav1d/src/refmvs.c
+++ b/chromium/third_party/dav1d/libdav1d/src/refmvs.c
@@ -182,10 +182,13 @@ static inline union mv mv_projection(const union mv mv, const int num, const int
};
assert(den > 0 && den < 32);
assert(num > -32 && num < 32);
- const int dm = div_mult[den];
- const int y = mv.y * num * dm, x = mv.x * num * dm;
- return (union mv) { .y = (y + 8192 + (y >> 31)) >> 14,
- .x = (x + 8192 + (x >> 31)) >> 14 };
+ const int frac = num * div_mult[den];
+ const int y = mv.y * frac, x = mv.x * frac;
+ // Round and clip according to AV1 spec section 7.9.3
+ return (union mv) { // 0x3fff == (1 << 14) - 1
+ .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+ .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+ };
}
static void add_temporal_candidate(const refmvs_frame *const rf,
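
The projection in mv_projection scales each component by frac = num * div_mult[den] (div_mult[den] is roughly 16384/den, a Q14 reciprocal), rounds to nearest (the (y >> 31) term compensates for negative values), and now clamps the result to +/-(1 << 14) - 1 as the spec reference in the comment requires. The same arithmetic in isolation, with clip() standing in for dav1d's iclip():

    #include <stdio.h>

    static int clip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    /* Hedged sketch of the rounding/clamping used above. */
    static int mv_project(int mv_comp, int frac) {
        const int v = mv_comp * frac;
        return clip((v + 8192 + (v >> 31)) >> 14, -0x3fff, 0x3fff);
    }

    int main(void) {
        /* den = 2 -> div_mult[2] = 8192, num = 1: halve the component */
        printf("%d %d\n", mv_project(100, 8192), mv_project(-100, 8192));
        /* a large product shows why the explicit clamp matters */
        printf("%d\n", mv_project(32000, 16384));
        return 0;
    }
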
diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.c b/chromium/third_party/dav1d/libdav1d/src/tables.c
index c0466193fa8..30d9fa6ae1a 100644
--- a/chromium/third_party/dav1d/libdav1d/src/tables.c
+++ b/chromium/third_party/dav1d/libdav1d/src/tables.c
@@ -442,7 +442,7 @@ const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
0
};
-const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
+const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = {
[DAV1D_FILTER_8TAP_REGULAR] = {
{ 0, 1, -3, 63, 4, -1, 0, 0 },
{ 0, 1, -5, 61, 9, -2, 0, 0 },
@@ -524,6 +524,27 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
{ 0, 0, 2, 20, 31, 11, 0, 0 },
{ 0, 0, 2, 18, 31, 13, 0, 0 },
{ 0, 0, 1, 17, 31, 15, 0, 0 }
+#if ARCH_X86_64
+ /* Scaled bilinear filtering is used very rarely, so instead of dedicated
+ * code, add an extra filter table entry and reuse the put/prep_8tap_scaled
+ * code, which then acts as a scaled bilinear filter. */
+ }, [5] = {
+ { 0, 0, 0, 60, 4, 0, 0, 0 },
+ { 0, 0, 0, 56, 8, 0, 0, 0 },
+ { 0, 0, 0, 52, 12, 0, 0, 0 },
+ { 0, 0, 0, 48, 16, 0, 0, 0 },
+ { 0, 0, 0, 44, 20, 0, 0, 0 },
+ { 0, 0, 0, 40, 24, 0, 0, 0 },
+ { 0, 0, 0, 36, 28, 0, 0, 0 },
+ { 0, 0, 0, 32, 32, 0, 0, 0 },
+ { 0, 0, 0, 28, 36, 0, 0, 0 },
+ { 0, 0, 0, 24, 40, 0, 0, 0 },
+ { 0, 0, 0, 20, 44, 0, 0, 0 },
+ { 0, 0, 0, 16, 48, 0, 0, 0 },
+ { 0, 0, 0, 12, 52, 0, 0, 0 },
+ { 0, 0, 0, 8, 56, 0, 0, 0 },
+ { 0, 0, 0, 4, 60, 0, 0, 0 }
+#endif
}
};
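
The appended [5] entry expresses the 2-tap bilinear weights in 8-tap form: row i (the i/16 phase) is {0, 0, 0, 64 - 4*i, 4*i, 0, 0, 0}, so every row keeps the same total weight of 64 as the genuine subpel filters and the existing 8tap_scaled rounding carries over unchanged. A quick illustrative check of that pattern:

    #include <stdio.h>

    /* Prints the synthesized bilinear rows and confirms each sums to 64. */
    int main(void) {
        for (int i = 1; i <= 15; i++) {
            const int taps[8] = { 0, 0, 0, 64 - 4 * i, 4 * i, 0, 0, 0 };
            int sum = 0;
            for (int j = 0; j < 8; j++) sum += taps[j];
            printf("mx=%2d: { %2d, %2d } sum=%d\n", i, taps[3], taps[4], sum);
        }
        return 0;
    }
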
diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.h b/chromium/third_party/dav1d/libdav1d/src/tables.h
index 8d2d8456cd9..abcf26592f0 100644
--- a/chromium/third_party/dav1d/libdav1d/src/tables.h
+++ b/chromium/third_party/dav1d/libdav1d/src/tables.h
@@ -110,7 +110,7 @@ extern const int8_t dav1d_cdef_directions[12][2];
extern const int16_t dav1d_sgr_params[16][4];
extern const uint8_t dav1d_sgr_x_by_x[256];
-extern const int8_t dav1d_mc_subpel_filters[5][15][8];
+extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int8_t dav1d_resize_filter[64][8];
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm
index 20ac75fff0a..643caa0cf99 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm
@@ -459,14 +459,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
movifnidn prid, prim
sub dampingd, 31
movifnidn secdmpd, secdmpm
- or prid, 0
+ test prid, prid
jz .sec_only
movd xm0, prid
lzcnt pridmpd, prid
add pridmpd, dampingd
cmovs pridmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
- or secdmpd, 0
+ test secdmpd, secdmpd
jz .pri_only
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
@@ -1468,14 +1468,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
movifnidn prid, prim
sub dampingd, 31
movifnidn secdmpd, secdmpm
- or prid, 0
+ test prid, prid
jz .border_sec_only
movd xm0, prid
lzcnt pridmpd, prid
add pridmpd, dampingd
cmovs pridmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
- or secdmpd, 0
+ test secdmpd, secdmpd
jz .border_pri_only
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm
index e73c09ed829..ad05b3b1fdc 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm
@@ -1412,7 +1412,6 @@ ALIGN function_align
mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
pcmpgtb m1, m2
pmovmskb r5d, m1
- popcnt r5d, r5d ; sets ZF which can be used by caller
ret
.w4_no_upsample:
%assign stack_offset org_stack_offset
@@ -1423,7 +1422,9 @@ ALIGN function_align
lea maxbased, [hq+3]
call .filter_strength
mov maxbased, 7
+ test r5d, r5d
jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
vpbroadcastd m7, [base+pb_8]
vbroadcasti128 m2, [tlq-1]
pminub m1, m7, [base+z_filter_s]
@@ -1596,7 +1597,9 @@ ALIGN function_align
test angled, 0x400
jnz .w8_no_intra_edge_filter
call .filter_strength
+ test r5d, r5d
jz .w8_main ; filter_strength == 0
+ popcnt r5d, r5d
movu xm2, [tlq]
pminub xm1, xm0, [base+z_filter_s+14]
vinserti128 m2, [tlq-1], 1
@@ -1698,7 +1701,9 @@ ALIGN function_align
test angled, 0x400
jnz .w16_no_intra_edge_filter
call .filter_strength
+ test r5d, r5d
jz .w16_main ; filter_strength == 0
+ popcnt r5d, r5d
vpbroadcastd m1, [base+pb_12]
vbroadcasti128 m6, [base+z_filter_s+8]
vinserti128 m2, m6, [base+z_filter_s], 0
@@ -2205,7 +2210,6 @@ ALIGN function_align
pand m0, m8, m7
pcmpgtb m0, m9
pmovmskb r3d, m0
- popcnt r3d, r3d
ret
ALIGN function_align
.upsample_above: ; w4/w8
@@ -2255,7 +2259,9 @@ ALIGN function_align
lea r3d, [hq+3]
sub angled, 1112 ; angle - 90
call .filter_strength
+ test r3d, r3d
jz .w4_no_filter_above
+ popcnt r3d, r3d
vpbroadcastd xm2, [base+pb_4]
pminub xm2, [base+z_filter_s]
vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2290,9 +2296,10 @@ ALIGN function_align
pand xm0, xm8 ; reuse from previous filter_strength call
pcmpgtb xm0, xm9
pmovmskb r3d, xm0
- popcnt r3d, r3d
.w4_filter_left:
+ test r3d, r3d
jz .w4_main
+ popcnt r3d, r3d
mov r5d, 10
cmp hd, 16
movu xm2, [rsp+49]
@@ -2443,7 +2450,9 @@ ALIGN function_align
lea r3d, [hq+7]
sub angled, 90 ; angle - 90
call .filter_strength
+ test r3d, r3d
jz .w8_no_filter_above
+ popcnt r3d, r3d
vpbroadcastd xm3, [base+pb_8]
pminub xm3, [base+z_filter_s+8]
vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2476,9 +2485,10 @@ ALIGN function_align
pand m0, m8
pcmpgtb m0, m9
pmovmskb r3d, m0
- popcnt r3d, r3d
.w8_filter_left:
+ test r3d, r3d
jz .w8_main
+ popcnt r3d, r3d
vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
@@ -2650,7 +2660,9 @@ ALIGN function_align
lea r3d, [hq+15]
sub angled, 90
call .filter_strength
+ test r3d, r3d
jz .w16_no_filter_above
+ popcnt r3d, r3d
vbroadcasti128 m6, [tlq+1]
mova xm2, [base+z_filter_s]
vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de
@@ -2683,8 +2695,9 @@ ALIGN function_align
pand m0, m8
pcmpgtb m0, m9
pmovmskb r3d, m0
- popcnt r3d, r3d
+ test r3d, r3d
jz .w16_main
+ popcnt r3d, r3d
vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
@@ -3086,7 +3099,6 @@ ALIGN function_align
mova xm2, [r4+angleq*8]
pcmpgtb m1, m2
pmovmskb r5d, m1
- popcnt r5d, r5d
ret
.h4_no_upsample:
%assign stack_offset org_stack_offset
@@ -3097,7 +3109,9 @@ ALIGN function_align
lea maxbased, [wq+3]
call .filter_strength
mov maxbased, 7
+ test r5d, r5d
jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
vpbroadcastd m7, [base+pb_7]
vbroadcasti128 m2, [tlq-14]
pmaxub m1, m7, [base+z_filter_s-4]
@@ -3288,7 +3302,9 @@ ALIGN function_align
test angled, 0x400
jnz .h8_no_intra_edge_filter
call .filter_strength
+ test r5d, r5d
jz .h8_main ; filter_strength == 0
+ popcnt r5d, r5d
vpbroadcastd xm6, [base+pb_15]
pcmpeqb xm1, xm1
psubusb xm6, xm0
@@ -3444,7 +3460,9 @@ ALIGN function_align
test angled, 0x400
jnz .h16_no_intra_edge_filter
call .filter_strength
+ test r5d, r5d
jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
vpbroadcastd m11, [base+pb_27]
vpbroadcastd m1, [base+pb_1]
vbroadcasti128 m6, [base+z_filter_s+12]
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm
index c78c1531dd2..f27b90032f3 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm
@@ -884,7 +884,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m5, [o(pw_2896_2896)]
ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
vpbroadcastd m0, [o(pw_m2896_2896)]
- ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4, ; t13a t10a
+ ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a
punpckhqdq m0, m8, m3 ; t15a t14
punpcklqdq m8, m3 ; t8a t9
shufps m5, m4, m2, q1032 ; t12 t13a
@@ -1170,7 +1170,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
paddsw m3, m8
jmp m(iadst_4x16_internal).end2
-%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
movq xm%3, [dstq ]
movhps xm%3, [dstq+%5]
movq xm%4, [dstq+%6]
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm
index 801c3d721fe..3e3c35c34af 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm
@@ -51,9 +51,12 @@ cextern sgr_x_by_x
SECTION .text
INIT_YMM avx2
-cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
+ mov edged, edgem
vpbroadcastb m15, [fhq+0]
+ movifnidn wd, wm
vpbroadcastb m14, [fhq+2]
+ mov hd, hm
vpbroadcastb m13, [fhq+4]
vpbroadcastw m12, [fhq+6]
vpbroadcastd m11, [pw_2048]
@@ -64,7 +67,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
; if (edge & has_right) align_w_to_32
; else w -= 32, and use that as limit in x loop
- test edged, 2 ; has_right
+ test edgeb, 2 ; has_right
jnz .align
mov xlimq, -3
jmp .loop
@@ -80,7 +83,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
lea xq, [wq+xlimq]
; load left edge pixels
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .emu_left
test leftq, leftq ; left == NULL for the edge-extended bottom/top
jz .load_left_combined
@@ -203,17 +206,19 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
jg .loop
RET
-cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
- vpbroadcastd m14, [fvq+4]
- vpbroadcastd m15, [fvq]
- vpbroadcastd m13, [pw_0_128]
- paddw m14, m13
+cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
+ movifnidn fvq, fvmp
+ mov edged, edgem
+ movifnidn hd, hm
+ vpbroadcastd m10, [fvq]
+ vpbroadcastd m11, [fvq+4]
+ vpbroadcastd m0, [pw_0_128]
vpbroadcastd m12, [pd_1024]
DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ paddw m11, m0
+ and ylimd, 2 ; have_bottom
sub ylimd, 3
; main x loop for vertical filter, does one column of 16 pixels
@@ -221,7 +226,7 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
mova m3, [midq] ; middle line
; load top pixels
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jz .emu_top
mova m0, [midq-384*4]
mova m2, [midq-384*2]
@@ -276,27 +281,28 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
; try to structure the loop so that the common case is evaluated fastest
mova m6, [mptrq+384*6]
.loop:
- paddw m7, m0, m6
- paddw m8, m1, m5
- paddw m9, m2, m4
- punpcklwd m10, m7, m8
- punpckhwd m7, m8
- punpcklwd m11, m9, m3
- punpckhwd m9, m3
- pmaddwd m10, m15
- pmaddwd m7, m15
- pmaddwd m11, m14
- pmaddwd m9, m14
- paddd m10, m11
+ paddw m0, m6
+ paddw m7, m1, m5
+ paddw m8, m2, m4
+ punpcklwd m9, m0, m7
+ punpckhwd m0, m7
+ punpcklwd m7, m8, m3
+ punpckhwd m8, m3
+ pmaddwd m9, m10
+ pmaddwd m0, m10
+ pmaddwd m7, m11
+ pmaddwd m8, m11
+ add mptrq, 384*2
paddd m7, m9
- paddd m10, m12
+ paddd m0, m8
paddd m7, m12
- psrad m10, 11
+ paddd m0, m12
psrad m7, 11
- packssdw m10, m7
- packuswb m10, m10
- vpermq m10, m10, q3120
- mova [dstptrq], xm10
+ psrad m0, 11
+ packssdw m7, m0
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ mova [dstptrq], xm7
; shift pixels one position
mova m0, m1
mova m1, m2
@@ -305,51 +311,51 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
mova m4, m5
mova m5, m6
add dstptrq, strideq
- add mptrq, 384*2
dec yd
jg .loop_load
; for the bottom pixels, continue using m6 (as extended edge)
cmp yd, ylimd
jg .loop
-
- add dstq, 16
add midq, 32
+ add dstq, 16
sub wd, 16
jg .loop_x
RET
INIT_YMM avx2
-cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov xlimd, edged
+cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov xlimd, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mov edged, xlimd
and xlimd, 2 ; have_right
- add wd, xlimd
- xor xlimd, 2 ; 2*!have_right
- jnz .no_right
- add wd, 15
+ jz .no_right
+ add wd, 2+15
and wd, ~15
.no_right:
+ lea r10, [pb_right_ext_mask+32]
+ xor xlimd, 2 ; 2*!have_right
pxor m1, m1
- lea srcq, [srcq+wq]
+ add srcq, wq
lea sumq, [sumq+wq*2-2]
lea sumsqq, [sumsqq+wq*4-4]
neg wq
- lea r10, [pb_right_ext_mask+32]
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
- pinsrw xm0, [leftq+2], 7
+ vpbroadcastw xm0, [leftq+2]
add leftq, 4
jmp .expand_x
.no_left:
vpbroadcastb xm0, [srcq+xq]
jmp .expand_x
.load_left_from_main:
- pinsrw xm0, [srcq+xq-2], 7
+ vpbroadcastw xm0, [srcq+xq-2]
.expand_x:
punpckhbw xm0, xm1
@@ -359,8 +365,8 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
pmovzxbw m2, [srcq+xq]
- punpcklbw m3, m1
movu m4, [r10+xq*2]
+ punpcklbw m3, m1
pand m2, m4
pandn m4, m3
por m2, m4
@@ -380,22 +386,21 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
punpcklwd m5, m3, m2
punpckhwd m6, m3, m2
paddw m3, m4
- punpcklwd m7, m4, m1
+ punpcklwd m0, m4, m1
punpckhwd m4, m1
pmaddwd m5, m5
pmaddwd m6, m6
- pmaddwd m7, m7
+ pmaddwd m0, m0
pmaddwd m4, m4
- paddd m5, m7
- paddd m6, m4
paddw m3, m2
+ paddd m5, m0
+ vextracti128 xm0, m2, 1
+ paddd m6, m4
movu [sumq+xq*2], m3
- movu [sumsqq+xq*4+ 0], xm5
- movu [sumsqq+xq*4+16], xm6
+ movu [sumsqq+xq*4+ 0], xm5
+ movu [sumsqq+xq*4+16], xm6
vextracti128 [sumsqq+xq*4+32], m5, 1
vextracti128 [sumsqq+xq*4+48], m6, 1
-
- vextracti128 xm0, m2, 1
add xq, 16
; if x <= -16 we can reload more pixels
@@ -418,25 +423,25 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
RET
INIT_YMM avx2
-cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
mov xq, -2
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ and ylimd, 2 ; have_bottom
sub ylimd, 2 ; -2 if have_bottom=0, else 0
.loop_x:
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+ movu m6, [sum_ptrq+(384+16)*2*1]
mova m2, m0
mova m3, m1
mova m4, m0
mova m5, m1
- movu m6, [sum_ptrq+(384+16)*2*1]
mova m7, m6
mova m8, m6
jmp .loop_y_noload
@@ -550,8 +555,10 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
RET
INIT_YMM avx2
-cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
vpbroadcastd m15, [pw_16]
xor xd, xd
.loop_x:
@@ -654,75 +661,83 @@ cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
RET
INIT_YMM avx2
-cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt
- movd xm0, wtd
- vpbroadcastw m0, xm0
- psllw m0, 4
+cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+%ifidn wtd, wtm
+ shl wtd, 4
+ movd xm5, wtd
+ vpbroadcastw m5, xm5
+%else
+ vpbroadcastw m5, wtm
+ mov hd, hm
+ psllw m5, 4
+%endif
DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
xor idxd, idxd
.loop_x:
- mova m1, [tq+idxq*2+ 0]
- mova m4, [tq+idxq*2+32]
+ mova m0, [tq+idxq*2+ 0]
+ mova m1, [tq+idxq*2+32]
pmovzxbw m2, [dstq+idxq+ 0]
- pmovzxbw m5, [dstq+idxq+16]
- psllw m3, m2, 4
- psllw m6, m5, 4
- psubw m1, m3
- psubw m4, m6
- pmulhrsw m1, m0
- pmulhrsw m4, m0
- paddw m1, m2
- paddw m4, m5
- packuswb m1, m4
- vpermq m1, m1, q3120
- mova [dstq+idxq], m1
+ pmovzxbw m3, [dstq+idxq+16]
+ psllw m4, m2, 4
+ psubw m0, m4
+ psllw m4, m3, 4
+ psubw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+idxq], m0
add idxd, 32
cmp idxd, wd
jl .loop_x
+ add tq, 384*2
add dstq, strideq
- add tq, 384 * 2
dec hd
jg .loop_y
RET
INIT_YMM avx2
-cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- test edged, 2 ; have_right
+cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ test edgeb, 2 ; have_right
jz .no_right
xor xlimd, xlimd
- add wd, 2
- add wd, 15
+ add wd, 2+15
and wd, ~15
jmp .right_done
.no_right:
mov xlimd, 3
sub wd, 1
.right_done:
+ lea r10, [pb_right_ext_mask+32]
pxor m1, m1
lea srcq, [srcq+wq+1]
lea sumq, [sumq+wq*2-2]
lea sumsqq, [sumsqq+wq*4-4]
neg wq
- lea r10, [pb_right_ext_mask+32]
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
- movd xm0, [leftq]
- pinsrd xm0, [srcq+xq-1], 1
- pslldq xm0, 11
+ vpbroadcastd xm2, [leftq]
+ movd xm0, [srcq+xq-1]
add leftq, 4
+ palignr xm0, xm2, 1
jmp .expand_x
.no_left:
vpbroadcastb xm0, [srcq+xq-1]
jmp .expand_x
.load_left_from_main:
- pinsrd xm0, [srcq+xq-4], 3
+ vpbroadcastd xm0, [srcq+xq-4]
.expand_x:
punpckhbw xm0, xm1
@@ -734,8 +749,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
pmovzxbw m2, [srcq+xq]
- punpcklbw m3, m1
movu m4, [r10+xq*2]
+ punpcklbw m3, m1
pand m2, m4
pandn m4, m3
por m2, m4
@@ -775,8 +790,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
paddd m7, m9
paddd m3, m5
movu [sumq+xq*2], m0
- movu [sumsqq+xq*4+ 0], xm7
- movu [sumsqq+xq*4+16], xm3
+ movu [sumsqq+xq*4+ 0], xm7
+ movu [sumsqq+xq*4+16], xm3
vextracti128 [sumsqq+xq*4+32], m7, 1
vextracti128 [sumsqq+xq*4+48], m3, 1
@@ -795,35 +810,35 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
cmp xd, xlimd
jl .right_extend
+ add srcq, strideq
add sumsqq, (384+16)*4
add sumq, (384+16)*2
- add srcq, strideq
dec hd
jg .loop_y
RET
INIT_YMM avx2
-cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
mov xq, -2
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ and ylimd, 2 ; have_bottom
sub ylimd, 3 ; -3 if have_bottom=0, else -1
.loop_x:
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+ movu m10, [sum_ptrq+(384+16)*2*1]
mova m2, m0
mova m3, m1
mova m4, m0
mova m5, m1
mova m6, m0
mova m7, m1
- movu m10, [sum_ptrq+(384+16)*2*1]
mova m11, m10
mova m12, m10
mova m13, m10
@@ -833,10 +848,10 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right]
movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right]
- mova m2, m0
- mova m3, m1
movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
movu m12, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m2, m0
+ mova m3, m1
mova m11, m10
.loop_y:
movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
@@ -967,8 +982,10 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
RET
INIT_YMM avx2
-cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
vpbroadcastd m9, [pw_5_6]
vpbroadcastd m12, [pw_256]
psrlw m11, m12, 1 ; pw_128
@@ -1084,8 +1101,10 @@ cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
RET
INIT_YMM avx2
-cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt
- vpbroadcastd m0, [wtq]
+cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movifnidn hd, hm
+ vpbroadcastd m0, wtm
vpbroadcastd m10, [pd_1024]
DEFINE_ARGS dst, stride, t1, t2, w, h, idx
.loop_y:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c
index a1b25a90c8c..b0201ce3db1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c
@@ -169,7 +169,7 @@ void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
- const int16_t wt[2]); \
+ const uint32_t wt); \
\
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
@@ -194,7 +194,7 @@ static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
- const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \
+ const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
} \
}
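
sgr_weighted2 now receives both weights in a single uint32_t, with sgr_wt[0] in the low half and 128 - sgr_wt[0] - sgr_wt[1] in the high half, so the asm can broadcast the pair with one movd/vpbroadcastd. Packing and unpacking in isolation, with arbitrary sample weights:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the packing line above; the sample weights are illustrative. */
    int main(void) {
        const int16_t sgr_wt[2] = { -32, 31 };
        const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0];
        const int16_t wt0 = (int16_t) (wt & 0xffff);  /* == sgr_wt[0] */
        const int16_t wt1 = (int16_t) (wt >> 16);     /* == 128 - sgr_wt[0] - sgr_wt[1] */
        printf("packed=0x%08x wt0=%d wt1=%d\n", (unsigned) wt, wt0, wt1);
        return 0;
    }
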
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm
index 35a4ea53b8d..aaaea7835b5 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm
@@ -188,13 +188,13 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp+ 0]
- %define edged dword [esp+12]
+ %define edgeb byte [esp+12]
%define xlimd dword [esp+16]
%endif
; if (edge & has_right) align_w_to_16
; else w -= 3, and use that as limit in x loop
- test edged, 2 ; has_right
+ test edgeb, 2 ; has_right
jnz .align
mov xlimd, -3
jmp .loop
@@ -221,7 +221,7 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
%endif
; load left edge pixels
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .emu_left
test leftq, leftq ; left == NULL for the edge-extended bottom/top
jz .load_left_combined
@@ -477,7 +477,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
DEFINE_ARGS dst, stride, mid, w, h, y, edge
%define mptrq midq
%define dstptrq dstq
- %define edged dword [esp]
+ %define edgeb byte [esp]
%endif
; main x loop for vertical filter, does one column of 16 pixels
@@ -485,7 +485,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
mova m3, [midq] ; middle line
; load top pixels
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jz .emu_top
mova m0, [midq-384*4]
mova m2, [midq-384*2]
@@ -604,8 +604,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
mova m3, m4
mova m4, m5
mova m5, m6
- add dstptrq, strideq
add mptrq, 384*2
+ add dstptrq, strideq
dec yd
jg .loop_load
; for the bottom pixels, continue using m6 (as extended edge)
@@ -616,8 +616,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
mov midq, [esp+8]
mov dstq, [esp+4]
%endif
- add dstq, 8
add midq, 16
+ add dstq, 8
sub wd, 8
jg .loop_x
RET
@@ -679,7 +679,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
%define wq r0m
%define xlimd r1m
%define hd hmp
- %define edged edgemp
+ %define edgeb byte edgem
mov r6, edgem
and r6, 2 ; have_right
@@ -706,7 +706,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
@@ -795,11 +795,13 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
movifnidn edged, edgem
%else
-cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
%define sumsq_baseq dword [esp+0]
%define sum_baseq dword [esp+4]
%define ylimd dword [esp+8]
%define m8 [esp+12]
+ mov edged, r4m
+ mov hd, r3m
%endif
mov xq, -2
%if ARCH_X86_64
@@ -812,7 +814,7 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
.loop_x:
mov sumsqq, sumsq_baseq
mov sumq, sum_baseq
- lea yd, [hd+ylimd+2]
+ lea yd, [hq+ylimq+2]
%else
mov yd, edged
and yd, 8 ; have_bottom
@@ -824,12 +826,12 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
.loop_x:
mov sumsqd, sumsq_baseq
mov sumd, sum_baseq
- lea yd, [hd+2]
+ lea yd, [hq+2]
add yd, ylimd
%endif
lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
lea sumq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsqq+(384+16)*4*1]
movu m1, [sumsqq+(384+16)*4*1+16]
@@ -1180,10 +1182,10 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
%endif
+ add srcq, strideq
add aq, (384+16)*4
add bq, (384+16)*2
add tq, 384*2
- add srcq, strideq
dec yd
jg .loop_y
add xd, 8
@@ -1237,7 +1239,7 @@ cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xli
mova m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edged edgemp
+ %define edgeb byte edgem
%define wd xd
%define wq wd
%define wm r5m
@@ -1249,7 +1251,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
%define m11 [PIC_sym(pb_0_1)]
%endif
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .no_right
xor xlimd, xlimd
add wd, 2
@@ -1275,7 +1277,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
@@ -1401,9 +1403,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
cmp xd, xlimd
jl .right_extend
+ add srcq, strideq
add sumsqq, (384+16)*4
add sumq, (384+16)*2
- add srcq, strideq
dec hd
jg .loop_y
%if ARCH_X86_32
@@ -1434,7 +1436,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
lea yd, [hd+ylimd+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1520,7 +1522,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
lea yd, [ylimd+2]
add yd, hm
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- test dword edgem, 4 ; have_top
+ test byte edgem, 4 ; have_top
jnz .sumsq_load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1582,7 +1584,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
lea yd, [ylimd+2]
add yd, hm
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test dword edgem, 4 ; have_top
+ test byte edgem, 4 ; have_top
jnz .sum_load_top
movu m0, [sum_ptrq+(384+16)*2*1]
mova m1, m0
@@ -1882,7 +1884,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
movifnidn wd, wm
- mov wtq, wtmp
+ movd m0, wtm
%if ARCH_X86_64
movifnidn hd, hm
mova m10, [pd_1024]
@@ -1892,7 +1894,6 @@ cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
%define m10 [PIC_sym(pd_1024)]
%define m11 m7
%endif
- movd m0, [wtq]
pshufd m0, m0, 0
DEFINE_ARGS dst, stride, t1, t2, w, h, idx
%if ARCH_X86_32
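
Note on the recurring edged→edgeb change in this file: the edge-flag tests are narrowed from a 32-bit to an 8-bit memory operand, which is sufficient because only the low four bits are ever consulted. A hedged C sketch of the flag layout implied by the inline comments in the hunks above ("have_left" = 1, "has_right" = 2, "have_top" = 4, "have_bottom" = 8); the enumerator names are illustrative, not taken from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit positions taken from the comments in the hunks above. */
    enum {
        EDGE_HAVE_LEFT   = 1 << 0,
        EDGE_HAVE_RIGHT  = 1 << 1,
        EDGE_HAVE_TOP    = 1 << 2,
        EDGE_HAVE_BOTTOM = 1 << 3,
    };

    /* All four flags fit in the low byte, so testing a single byte of the
     * 32-bit edge word (what the new edgeb define does) is equivalent. */
    static int has_right_edge(uint32_t edges) {
        return (uint8_t)edges & EDGE_HAVE_RIGHT;
    }

    int main(void) {
        uint32_t edges = EDGE_HAVE_LEFT | EDGE_HAVE_TOP;
        printf("right? %d\n", has_right_edge(edges) != 0); /* prints "right? 0" */
        return 0;
    }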
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm
index 7ff0cac070a..5d769df8db4 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm
@@ -133,16 +133,23 @@ subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5,
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
pb_8x0_8x8: times 8 db 0
times 8 db 8
+bdct_lb_dw: times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
ALIGN 32
-resize_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf: times 5 db 0
db 1, 2, 3, 4, 5, 6
times 5+8 db 7
@@ -154,8 +161,11 @@ wm_422_sign: dd 0x80808080, 0x7f7f7f7f
wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
ALIGN 4
+pb_0123: db 0, 1, 2, 3
+pb_4567: db 4, 5, 6, 7
pw_m128 times 2 dw -128
pw_m256: times 2 dw -256
+pw_32: times 2 dw 32
pw_34: times 2 dw 34
pw_258: times 2 dw 258
pw_512: times 2 dw 512
@@ -163,11 +173,14 @@ pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
-pd_2: dd 2
-pd_32: dd 32
-pd_63: dd 63
-pd_512: dd 512
-pd_32768: dd 32768
+pd_2: dd 2
+pd_32: dd 32
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+pd_0x3ff: dd 0x3ff
+pd_0x4000: dd 0x4000
+pq_0x40000000: dq 0x40000000
%define pb_m64 (wm_sign_avx512+4)
%define pb_64 (wm_sign_avx512+8)
@@ -230,27 +243,53 @@ cextern mc_warp_filter
%endrep
%endmacro
+%macro SCALED_JMP_TABLE 1-*
+ %xdefine %1_table (%%table - %2)
+ %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+ %rep %0 - 1
+ dw %%base %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_1024:
+ %xdefine %1_dy1_table (%%dy_1024 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy1_w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_2048:
+ %xdefine %1_dy2_table (%%dy_2048 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy2_w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
-BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
-BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
%if HAVE_AVX512ICL
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
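
Note on SCALED_JMP_TABLE above: it emits three width-indexed word tables back to back (the base table plus dy1/dy2 variants for the dy == 1024 and dy == 2048 fast paths), and the dispatcher below indexes them with tzcnt of the width. A rough C model of that dispatch, shown for the put case whose first table entry is w == 2; function pointers stand in for the 16-bit offsets used by the real tables, and __builtin_ctz assumes GCC/Clang:

    #include <stdio.h>

    typedef void (*scaled_fn)(void);

    static void w2(void) { puts("w2"); }
    static void w4(void) { puts("w4"); }
    /* ... one entry per width up to 128 in the real tables ... */

    static const scaled_fn base_table[] = { w2, w4 };
    static const scaled_fn dy1_table[]  = { w2, w4 }; /* dy == 1024 */
    static const scaled_fn dy2_table[]  = { w2, w4 }; /* dy == 2048 */

    /* Width is a power of two, so tzcnt(w) - 1 indexes the put tables,
     * mirroring "tzcnt wd, wm" + "movzx wd, word [table+wq*2]"; the -1
     * corresponds to the "- %2" bias in the macro (prep starts at w == 4). */
    static void dispatch(int w, int dy) {
        const scaled_fn *tab = dy == 1024 ? dy1_table :
                               dy == 2048 ? dy2_table : base_table;
        tab[__builtin_ctz(w) - 1]();
    }

    int main(void) {
        dispatch(4, 1024); /* prints "w4" */
        dispatch(2, 0);    /* prints "w2" */
        return 0;
    }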
@@ -1943,19 +1982,22 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2
+ mov t0d, FILTER_%3
+ mov t1d, FILTER_%4
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
+%endif
+%endmacro
+
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif
-%macro PUT_8TAP_FN 3 ; type, type_h, type_v
-cglobal put_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
-%endif
-%endmacro
+
+%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN regular, REGULAR, REGULAR
PUT_8TAP_FN regular_sharp, REGULAR, SHARP
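
Note on the FN macro introduced above: it generalizes the old PUT_8TAP_FN so the same thin per-filter stubs (which only load the horizontal and vertical filter words into t0d/t1d and jump to the shared body) can also be generated for the new scaled functions. A hedged C sketch of that pattern; FILTER_SMOOTH and FILTER_SHARP use the values visible in the hunk, while FILTER_REGULAR is an assumed value defined above the excerpted context:

    #include <stdio.h>

    #define FILTER_SMOOTH  ((1*15 << 16) | 4*15)
    #define FILTER_SHARP   ((2*15 << 16) | 3*15)
    #define FILTER_REGULAR ((0*15 << 16) | 3*15) /* assumed, not shown in the hunk */

    /* Shared body: in the assembly this is the put_8tap / *_scaled entry
     * point that every stub tail-jumps into. */
    static void put_8tap_body(unsigned type_h, unsigned type_v) {
        printf("h=0x%x v=0x%x\n", type_h, type_v);
    }

    /* What one FN expansion amounts to: a tiny named stub per filter
     * combination that only sets the two type words (t0d/t1d). */
    static void put_8tap_smooth_sharp(void) { put_8tap_body(FILTER_SMOOTH, FILTER_SHARP); }
    static void put_8tap_regular(void)      { put_8tap_body(FILTER_REGULAR, FILTER_REGULAR); }

    int main(void) {
        put_8tap_smooth_sharp();
        put_8tap_regular();
        return 0;
    }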
@@ -3873,6 +3915,1853 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
RET
%endmacro
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ lea base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+96]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+r4*8]
+ movq xm10, [base+subpel_filters+r6*8]
+ movhps xm15, [base+subpel_filters+r7*8]
+ movhps xm10, [base+subpel_filters+r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+rX*8]
+ psrld m14, 10
+ mova [rsp], xm14
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.w8_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w8_loop
+ test myd, 0x400
+ mov [rsp+16], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .w8_skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+16]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .w8_loop
+.w8_skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+16]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .w8_loop
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ pshufd xm8, xm10, q0000
+ pshufd xm9, xm10, q1111
+ pshufd xm11, xm10, q3333
+ pshufd xm10, xm10, q2222
+ vpblendd m0, m2, 0xc0
+ pshufb m1, m14
+ pshufb m0, m14
+ pmaddubsw m1, m15
+ pmaddubsw m0, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ mova xm3, xm0
+ mova xm0, xm2
+ paddd xm5, xm13
+ paddd xm6, xm7
+ pshufb xm1, xm14
+ pmaddubsw xm1, xm15
+ phaddw xm1, xm1
+ pmulhrsw xm1, xm12
+ palignr xm7, xm1, xm4, 12
+ punpcklwd xm2, xm7, xm1 ; 67 78
+ pmaddwd xm7, xm2, xm11
+ mova xm4, xm1
+ paddd xm5, xm6
+ paddd xm5, xm7
+ psrad xm5, rndshift
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ vpermq m8, m8, q3120
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r11d, xm15, 1
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ movu xm2, [srcq+ssq*0]
+ movu xm3, [srcq+ssq*2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 1
+ vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
+ vinserti128 m2, [srcq+ssq*1], 1
+ vinserti128 m3, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m4, [srcq+ssq*1], 1
+ add srcq, ss3q
+ vpblendd m15, m7, 0x30
+ punpcklqdq m15, m15
+ pblendvb m15, m11, m8
+ movq xm10, r4q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ vinserti128 m10, xm10, 1
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb xm5, xm14
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m2, m3
+ phaddw m4, m5
+ pmulhrsw m2, m12
+ pmulhrsw m4, m12
+ palignr m5, m4, m2, 4
+ pshufd m3, m4, q2121
+ punpcklwd m0, m2, m5 ; 01 12
+ punpckhwd m1, m2, m5 ; 23 34
+ punpcklwd m2, m4, m3 ; 45 56
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ vinserti128 m11, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ mova m0, m1
+ mova m1, m2
+ paddd m4, m13
+ paddd m5, m6
+ pshufb m11, m14
+ vpermq m11, m11, q3120
+ pmaddubsw m11, m15
+ phaddw m11, m11
+ pmulhrsw m11, m12
+ palignr m6, m11, m3, 12
+ punpcklwd m2, m6, m11 ; 67 78
+ mova m3, m11
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ pshuflw xm4, xm4, q3120
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+.dy1_w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ mov [rsp+32], r7d
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ movu [rsp], m10
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ lea myd, [t1+myq]
+ mov t1d, 64 << 24
+ cmovnz t1q, [base+subpel_filters+myq*8]
+ vbroadcasti128 m14, [base+wswap]
+ movq xm11, t1q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r7d, [rsp+32]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+.dy1_w8_loop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_w8_loop
+.dy1_w16:
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+76], t0d
+ mov [rsp+80], srcq
+ mov [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+72]
+ jz .ret
+ add qword [rsp+88], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+32]
+ vpbroadcastd m15, [rsp+76]
+ pxor m9, m9
+ mov srcq, [rsp+80]
+ mov r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+32], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+64], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ movu [rsp], m10
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vbroadcasti128 m14, [base+wswap]
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r4d, [rsp+64]
+ mov r7d, [rsp+68]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*1]
+ movhps xm0, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ movhps xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vpblendd m0, m2, 0x30
+ vpblendd m1, m4, 0xc0
+ vpblendd m0, m3, 0xc0
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 01 23
+ punpckhwd xm2, xm1 ; 23 45
+.dy2_w2_loop:
+ movq xm6, [srcq+ssq*0]
+ vpbroadcastq m7, [srcq+ssq*1]
+ movhps xm6, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm3, xm8
+ pmaddwd xm5, xm2, xm9
+ vpblendd m6, m7, 0x30
+ vpblendd m6, m1, 0xc0
+ pshufb m6, m14
+ pmaddubsw m6, m15
+ phaddw m6, m6
+ pmulhrsw m6, m12
+ palignr m0, m6, m0, 8
+ pshufd m2, m0, q3221
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 45 67
+ punpckhwd xm2, xm1 ; 67 89
+ pmaddwd xm6, xm3, xm10
+ pmaddwd xm7, xm2, xm11
+ paddd xm4, xm5
+ paddd xm4, xm13
+ paddd xm6, xm7
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+ packuswb xm4, xm4
+ pextrw [dstq+dsq*0], xm4, 0
+ pextrw [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*2]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm1, [srcq+ssq*1]
+ movu xm3, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vinserti128 m15, xm15, 1
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m2, [srcq+ssq*0], 1
+ vinserti128 m3, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pblendvb m15, m11, m8
+ pshufb xm0, xm14
+ pshufb m2, m14
+ pshufb xm1, xm14
+ pshufb m3, m14
+ pmaddubsw xm0, xm15
+ pmaddubsw m2, m15
+ pmaddubsw xm1, xm15
+ pmaddubsw m3, m15
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ punpcklwd xm2, xm0, xm1
+ punpckhwd m1, m0, m1 ; 23 45
+ vinserti128 m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+ movu xm6, [srcq+ssq*0]
+ movu xm7, [srcq+ssq*1]
+ vinserti128 m6, [srcq+ssq*2], 1
+ vinserti128 m7, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psrld m2, m6, 16
+ pslld m3, m7, 16
+ paddw m6, m2
+ paddw m7, m3
+ pblendw m6, m7, 0xaa ; 67 89
+ pmulhrsw m6, m12
+ paddd m4, m5
+ vpblendd m0, m1, m6, 0x0f
+ mova m1, m6
+ vpermq m0, m0, q1032 ; 45 67
+ pmaddwd m6, m0, m10
+ pmaddwd m7, m1, m11
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+.dy2_w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ mov [rsp], r7d
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ lea myd, [t1+myq]
+ mov t1d, 64 << 24
+ cmovnz t1q, [base+subpel_filters+myq*8]
+ movq xm11, t1q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r7d, [rsp]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m14, m11, q2222
+ pshufd m11, m11, q3333
+.dy2_w8_loop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m14
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_w8_loop
+.dy2_w16:
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+64], t0d
+ mov [rsp+48], srcq
+ mov [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+40]
+ jz .ret
+ add qword [rsp+56], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp]
+ vpbroadcastd m15, [rsp+64]
+ pxor m9, m9
+ mov srcq, [rsp+48]
+ mov r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+32], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movq xm14, r4q
+ punpcklbw xm14, xm14
+ psraw xm14, 8
+ vinserti128 m14, xm14, 1
+ mov r4d, [rsp+32]
+ mov r7d, [rsp+36]
+ pshufd m8, m14, q0000
+ pshufd m9, m14, q1111
+ pshufd m11, m14, q2222
+ pshufd m14, m14, q3333
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m14
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+MC_8TAP_SCALED prep
+
%macro WARP_V 5 ; dst, 02, 46, 13, 57
; Can be done using gathers, but that's terribly slow on many CPU:s
lea tmp1d, [myq+deltaq*4]
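
Note on the scaled motion-compensation addressing used throughout the new code above: positions are kept in 1/1024-pel units (10 fractional bits). For each output column the horizontal position is mx + i*dx; bits 9..6 of the fraction select the subpel filter phase and the top bits give the integer source column, which is why dy == 1024 and dy == 2048 (steps of exactly one or two source rows) get the dedicated .dy1/.dy2 paths. A small C sketch of that per-column computation; the mx/dx values in main are hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    static void scaled_col(int32_t mx, int32_t dx, int i,
                           int *src_x, int *filter_phase) {
        int32_t pos = mx + i * dx;          /* paddd m14, m8  ; mx+dx*[0-7] */
        *filter_phase = (pos & 0x3ff) >> 6; /* pand with pd_0x3ff, psrld 6  */
        *src_x        = pos >> 10;          /* psrld m14, 10                */
    }

    int main(void) {
        int x, phase;
        for (int i = 0; i < 8; i++) {
            scaled_col((3 << 10) | 512, 1536, i, &x, &phase);
            printf("col %d -> src_x %d phase %d\n", i, x, phase);
        }
        return 0;
    }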
@@ -5010,7 +6899,7 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
vpbroadcastd m3, [base+pw_m256]
vpbroadcastd m7, [base+pd_63]
vbroadcasti128 m15, [base+pb_8x0_8x8]
- pmaddwd m2, m5, [base+resize_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
pslld m5, 3 ; dx*8
pslld m6, 14
paddd m8, m2 ; mx+[0..7]*dx
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c
index a21877c6671..a01ac14ab4a 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c
@@ -52,33 +52,65 @@ decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sse2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2);
decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sse2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2);
decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_sse2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2);
decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
+decl_mct_fn(dav1d_prep_bilin_sse2);
+
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
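
Note on the init changes that follow: the new mc_scaled/mct_scaled declarations above are wired into the DSP context by the init_mc_scaled_fn/init_mct_scaled_fn macros, using the same CPU-flag gating as the existing entries. A simplified C model of that pattern, as a sketch only; the flag constants and the stub function are illustrative stand-ins, not dav1d's real definitions:

    #include <stdio.h>

    enum { CPU_SSE2 = 1, CPU_SSSE3 = 2, CPU_AVX2 = 4 }; /* illustrative bits */

    typedef void (*mc_scaled_fn)(void);
    static void stub_put_8tap_scaled_regular_avx2(void) { puts("avx2 8tap scaled"); }

    static mc_scaled_fn mc_scaled[10]; /* indexed by FILTER_2D_* in the real code */

    /* Each tier fills its entries and the function returns early when a
     * required flag is missing, so faster tiers only overwrite slower ones
     * when the CPU actually supports them. */
    static void mc_dsp_init_x86(unsigned flags) {
        if (!(flags & CPU_SSE2))  return;
        /* ... SSE2-level mct entries ... */
        if (!(flags & CPU_SSSE3)) return;
        /* ... SSSE3-level mc/mct entries ... */
        if (!(flags & CPU_AVX2))  return;
        mc_scaled[0] = stub_put_8tap_scaled_regular_avx2; /* init_mc_scaled_fn(...) */
    }

    int main(void) {
        mc_dsp_init_x86(CPU_SSE2 | CPU_SSSE3 | CPU_AVX2);
        if (mc_scaled[0]) mc_scaled[0]();
        return 0;
    }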
@@ -123,12 +155,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mc[type] = dav1d_put_##name##_##suffix
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_##suffix
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = dav1d_put_##name##_##suffix
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = dav1d_prep_##name##_##suffix
+
const unsigned flags = dav1d_get_cpu_flags();
if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
return;
#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
c->warp8x8 = dav1d_warp_affine_8x8_sse2;
c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
#endif
@@ -137,16 +185,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
return;
#if BITDEPTH == 8
- init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3);
- init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
- init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
@@ -187,16 +235,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
return;
#if BITDEPTH == 8
- init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
- init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
- init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
- init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
- init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
- init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
@@ -209,6 +257,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
c->avg = dav1d_avg_avx2;
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
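
The new init_mc_scaled_fn/init_mct_scaled_fn defines above are plain token-pasting assignments into the MC DSP context, so e.g. init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2) expands to c->mct_scaled[FILTER_2D_8TAP_REGULAR] = dav1d_prep_8tap_scaled_regular_avx2;. A minimal standalone sketch of the same pattern, with hypothetical names standing in for the real dav1d symbols:

#include <stdio.h>

typedef void (*prep_scaled_fn)(void);
typedef struct { prep_scaled_fn mct_scaled[1]; } MCDSPContext; /* stand-in type */

enum { FILTER_2D_8TAP_REGULAR = 0 };

static void prep_8tap_scaled_regular_avx2(void) { puts("scaled prep, avx2"); }

#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = prep_##name##_##suffix

int main(void) {
    MCDSPContext ctx, *const c = &ctx;
    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
    c->mct_scaled[FILTER_2D_8TAP_REGULAR]();
    return 0;
}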
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_ssse3.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_sse.asm
index 8386897d42b..d98ac621eb9 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_ssse3.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_sse.asm
@@ -66,6 +66,8 @@ resize_shuf: times 5 db 0
pb_64: times 16 db 64
pw_m256: times 8 dw -256
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_34: times 8 dw 34
@@ -117,6 +119,7 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
%endrep
%endmacro
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
@@ -155,6 +158,8 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
%endif
%endmacro
+HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
@@ -738,15 +743,79 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
lea t0d, [hq+(7<<16)]
jmp .hv_w16gt
+%macro PSHUFB_0X1X 1-2 ; dst[, src]
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ punpcklbw %1, %1
+ psraw %1, 8
+ pshufd %1, %1, q0000
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ mova %2, %1
+ psrldq %1, 1
+ punpcklbw %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ mova %2, %1
+ psrldq %1, 1
+ punpckhbw %3, %1, %2
+ punpcklbw %1, %2
+ punpcklqdq %1, %3
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2
+ %else
+ punpckhwd %3, %1, %4
+ punpcklwd %1, %4
+ pmaddwd %3, %2
+ pmaddwd %1, %2
+ psrad %3, %5
+ psrad %1, %5
+ packssdw %1, %3
+ %endif
+%endmacro
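
For reference while reading the fallbacks above, a scalar sketch of the two SSSE3 instructions the PMADDUBSW and PMULHRSW macros emulate on SSE2 (per output lane; this mirrors the documented instruction semantics, not dav1d code):

#include <stdint.h>

static int16_t sat16(int32_t v) {
    return v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : (int16_t)v;
}

/* pmaddubsw: unsigned bytes of one operand times signed bytes of the other,
 * summed pairwise into a saturated signed word (the macro gets there by
 * widening with punpck + pmaddwd + packssdw instead). */
static int16_t pmaddubsw_lane(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
    return sat16(a0 * b0 + a1 * b1);
}

/* pmulhrsw: rounded fixed-point multiply, (a*b + 0x4000) >> 15; the macro
 * reaches the same rounding via interleaved pmaddwd + psrad. */
static int16_t pmulhrsw_lane(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}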
+
+%macro PREP_BILIN 0
+
DECLARE_REG_TMP 3, 5, 6
%if ARCH_X86_32
- %define base t2-prep_ssse3
+ %define base t2-prep%+SUFFIX
%else
%define base 0
%endif
+
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- LEA t2, prep_ssse3
+ LEA t2, prep%+SUFFIX
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -755,6 +824,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
test mxyd, mxyd
jnz .v
.prep:
+%if notcpuflag(ssse3)
+ add t2, prep_ssse3 - prep_sse2
+ jmp prep_ssse3
+%else
movzx wd, word [t2+wq*2+table_offset(prep,)]
add wq, t2
lea stride3q, [strideq*3]
@@ -824,10 +897,18 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
sub hd, 2
jg .prep_w16
RET
-.prep_w16gt:
+.prep_w32:
+ mov t2d, 1
+ jmp .prep_w32_vloop
+.prep_w64:
+ mov t2d, 2
+ jmp .prep_w32_vloop
+.prep_w128:
+ mov t2d, 4
+.prep_w32_vloop:
mov t1q, srcq
- mov r3q, t2q
-.prep_w16gt_hloop:
+ mov r3d, t2d
+.prep_w32_hloop:
movq m0, [t1q+8*0]
movq m1, [t1q+8*1]
movq m2, [t1q+8*2]
@@ -847,45 +928,49 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
mova [tmpq+16*3], m3
add tmpq, 16*4
add t1q, 32
- sub r3q, 1
- jg .prep_w16gt_hloop
+ dec r3d
+ jg .prep_w32_hloop
lea srcq, [srcq+strideq]
- sub hd, 1
- jg .prep_w16gt
+ dec hd
+ jg .prep_w32_vloop
RET
-.prep_w32:
- mov t2q, 1
- jmp .prep_w16gt
-.prep_w64:
- mov t2q, 2
- jmp .prep_w16gt
-.prep_w128:
- mov t2q, 4
- jmp .prep_w16gt
+%endif
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
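; worked example: mx = 4 gives 12*src[x] + 4*src[x+1], a 4/16 blend towards
; the next pixel (the 0xff01 multiply below packs mx and 16-mx into adjacent
; coefficient bytes)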
imul mxyd, 0xff01
+%if cpuflag(ssse3)
mova m4, [base+bilin_h_shuf8]
+%endif
add mxyd, 16 << 8
- movd xm5, mxyd
+ movd m5, mxyd
mov mxyd, r6m ; my
+%if cpuflag(ssse3)
pshuflw m5, m5, q0000
punpcklqdq m5, m5
+%else
+ PSHUFB_0X1X m5
+%endif
test mxyd, mxyd
jnz .hv
%if ARCH_X86_32
mov t1, t2 ; save base reg for w4
%endif
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+ pxor m6, m6
+%endif
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.h_w4:
-%if ARCH_X86_32
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
mova m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
+ %else
mova m4, [bilin_h_shuf4]
+ %endif
%endif
.h_w4_loop:
movq m0, [srcq+strideq*0]
@@ -893,10 +978,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movq m1, [srcq+strideq*2]
movhps m1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pmaddubsw m0, m5
- pshufb m1, m4
- pmaddubsw m1, m5
+ PSHUFB_BILIN_H4 m0, m4, m2
+ PMADDUBSW m0, m5, m6, m2, 0
+ PSHUFB_BILIN_H4 m1, m4, m2
+ PMADDUBSW m1, m5, m6, m2, 0
mova [tmpq+0 ], m0
mova [tmpq+16], m1
add tmpq, 32
@@ -909,14 +994,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movu m2, [srcq+strideq*2]
movu m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
@@ -931,14 +1016,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movu m2, [srcq+strideq*1+8*0]
movu m3, [srcq+strideq*1+8*1]
lea srcq, [srcq+strideq*2]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
@@ -947,52 +1032,60 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
sub hd, 2
jg .h_w16
RET
-.h_w16gt:
+.h_w32:
+ mov t2d, 1 << 0
+ jmp .h_w32_vloop
+.h_w64:
+ mov t2d, 1 << 1
+ jmp .h_w32_vloop
+.h_w128:
+ mov t2d, 1 << 3
+.h_w32_vloop:
mov t1q, srcq
- mov r3q, t2q
-.h_w16gt_hloop:
+ mov r3d, t2d
+.h_w32_hloop:
movu m0, [t1q+8*0]
movu m1, [t1q+8*1]
movu m2, [t1q+8*2]
movu m3, [t1q+8*3]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
add t1q, 32
- sub r3q, 1
- jg .h_w16gt_hloop
+ shr r3d, 1
+ jnz .h_w32_hloop
lea srcq, [srcq+strideq]
sub hd, 1
- jg .h_w16gt
+ jg .h_w32_vloop
RET
-.h_w32:
- mov t2q, 1
- jmp .h_w16gt
-.h_w64:
- mov t2q, 2
- jmp .h_w16gt
-.h_w128:
- mov t2q, 4
- jmp .h_w16gt
.v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+%endif
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
imul mxyd, 0xff01
add mxyd, 16 << 8
add wq, t2
lea stride3q, [strideq*3]
movd m5, mxyd
+%if cpuflag(ssse3)
pshuflw m5, m5, q0000
punpcklqdq m5, m5
+%else
+ PSHUFB_0X1X m5
+ pxor m6, m6
+%endif
jmp wq
.v_w4:
movd m0, [srcq+strideq*0]
@@ -1004,14 +1097,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
punpcklwd m0, m1 ; 0 1 _ _
punpcklwd m1, m2 ; 1 2 _ _
punpcklbw m1, m0
- pmaddubsw m1, m5
+ PMADDUBSW m1, m5, m6, m7, 0
pshufd m1, m1, q3120
mova [tmpq+16*0], m1
movd m0, [srcq+strideq*0]
punpcklwd m2, m3 ; 2 3 _ _
punpcklwd m3, m0 ; 3 4 _ _
punpcklbw m3, m2
- pmaddubsw m3, m5
+ PMADDUBSW m3, m5, m6, m7, 0
pshufd m3, m3, q3120
mova [tmpq+16*1], m3
add tmpq, 32
@@ -1025,20 +1118,20 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movq m2, [srcq+strideq*1]
movq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2
+ shufpd m4, m0, m1, 0x0c ; 0 2
movq m0, [srcq+strideq*0]
- shufpd m2, m3, 0x0c ; 1 3
- shufpd m1, m0, 0x0c ; 2 4
+ shufpd m2, m3, 0x0c ; 1 3
+ shufpd m1, m0, 0x0c ; 2 4
punpcklbw m3, m2, m4
- pmaddubsw m3, m5
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*0], m3
punpckhbw m3, m2, m4
- pmaddubsw m3, m5
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*2], m3
punpcklbw m3, m1, m2
punpckhbw m1, m2
- pmaddubsw m3, m5
- pmaddubsw m1, m5
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
mova [tmpq+16*1], m3
mova [tmpq+16*3], m1
add tmpq, 16*4
@@ -1052,14 +1145,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movu m2, [srcq+strideq*2]
punpcklbw m3, m1, m0
punpckhbw m4, m1, m0
- pmaddubsw m3, m5
- pmaddubsw m4, m5
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*0], m3
mova [tmpq+16*1], m4
punpcklbw m3, m2, m1
punpckhbw m4, m2, m1
- pmaddubsw m3, m5
- pmaddubsw m4, m5
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*2], m3
mova [tmpq+16*3], m4
movu m3, [srcq+stride3q ]
@@ -1068,14 +1161,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
add tmpq, 16*8
punpcklbw m1, m3, m2
punpckhbw m4, m3, m2
- pmaddubsw m1, m5
- pmaddubsw m4, m5
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq-16*4], m1
mova [tmpq-16*3], m4
punpcklbw m1, m0, m3
punpckhbw m2, m0, m3
- pmaddubsw m1, m5
- pmaddubsw m2, m5
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
mova [tmpq-16*2], m1
mova [tmpq-16*1], m2
sub hd, 4
@@ -1084,6 +1177,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.v_w32:
lea t2d, [hq+(0<<16)]
mov t0d, 64
+ jmp .v_w32_start
+.v_w64:
+ lea t2d, [hq+(1<<16)]
+ mov t0d, 128
+ jmp .v_w32_start
+.v_w128:
+ lea t2d, [hq+(3<<16)]
+ mov t0d, 256
.v_w32_start:
%if ARCH_X86_64
%if WIN64
@@ -1092,43 +1193,43 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
mov r7, tmpq
%endif
mov t1, srcq
-.v_w32_loop_h:
- movu m0, [srcq+strideq*0+16*0] ; 0L
- movu m1, [srcq+strideq*0+16*1] ; 0U
-.v_w32_loop_v:
- movu m2, [srcq+strideq*1+16*0] ; 1L
- movu m3, [srcq+strideq*1+16*1] ; 1U
+.v_w32_hloop:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
punpcklbw m4, m2, m0
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*0], m4
punpckhbw m4, m2, m0
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*1], m4
punpcklbw m4, m3, m1
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*2], m4
punpckhbw m4, m3, m1
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*3], m4
add tmpq, t0q
- movu m0, [srcq+strideq*0+16*0] ; 2L
- movu m1, [srcq+strideq*0+16*1] ; 2U
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
punpcklbw m4, m0, m2
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*0], m4
punpckhbw m4, m0, m2
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*1], m4
punpcklbw m4, m1, m3
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*2], m4
punpckhbw m4, m1, m3
- pmaddubsw m4, m5
+ PMADDUBSW m4, m5, m6, m7, 0
mova [tmpq+16*3], m4
add tmpq, t0q
sub hd, 2
- jg .v_w32_loop_v
+ jg .v_w32_vloop
movzx hd, t2w
add t1, 32
mov srcq, t1
@@ -1141,62 +1242,78 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
mov tmpmp, tmpq
%endif
sub t2d, 1<<16
- jg .v_w32_loop_h
+ jg .v_w32_hloop
%if WIN64
POP r7
%endif
RET
-.v_w64:
- lea t2d, [hq+(1<<16)]
- mov t0d, 128
- jmp .v_w32_start
-.v_w128:
- lea t2d, [hq+(3<<16)]
- mov t0d, 256
- jmp .v_w32_start
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
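; worked example: my = 8 reduces to (src[x] + src[x + src_stride] + 1) >> 1,
; i.e. the rounded average of the two rows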
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 8
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%else
+ WIN64_SPILL_XMM 10
+%endif
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+%if cpuflag(ssse3)
shl mxyd, 11
- movd xm6, mxyd
+%else
+ %if ARCH_X86_64
+ mova m8, [pw_8]
+ %else
+ %define m8 [pw_8]
+ %endif
+ pxor m7, m7
+%endif
+ movd m6, mxyd
add wq, t2
pshuflw m6, m6, q0000
+%if cpuflag(ssse3)
punpcklqdq m6, m6
+%else
+ %if ARCH_X86_64
+ psrlw m0, m8, 3
+ punpcklwd m6, m0
+ %else
+ punpcklwd m6, [base+pw_1]
+ %endif
+%endif
%if ARCH_X86_32
mov t1, t2 ; save base reg for w4
%endif
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
-%if ARCH_X86_32
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
mova m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
+ %else
mova m4, [bilin_h_shuf4]
+ %endif
%endif
- movq m0, [srcq+strideq*0] ; 0 _
- punpcklqdq m0, m0
- pshufb m0, m4
- pmaddubsw m0, m5
+ movhps m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m0, m4, m3
+ PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
.hv_w4_loop:
movq m1, [srcq+strideq*1]
- movhps m1, [srcq+strideq*2] ; 1 _ 2 _
+ movhps m1, [srcq+strideq*2]
movq m2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- movhps m2, [srcq+strideq*0] ; 3 _ 4 _
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5 ; 1 + 2 +
- shufpd m3, m0, m1, 0x01 ; 0 + 1 +
- pmaddubsw m0, m2, m5 ; 3 + 4 +
- shufpd m2, m1, m0, 0x01 ; 2 + 3 +
+ movhps m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m1, m4, m3
+ PSHUFB_BILIN_H4 m2, m4, m3
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
+ shufpd m3, m0, m1, 0x01 ; 0 1
+ mova m0, m2
+ PMADDUBSW m0, m5, m7, m4, 0 ; 3 4
+ shufpd m2, m1, m0, 0x01 ; 2 3
psubw m1, m3
- pmulhrsw m1, m6
+ PMULHRSW m1, m6, m4, m8, 4
paddw m1, m3
psubw m3, m0, m2
- pmulhrsw m3, m6
+ PMULHRSW m3, m6, m4, m8, 4
paddw m3, m2
mova [tmpq+16*0], m1
mova [tmpq+16*1], m3
@@ -1205,46 +1322,74 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
jg .hv_w4_loop
RET
.hv_w8:
- movu m0, [srcq+strideq*0]
- pshufb m0, m4
- pmaddubsw m0, m5 ; 0 +
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0
.hv_w8_loop:
- movu m1, [srcq+strideq*1] ; 1
- movu m2, [srcq+strideq*2] ; 2
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5 ; 1 +
- pmaddubsw m2, m5 ; 2 +
- psubw m3, m1, m0 ; 1-0
- pmulhrsw m3, m6
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1
+ PMADDUBSW m2, m5, m7, m4, 0 ; 2
+ psubw m3, m1, m0
+ PMULHRSW m3, m6, m4, m8, 4
paddw m3, m0
- psubw m7, m2, m1 ; 2-1
- pmulhrsw m7, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m9, m7
+%endif
+ psubw m7, m2, m1
+ PMULHRSW m7, m6, m4, m8, 4
paddw m7, m1
mova [tmpq+16*0], m3
mova [tmpq+16*1], m7
- movu m1, [srcq+stride3q ] ; 3
- lea srcq, [srcq+strideq*4]
- movu m0, [srcq+strideq*0] ; 4
- pshufb m1, m4
- pshufb m0, m4
- pmaddubsw m1, m5 ; 3 +
- pmaddubsw m0, m5 ; 4 +
- psubw m3, m1, m2 ; 3-2
- pmulhrsw m3, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m7, m9
+%endif
+ movu m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3
+ PMADDUBSW m0, m5, m7, m4, 0 ; 4
+ psubw m3, m1, m2
+ PMULHRSW m3, m6, m4, m8, 4
paddw m3, m2
- psubw m7, m0, m1 ; 4-3
- pmulhrsw m7, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m9, m7
+%endif
+ psubw m7, m0, m1
+ PMULHRSW m7, m6, m4, m8, 4
paddw m7, m1
mova [tmpq+16*2], m3
mova [tmpq+16*3], m7
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m7, m9
+ %else
+ pxor m7, m7
+ %endif
+%endif
add tmpq, 16*4
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
- lea t2d, [hq+(0<<16)]
+ mov t2d, hd
mov t0d, 32
+ jmp .hv_w16_start
+.hv_w32:
+ lea t2d, [hq+(1<<16)]
+ mov t0d, 64
+ jmp .hv_w16_start
+.hv_w64:
+ lea t2d, [hq+(3<<16)]
+ mov t0d, 128
+ jmp .hv_w16_start
+.hv_w128:
+ lea t2d, [hq+(7<<16)]
+ mov t0d, 256
.hv_w16_start:
%if ARCH_X86_64
%if WIN64
@@ -1253,47 +1398,47 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
mov r7, tmpq
%endif
mov t1, srcq
-.hv_w16_loop_h:
- movu m0, [srcq+strideq*0+8*0] ; 0L
- movu m1, [srcq+strideq*0+8*1] ; 0U
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5 ; 0L +
- pmaddubsw m1, m5 ; 0U +
-.hv_w16_loop_v:
- movu m2, [srcq+strideq*1+8*0] ; 1L
- pshufb m2, m4
- pmaddubsw m2, m5 ; 1L +
- psubw m3, m2, m0 ; 1L-0L
- pmulhrsw m3, m6
+.hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0a
+ PMADDUBSW m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+ movu m2, [srcq+strideq*1+8*0]
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m2, m5, m7, m4, 0 ; 1a
+ psubw m3, m2, m0
+ PMULHRSW m3, m6, m4, m8, 4
paddw m3, m0
mova [tmpq+16*0], m3
- movu m3, [srcq+strideq*1+8*1] ; 1U
- lea srcq, [srcq+strideq*2]
- pshufb m3, m4
- pmaddubsw m3, m5 ; 1U +
- psubw m0, m3, m1 ; 1U-0U
- pmulhrsw m0, m6
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m3, m5, m7, m4, 0 ; 1b
+ psubw m0, m3, m1
+ PMULHRSW m0, m6, m4, m8, 4
paddw m0, m1
mova [tmpq+16*1], m0
add tmpq, t0q
- movu m0, [srcq+strideq*0+8*0] ; 2L
- pshufb m0, m4
- pmaddubsw m0, m5 ; 2L +
- psubw m1, m0, m2 ; 2L-1L
- pmulhrsw m1, m6
+ movu m0, [srcq+strideq*0+8*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 2a
+ psubw m1, m0, m2
+ PMULHRSW m1, m6, m4, m8, 4
paddw m1, m2
mova [tmpq+16*0], m1
- movu m1, [srcq+strideq*0+8*1] ; 2U
- pshufb m1, m4
- pmaddubsw m1, m5 ; 2U +
- psubw m2, m1, m3 ; 2U-1U
- pmulhrsw m2, m6
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 2b
+ psubw m2, m1, m3
+ PMULHRSW m2, m6, m4, m8, 4
paddw m2, m3
mova [tmpq+16*1], m2
add tmpq, t0q
sub hd, 2
- jg .hv_w16_loop_v
+ jg .hv_w16_vloop
movzx hd, t2w
add t1, 16
mov srcq, t1
@@ -1306,23 +1451,12 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
mov tmpmp, tmpq
%endif
sub t2d, 1<<16
- jg .hv_w16_loop_h
+ jg .hv_w16_hloop
%if WIN64
POP r7
%endif
RET
-.hv_w32:
- lea t2d, [hq+(1<<16)]
- mov t0d, 64
- jmp .hv_w16_start
-.hv_w64:
- lea t2d, [hq+(3<<16)]
- mov t0d, 128
- jmp .hv_w16_start
-.hv_w128:
- lea t2d, [hq+(7<<16)]
- mov t0d, 256
- jmp .hv_w16_start
+%endmacro
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -2439,13 +2573,198 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .hv_w8_loop0
RET
-%if ARCH_X86_32
-DECLARE_REG_TMP 1, 2
-%elif WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ %if %5 == 1
+ pcmpeqd %2, %2
+ psrlq %2, 32
+ %endif
+ psrldq %3, %1, 1
+ pshufd %3, %3, q2301
+ pand %1, %2
+ pandn %4, %2, %3
+ por %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+ psrlq %1, %2, 16
+ %elifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %if %0 == 4
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ %else
+ %define %%tmp %5
+ %endif
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
+
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+ phaddw %1, %2
+ %else
+ %ifnidn %1, %2
+ %if %4 == 1
+ mova %3, [pw_1]
+ %endif
+ pmaddwd %1, %3
+ pmaddwd %2, %3
+ packssdw %1, %2
+ %else
+ %if %4 == 1
+ pmaddwd %1, [pw_1]
+ %else
+ pmaddwd %1, %3
+ %endif
+ packssdw %1, %1
+ %endif
+ %endif
+%endmacro
+
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2, %3
+ %else
+ paddw %1, %2, %3
+ psraw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+ PMULHRSW_POW2 %1, %2, %3, 2
+%endmacro
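; notes on the fallbacks above: PALIGNR rebuilds palignr from psrldq/pslldq/por;
; PHADDW uses pmaddwd against an all-ones pw_1 vector (summing each pair of
; adjacent words) plus packssdw in place of phaddw; and pmulhrsw with 8192 is
; (x*8192 + 16384) >> 15 = (x + 2) >> 2, hence the paddw pw_2 / psraw 2 pair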
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2]
+ movd %1, [%2+0]
+ movd %3, [%2+1]
+ movd %4, [%2+2]
+ movd %5, [%2+3]
+ punpckldq %1, %3
+ punpckldq %4, %5
+ punpcklqdq %1, %4
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+ movu m%1, [%2]
+ pshufb m2, m%1, m11 ; subpel_h_shufB
+ pshufb m3, m%1, m9 ; subpel_h_shufC
+ pshufb m%1, m10 ; subpel_h_shufA
+ %else
+ %if ARCH_X86_64
+ SWAP m12, m5
+ SWAP m13, m6
+ SWAP m14, m7
+ %define %%mx0 m%+%%i
+ %define %%mx1 m%+%%j
+ %assign %%i 0
+ %rep 12
+ movd %%mx0, [%2+%%i]
+ %assign %%i %%i+1
+ %endrep
+ %assign %%i 0
+ %rep 6
+ %assign %%j %%i+1
+ punpckldq %%mx0, %%mx1
+ %assign %%i %%i+2
+ %endrep
+ %assign %%i 0
+ %rep 3
+ %assign %%j %%i+2
+ punpcklqdq %%mx0, %%mx1
+ %assign %%i %%i+4
+ %endrep
+ SWAP m%1, m0
+ SWAP m2, m4
+ SWAP m3, m8
+ SWAP m5, m12
+ SWAP m6, m13
+ SWAP m7, m14
+ %else
+ PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
+ SWAP m%1, m0
+ %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+ PREP_8TAP_H_LOAD %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m8, m1
+ SWAP m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+ mova m4, m2
+ PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0
+ PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4
+ PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4
+ PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m1, m8
+ SWAP m7, m9
+ %endif
+ paddw m3, m4
+ paddw m%1, m2
+ PHADDW m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+ PMULHRSW_8192 m%1, m%1, m7
+ %else
+ PMULHRSW_8192 m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+ movu %1, [%2]
+ pshufb m2, %1, shufB
+ pshufb m3, %1, shufC
+ pshufb %1, shufA
+ %else
+ PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+ PREP_8TAP_HV_LOAD %{1:4}
+ mova m1, m2
+ PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0
+ PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4
+ PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4
+ PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0
+ paddw m1, m3 ; C0+B4
+ paddw %1, m2 ; A0+C4
+ PHADDW %1, m1, %3, 1
+%endmacro
+
%macro PREP_8TAP_FN 3 ; type, type_h, type_v
cglobal prep_8tap_%1
mov t0d, FILTER_%2
@@ -2455,6 +2774,14 @@ cglobal prep_8tap_%1
%endif
%endmacro
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
PREP_8TAP_FN regular, REGULAR, REGULAR
PREP_8TAP_FN regular_sharp, REGULAR, SHARP
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
@@ -2467,14 +2794,13 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
%if ARCH_X86_32
%define base_reg r2
- %define base base_reg-prep_ssse3
+ %define base base_reg-prep%+SUFFIX
%define W32_RESTORE_SSQ mov strideq, stridem
%else
%define base_reg r7
%define base 0
%define W32_RESTORE_SSQ
%endif
-
cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
@@ -2484,13 +2810,13 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movsxd wq, wm
movifnidn srcd, srcm
movifnidn hd, hm
- LEA base_reg, prep_ssse3
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
+ LEA base_reg, prep_ssse3
tzcnt wd, wd
- movzx wd, word [base_reg+wq*2+table_offset(prep,)]
+ movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
@@ -2501,25 +2827,49 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%endif
jmp wq
.h:
+ LEA base_reg, prep%+SUFFIX
test myd, 0xf00
jnz .hv
+%if cpuflag(ssse3)
WIN64_SPILL_XMM 12
+%else
+ WIN64_SPILL_XMM 16
+%endif
cmp wd, 4
je .h_w4
tzcnt wd, wd
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
mova m10, [base+subpel_h_shufA]
mova m11, [base+subpel_h_shufB]
mova m9, [base+subpel_h_shufC]
+ %else
+ %define m10 [base+subpel_h_shufA]
+ %define m11 [base+subpel_h_shufB]
+ %define m9 [base+subpel_h_shufC]
+ %endif
%endif
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+ movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+ movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
pshufd m6, m6, q0000
+%if cpuflag(ssse3)
mova m7, [base+pw_8192]
+%else
+ punpcklbw m5, m5
+ punpcklbw m6, m6
+ psraw m5, 8
+ psraw m6, 8
+ %if ARCH_X86_64
+ mova m7, [pw_2]
+ mova m15, [pw_1]
+ %else
+ %define m15 m4
+ %endif
+%endif
add wq, base_reg
jmp wq
.h_w4:
@@ -2529,39 +2879,115 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movzx mxd, mxb
%endif
dec srcq
- movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+ movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
pshufd m4, m4, q0000
+%if cpuflag(ssse3)
mova m6, [base+pw_8192]
mova m5, [base+subpel_h_shufA]
+%else
+ mova m6, [base+pw_2]
+ %if ARCH_X86_64
+ mova m14, [pw_1]
+ %else
+ %define m14 m7
+ %endif
+ punpcklbw m4, m4
+ psraw m4, 8
+%endif
W32_RESTORE_SSQ
%if ARCH_X86_64
lea stride3q, [strideq*3]
%endif
.h_w4_loop:
+%if cpuflag(ssse3)
movq m0, [srcq+strideq*0] ; 0
movq m1, [srcq+strideq*1] ; 1
-%if ARCH_X86_32
+ %if ARCH_X86_32
lea srcq, [srcq+strideq*2]
movq m2, [srcq+strideq*0] ; 2
movq m3, [srcq+strideq*1] ; 3
lea srcq, [srcq+strideq*2]
-%else
+ %else
movq m2, [srcq+strideq*2] ; 2
movq m3, [srcq+stride3q ] ; 3
lea srcq, [srcq+strideq*4]
-%endif
- pshufb m0, m5 ; subpel_h_shufA
+ %endif
+ pshufb m0, m5
pshufb m1, m5
pshufb m2, m5
pshufb m3, m5
- pmaddubsw m0, m4 ; subpel_filters + 2
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmaddubsw m3, m4
- phaddw m0, m1
- phaddw m2, m3
- pmulhrsw m0, m6 ; pw_8192
- pmulhrsw m2, m6 ; pw_8192
+%else
+ %if ARCH_X86_64
+ movd m0, [srcq+strideq*0+0]
+ movd m12, [srcq+strideq*0+1]
+ movd m1, [srcq+strideq*1+0]
+ movd m5, [srcq+strideq*1+1]
+ movd m2, [srcq+strideq*2+0]
+ movd m13, [srcq+strideq*2+1]
+ movd m3, [srcq+stride3q +0]
+ movd m7, [srcq+stride3q +1]
+ punpckldq m0, m12
+ punpckldq m1, m5
+ punpckldq m2, m13
+ punpckldq m3, m7
+ movd m12, [srcq+strideq*0+2]
+ movd m8, [srcq+strideq*0+3]
+ movd m5, [srcq+strideq*1+2]
+ movd m9, [srcq+strideq*1+3]
+ movd m13, [srcq+strideq*2+2]
+ movd m10, [srcq+strideq*2+3]
+ movd m7, [srcq+stride3q +2]
+ movd m11, [srcq+stride3q +3]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m12, m8
+ punpckldq m5, m9
+ punpckldq m13, m10
+ punpckldq m7, m11
+ punpcklqdq m0, m12 ; 0
+ punpcklqdq m1, m5 ; 1
+ punpcklqdq m2, m13 ; 2
+ punpcklqdq m3, m7 ; 3
+ %else
+ movd m0, [srcq+strideq*0+0]
+ movd m1, [srcq+strideq*0+1]
+ movd m2, [srcq+strideq*0+2]
+ movd m3, [srcq+strideq*0+3]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklqdq m0, m2 ; 0
+ movd m1, [srcq+strideq*1+0]
+ movd m2, [srcq+strideq*1+1]
+ movd m3, [srcq+strideq*1+2]
+ movd m7, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m1, m2
+ punpckldq m3, m7
+ punpcklqdq m1, m3 ; 1
+ movd m2, [srcq+strideq*0+0]
+ movd m3, [srcq+strideq*0+1]
+ movd m7, [srcq+strideq*0+2]
+ movd m5, [srcq+strideq*0+3]
+ punpckldq m2, m3
+ punpckldq m7, m5
+ punpcklqdq m2, m7 ; 2
+ movd m3, [srcq+strideq*1+0]
+ movd m7, [srcq+strideq*1+1]
+ punpckldq m3, m7
+ movd m7, [srcq+strideq*1+2]
+ movd m5, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m7, m5
+ punpcklqdq m3, m7 ; 3
+ %endif
+%endif
+ PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
+ PMADDUBSW m1, m4, m5, m7, 0
+ PMADDUBSW m2, m4, m5, m7, 0
+ PMADDUBSW m3, m4, m5, m7, 0
+ PHADDW m0, m1, m14, ARCH_X86_32
+ PHADDW m2, m3, m14, 0
+ PMULHRSW_8192 m0, m0, m6
+ PMULHRSW_8192 m2, m2, m6
mova [tmpq+16*0], m0
mova [tmpq+16*1], m2
add tmpq, 32
@@ -2569,55 +2995,41 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jg .h_w4_loop
RET
;
-%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
-%if ARCH_X86_32
- pshufb %2, %1, [base+subpel_h_shufB]
- pshufb %3, %1, [base+subpel_h_shufC]
- pshufb %1, [base+subpel_h_shufA]
-%else
- pshufb %2, %1, m11; subpel_h_shufB
- pshufb %3, %1, m9 ; subpel_h_shufC
- pshufb %1, m10 ; subpel_h_shufA
-%endif
- pmaddubsw %4, %2, m5 ; subpel +0 B0
- pmaddubsw %2, m6 ; subpel +4 B4
- pmaddubsw %3, m6 ; subpel +4 C4
- pmaddubsw %1, m5 ; subpel +0 A0
- paddw %3, %4
- paddw %1, %2
- phaddw %1, %3
- pmulhrsw %1, m7 ; 8192
-%endmacro
- ;
.h_w8:
%if ARCH_X86_32
mov r3, r2
- %define base_reg r3
+ %define base_reg r3
W32_RESTORE_SSQ
%endif
.h_w8_loop:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H m0, m2, m3, m4
- PREP_8TAP_H m1, m2, m3, m4
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+strideq*0
+ PREP_8TAP_H 1, srcq+strideq*1
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
+ lea srcq, [srcq+strideq*2]
add tmpq, 32
sub hd, 2
+%else
+ PREP_8TAP_H 0, srcq
+ mova [tmpq], m0
+ add srcq, strideq
+ add tmpq, 16
+ dec hd
+%endif
jg .h_w8_loop
RET
.h_w16:
- xor r6d, r6d
+ mov r6, -16*1
jmp .h_start
.h_w32:
- mov r6, -16*1
+ mov r6, -16*2
jmp .h_start
.h_w64:
- mov r6, -16*3
+ mov r6, -16*4
jmp .h_start
.h_w128:
- mov r6, -16*7
+ mov r6, -16*8
.h_start:
%if ARCH_X86_32
mov r3, r2
@@ -2627,15 +3039,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mov r5, r6
W32_RESTORE_SSQ
.h_loop:
- movu m0, [srcq+r6+8*0]
- movu m1, [srcq+r6+8*1]
- PREP_8TAP_H m0, m2, m3, m4
- PREP_8TAP_H m1, m2, m3, m4
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+r6+8*0
+ PREP_8TAP_H 1, srcq+r6+8*1
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 32
add r6, 16
- jle .h_loop
+%else
+ PREP_8TAP_H 0, srcq+r6
+ mova [tmpq], m0
+ add tmpq, 16
+ add r6, 8
+%endif
+ jl .h_loop
add srcq, strideq
mov r6, r5
dec hd
@@ -2644,8 +3061,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%if ARCH_X86_32
%define base_reg r2
%endif
-
+ ;
.v:
+ LEA base_reg, prep%+SUFFIX
%if ARCH_X86_32
mov mxd, myd
and mxd, 0x7f
@@ -2657,30 +3075,40 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
mova m2, [base+pw_512]
psrlw m2, m2, 1 ; 0x0100
mova m7, [base+pw_8192]
+%endif
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
%define subpel1 [rsp+mmsize*1]
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
ALLOC_STACK -mmsize*4
+ %else
+ ALLOC_STACK -mmsize*5
+ %endif
%assign regs_used 7
movd m0, [myq+0]
- pshufb m0, m2
+ PSHUFB_0X1X m0, m2
mova subpel0, m0
movd m0, [myq+2]
- pshufb m0, m2
+ PSHUFB_0X1X m0, m2
mova subpel1, m0
movd m0, [myq+4]
- pshufb m0, m2
+ PSHUFB_0X1X m0, m2
mova subpel2, m0
movd m0, [myq+6]
- pshufb m0, m2
+ PSHUFB_0X1X m0, m2
mova subpel3, m0
+ %if notcpuflag(ssse3)
+ mov r6, base_reg
+ %define base_reg r6
+ %endif
mov strideq, [rstk+stack_offset+gprsize*3]
lea strideq, [strideq*3]
sub [rstk+stack_offset+gprsize*2], strideq
@@ -2692,25 +3120,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpel2 m10
%define subpel3 m11
movd subpel0, [myq+0]
- pshufb subpel0, m2
+ PSHUFB_0X1X subpel0, m2
movd subpel1, [myq+2]
- pshufb subpel1, m2
+ PSHUFB_0X1X subpel1, m2
movd subpel2, [myq+4]
- pshufb subpel2, m2
+ PSHUFB_0X1X subpel2, m2
movd subpel3, [myq+6]
- pshufb subpel3, m2
+ PSHUFB_0X1X subpel3, m2
lea stride3q, [strideq*3]
sub srcq, stride3q
cmp wd, 8
- jg .v_w16
- je .v_w8
+ jns .v_w8
%endif
.v_w4:
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define srcm [rsp+mmsize*4+gprsize*1]
- %define tmpm [rsp+mmsize*4+gprsize*2]
+%if notcpuflag(ssse3)
+ pxor m6, m6
+ %if ARCH_X86_64
+ mova m7, [base+pw_2]
+ %endif
%endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+ %define srcm [esp+stack_size+gprsize*1]
+ %define tmpm [esp+stack_size+gprsize*2]
+ %endif
mov tmpm, tmpq
mov srcm, srcq
lea r5d, [wq - 4] ; horizontal loop
@@ -2743,17 +3176,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%endif
punpckldq m3, m1 ; 4 5 _ _
punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
+ PALIGNR m4, m3, m2, 4 ; 1 2 3 4
punpcklbw m3, m1 ; 45 56
punpcklbw m1, m2, m4 ; 01 12
punpckhbw m2, m4 ; 23 34
.v_w4_loop:
- pmaddubsw m5, m1, subpel0 ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel0
+ %define subpel0 m7
+%endif
+ mova m5, m1
+ PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel1
+ %define subpel1 m7
+%endif
mova m1, m2
- pmaddubsw m2, subpel1 ; a1 b1
+ PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1
paddw m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel2
+ %define subpel2 m7
+%endif
mova m2, m3
- pmaddubsw m3, subpel2 ; a2 b2
+ PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
paddw m5, m3
movd m4, [srcq+strideq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
@@ -2761,9 +3207,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
lea srcq, [srcq+strideq*2]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
- pmaddubsw m4, m3, subpel3 ; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m12, m0
+ %else
+ mova [esp+mmsize*4], m0
+ mova m7, subpel3
+ %define subpel3 m7
+ %endif
+%endif
+ mova m4, m3
+ PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3
paddw m5, m4
- pmulhrsw m5, m7
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+ SWAP m0, m12
+ %endif
+ PMULHRSW_8192 m5, m5, m7
+%else
+ mova m0, [esp+mmsize*4]
+ PMULHRSW_8192 m5, m5, [base+pw_2]
+%endif
movq [tmpq+wq*0], m5
movhps [tmpq+wq*2], m5
lea tmpq, [tmpq+wq*4]
@@ -2781,26 +3245,28 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w4_loop0
%endif
RET
-
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ %define base_reg r2
+%endif
+ ;
%if ARCH_X86_64
.v_w8:
-.v_w16:
lea r5d, [wq - 8] ; horizontal loop
mov r8, tmpq
mov r6, srcq
shl r5d, 8 - 3; (wq / 8) << 8
mov r5b, hb
.v_w8_loop0:
- movq m4, [srcq+strideq*0] ; 0
- movq m5, [srcq+strideq*1] ; 1
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movq m6, [srcq+strideq*0] ; 2
- movq m0, [srcq+strideq*1] ; 3
+ movq m6, [srcq+strideq*0]
+ movq m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movq m1, [srcq+strideq*0]
+ movq m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movq m1, [srcq+strideq*0] ; 4
- movq m2, [srcq+strideq*1] ; 5
- lea srcq, [srcq+strideq*2] ;
- movq m3, [srcq+strideq*0] ; 6
+ movq m3, [srcq+strideq*0]
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
@@ -2812,9 +3278,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w8_loop:
- movq m12, [srcq+strideq*1] ; 8
+%if cpuflag(ssse3)
+ movq m12, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0] ; 9
+ movq m13, [srcq+strideq*0]
pmaddubsw m14, m1, subpel0 ; a0
pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
@@ -2839,8 +3306,43 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
- movu [tmpq+wq*0], xm14
- movu [tmpq+wq*2], xm15
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
+%else
+ mova m14, m1
+ PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m1, m3
+ PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ paddw m14, m3
+ mova m3, m5
+ PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
+ paddw m14, m5
+ movq m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movq m13, [srcq+strideq*0]
+ shufpd m15, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m15, m0 ; 67
+ punpckhbw m15, m0 ; 78
+ mova m13, m5
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; a3
+ paddw m14, m13
+ PMULHRSW_8192 m14, m14, [base+pw_2]
+ movu [tmpq+wq*0], m14
+ mova m14, m2
+ PMADDUBSW m14, subpel0, m7, m12, 0 ; b0
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
+ paddw m14, m4
+ mova m4, m6
+ PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
+ paddw m14, m6
+ mova m6, m15
+ PMADDUBSW m15, subpel3, m7, m12, 0 ; b3
+ paddw m14, m15
+ PMULHRSW_8192 m14, m14, [base+pw_2]
+ movu [tmpq+wq*2], m14
+%endif
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w8_loop
@@ -2857,20 +3359,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpel1
%undef subpel2
%undef subpel3
-
+ ;
.hv:
%assign stack_offset org_stack_offset
cmp wd, 4
jg .hv_w8
and mxd, 0x7f
- movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+ movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
%if ARCH_X86_32
mov mxd, myd
shr myd, 16
and mxd, 0x7f
cmp hd, 6
cmovs myd, mxd
- movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
mov r5, r2; use as new base
%define base_reg r5
%assign regs_used 2
@@ -2886,7 +3388,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpelv2 [rsp+mmsize*2]
%define subpelv3 [rsp+mmsize*3]
punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
+ psraw m0, 8
pshufd m6, m0, q0000
mova subpelv0, m6
pshufd m6, m0, q1111
@@ -2900,8 +3402,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
ALLOC_STACK mmsize*14, 14
+ %else
+ ALLOC_STACK mmsize*14, 16
+ %endif
lea stride3q, [strideq*3]
sub srcq, stride3q
dec srcq
@@ -2910,8 +3416,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpelv2 m12
%define subpelv3 m13
punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
+ psraw m0, 8
+ %if cpuflag(ssse3)
mova m8, [base+pw_8192]
+ %else
+ mova m8, [base+pw_2]
+ %endif
mova m9, [base+pd_32]
pshufd m10, m0, q0000
pshufd m11, m0, q1111
@@ -2919,7 +3429,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
pshufd m13, m0, q3333
%endif
pshufd m7, m1, q0000
-.hv_w4:
+%if notcpuflag(ssse3)
+ punpcklbw m7, m7
+ psraw m7, 8
+%endif
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
@@ -2930,17 +3443,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
- ;
- ;
%if ARCH_X86_32
- %define w8192reg [base+pw_8192]
+ %if cpuflag(ssse3)
+ %define w8192reg [base+pw_8192]
+ %else
+ %define w8192reg [base+pw_2]
+ %endif
%define d32reg [base+pd_32]
%else
%define w8192reg m8
%define d32reg m9
%endif
; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+ mova m15, [pw_1]
+ %else
+ %define m15 m1
+ %endif
+%endif
movq m5, [srcq+strideq*0] ; 0 _ _ _
movhps m5, [srcq+strideq*1] ; 0 _ 1 _
movq m4, [srcq+strideq*2] ; 2 _ _ _
@@ -2953,43 +3476,61 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movhps m4, [srcq+stride3q ] ; 2 _ 3 _
lea srcq, [srcq+strideq*4]
%endif
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
+ PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
SAVELINE_W4 m2, 2, 0
; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4+16]
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
- ;
+%endif
+ PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m2
+ %else
+ mova [esp+mmsize*4], m2
+ %endif
+%endif
; lower shuffle
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4]
+%endif
movq m5, [srcq+strideq*0] ; 4 _ _ _
movhps m5, [srcq+strideq*1] ; 4 _ 5 _
movq m4, [srcq+strideq*2] ; 6 _ _ _
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
+ PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
SAVELINE_W4 m3, 3, 0
; upper shuffle
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4+16]
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
- ;
+%endif
+ PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m2, m14
+ %else
+ mova m2, [esp+mmsize*4]
+ %endif
+%endif
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
add srcq, strideq
@@ -2997,7 +3538,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
add srcq, stride3q
%endif
;process high
- palignr m4, m3, m2, 4;V 1 2 3 4
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
punpckhwd m2, m4 ; V 23 34
pshufd m0, m3, q2121;V 5 6 5 6
@@ -3009,7 +3550,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
;process low
RESTORELINE_W4 m2, 2, 0
RESTORELINE_W4 m3, 3, 0
- palignr m4, m3, m2, 4;V 1 2 3 4
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
punpckhwd m2, m4 ; V 23 34
pshufd m0, m3, q2121;V 5 6 5 6
@@ -3023,18 +3564,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+mmsize*4], m5
+ %define m15 m3
+ %endif
+%endif
;
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4]
+%endif
movq m4, [srcq+strideq*0] ; 7
movhps m4, [srcq+strideq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
+ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
mova m0, m4
punpcklwd m3, m4 ; 67 78
pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+mmsize*4]
+ %endif
+%endif
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m5, 6
@@ -3055,18 +3613,34 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+0xA0], m5
+ %endif
+%endif
;
+%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4+16]
+%endif
movq m4, [srcq+strideq*0] ; 7
movhps m4, [srcq+strideq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
+ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
mova m0, m4
punpcklwd m3, m4 ; 67 78
pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+0xA0]
+ %endif
+%endif
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m4, m5, 6
@@ -3093,8 +3667,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpelv2
%undef subpelv3
;
-
-
.hv_w8:
%assign stack_offset org_stack_offset
%define hv8_line_1 0
@@ -3113,27 +3685,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpelv3 [rsp+mmsize*10]
%define accuv0 [rsp+mmsize*11]
%define accuv1 [rsp+mmsize*12]
- movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
mov mxd, myd
shr myd, 16
and mxd, 0x7f
cmp hd, 6
cmovs myd, mxd
- movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
ALLOC_STACK -mmsize*13
-%if STACK_ALIGNMENT < mmsize
+ %if STACK_ALIGNMENT < mmsize
mov rstk, r2m
- %define tmpm [rsp+mmsize*13+gprsize*1]
- %define srcm [rsp+mmsize*13+gprsize*2]
- %define stridem [rsp+mmsize*13+gprsize*3]
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
mov stridem, rstk
-%endif
+ %endif
mov r6, r2
-%define base_reg r6
+ %define base_reg r6
pshufd m0, m1, q0000
pshufd m1, m1, q1111
punpcklbw m5, m5
- psraw m5, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ %endif
+ psraw m5, 8
+ %if notcpuflag(ssse3)
+ psraw m0, 8
+ psraw m1, 8
+ %endif
pshufd m2, m5, q0000
pshufd m3, m5, q1111
pshufd m4, m5, q2222
@@ -3160,20 +3740,31 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpelv3 m15
%define accuv0 m8
%define accuv1 m9
- movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
punpcklbw m1, m1
- psraw m1, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+ punpcklbw subpelh0, subpelh0
+ punpcklbw subpelh1, subpelh1
+ %endif
+ psraw m1, 8
+ %if notcpuflag(ssse3)
+ psraw subpelh0, 8
+ psraw subpelh1, 8
+ %endif
pshufd subpelv0, m1, q0000
pshufd subpelv1, m1, q1111
pshufd subpelv2, m1, q2222
pshufd subpelv3, m1, q3333
+ %if notcpuflag(ssse3)
+ mova m7, [base+pw_2]
+ %endif
lea stride3q, [strideq*3]
sub srcq, 3
sub srcq, stride3q
@@ -3188,57 +3779,89 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
shl r5d, (16 - 2)
mov r5w, hw
.hv_w8_loop0:
- movu m4, [srcq+strideq*0] ; 0 = _ _
- movu m5, [srcq+strideq*1] ; 1 = _ _
- lea srcq, [srcq+strideq*2]
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
mova m7, [base+subpel_h_shufA]
mova m8, [base+subpel_h_shufB]
mova m9, [base+subpel_h_shufC]
+ %define shufA m7
+ %define shufB m8
+ %define shufC m9
+ %else
+ %define shufA [base+subpel_h_shufA]
+ %define shufB [base+subpel_h_shufB]
+ %define shufC [base+subpel_h_shufC]
+ %endif
%endif
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
- movu m6, [srcq+strideq*0] ; 2 = _ _
- movu m0, [srcq+strideq*1] ; 3 = _ _
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
lea srcq, [srcq+strideq*2]
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
- ;
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m9, m4
+ %else
+ mova [esp], m4
+ %endif
+%endif
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
+ PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
+ lea srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
mova m7, [base+pw_8192]
- pmulhrsw m4, m7 ; H pw_8192
- pmulhrsw m5, m7 ; H pw_8192
- pmulhrsw m6, m7 ; H pw_8192
- pmulhrsw m0, m7 ; H pw_8192
- punpcklwd m1, m4, m5 ; 0 1 ~
- punpcklwd m2, m5, m6 ; 1 2 ~
- punpcklwd m3, m6, m0 ; 2 3 ~
+%else
+ mova m7, [base+pw_2]
+ %if ARCH_X86_64
+ SWAP m4, m9
+ %else
+ mova m4, [esp]
+ %endif
+%endif
+ PMULHRSW_8192 m4, m4, m7
+ PMULHRSW_8192 m5, m5, m7
+ PMULHRSW_8192 m6, m6, m7
+ PMULHRSW_8192 m0, m0, m7
+ punpcklwd m1, m4, m5 ; 01
+ punpcklwd m2, m5, m6 ; 12
+ punpcklwd m3, m6, m0 ; 23
SAVELINE_W8 1, m1
SAVELINE_W8 2, m2
SAVELINE_W8 3, m3
- ;
+%if cpuflag(ssse3)
mova m7, [base+subpel_h_shufA]
- movu m4, [srcq+strideq*0] ; 4 = _ _
- movu m5, [srcq+strideq*1] ; 5 = _ _
+%else
+ %if ARCH_X86_64
+ SWAP m8, m7
+ SWAP m9, m0
+ %else
+ mova [esp+0x30], m0
+ %endif
+%endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
lea srcq, [srcq+strideq*2]
- movu m6, [srcq+strideq*0] ; 6 = _ _
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+%if cpuflag(ssse3)
mova m7, [base+pw_8192]
- pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
- pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
- pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
- punpcklwd m4, m0, m1 ; 3 4 ~
- punpcklwd m5, m1, m2 ; 4 5 ~
- punpcklwd m6, m2, m3 ; 5 6 ~
- ;
+%else
+ %if ARCH_X86_64
+ SWAP m0, m9
+ SWAP m7, m8
+ %else
+ mova m0, [esp+0x30]
+ mova m7, [base+pw_2]
+ %endif
+%endif
+ PMULHRSW_8192 m1, m4, m7
+ PMULHRSW_8192 m2, m5, m7
+ PMULHRSW_8192 m3, m6, m7
+ punpcklwd m4, m0, m1 ; 34
+ punpcklwd m5, m1, m2 ; 45
+ punpcklwd m6, m2, m3 ; 56
SAVELINE_W8 6, m3
RESTORELINE_W8 1, m1
RESTORELINE_W8 2, m2
RESTORELINE_W8 3, m3
.hv_w8_loop:
- ; m8 accu for V a
- ; m9 accu for V b
SAVELINE_W8 1, m3
SAVELINE_W8 2, m4
SAVELINE_W8 3, m5
@@ -3255,46 +3878,53 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
paddd m0, m5
paddd m7, m6
mova m5, [base+pd_32]
- paddd m0, m5 ; pd_512
- paddd m7, m5 ; pd_512
+ paddd m0, m5
+ paddd m7, m5
mova accuv0, m0
mova accuv1, m7
%else
- pmaddwd m8, m1, subpelv0 ; a0
- pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd accuv0, m1, subpelv0 ; a0
+ pmaddwd accuv1, m2, subpelv0 ; b0
pmaddwd m3, subpelv1 ; a1
pmaddwd m4, subpelv1 ; b1
- paddd m8, m3
- paddd m9, m4
+ paddd accuv0, m3
+ paddd accuv1, m4
pmaddwd m5, subpelv2 ; a2
pmaddwd m6, subpelv2 ; b2
- paddd m8, m5
- paddd m9, m6
+ paddd accuv0, m5
+ paddd accuv1, m6
mova m7, [base+pd_32]
- paddd m8, m7 ; pd_512
- paddd m9, m7 ; pd_512
+ paddd accuv0, m7
+ paddd accuv1, m7
+ %if cpuflag(ssse3)
mova m7, [base+subpel_h_shufB]
mova m6, [base+subpel_h_shufC]
mova m5, [base+subpel_h_shufA]
+ %define shufA m5
+ %define shufB m7
+ %define shufC m6
+ %endif
%endif
- movu m0, [srcq+strideq*1] ; 7
- movu m4, [srcq+strideq*2] ; 8
+ PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
+ PREP_8TAP_HV m4, srcq+strideq*2, m5, m6
lea srcq, [srcq+strideq*2]
- HV_H_W8 m0, m1, m2, m3, m5, m7, m6
- HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+%if cpuflag(ssse3)
mova m5, [base+pw_8192]
- pmulhrsw m0, m5 ; H pw_8192
- pmulhrsw m4, m5 ; H pw_8192
+%else
+ mova m5, [base+pw_2]
+%endif
+ PMULHRSW_8192 m0, m0, m5
+ PMULHRSW_8192 m4, m4, m5
RESTORELINE_W8 6, m6
- punpcklwd m5, m6, m0 ; 6 7 ~
- punpcklwd m6, m0, m4 ; 7 8 ~
+ punpcklwd m5, m6, m0 ; 67
+ punpcklwd m6, m0, m4 ; 78
pmaddwd m1, m5, subpelv3 ; a3
paddd m2, m1, accuv0
pmaddwd m1, m6, subpelv3 ; b3
- paddd m1, m1, accuv1 ; H + V
+ paddd m1, m1, accuv1
psrad m2, 6
psrad m1, 6
- packssdw m2, m1 ; d -> w
+ packssdw m2, m1
movq [tmpq+wq*0], m2
movhps [tmpq+wq*2], m2
lea tmpq, [tmpq+wq*4]
@@ -3323,6 +3953,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
sub r5d, 1<<16
jg .hv_w8_loop0
RET
+%endmacro
%if ARCH_X86_32
%macro SAVE_ALPHA_BETA 0
@@ -3393,7 +4024,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%endmacro
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
- ; Can be done using gathers, but that's terribly slow on many CPU:s
%if ARCH_X86_32
%define m8 m4
%define m9 m5
@@ -4031,20 +4661,6 @@ ALIGN function_align
ret
%endmacro
-INIT_XMM sse4
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM sse2
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-
%if WIN64
DECLARE_REG_TMP 6, 4
%else
@@ -5091,7 +5707,6 @@ cextern resize_filter
%endif
%endmacro
-INIT_XMM ssse3
%if ARCH_X86_64
cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0
@@ -5302,3 +5917,19 @@ cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
%endif
jg .loop_y
RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
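
The PMULHRSW_8192 macro used throughout the hv paths above is what lets a single macro body serve both instruction sets: the SSSE3 instantiation can multiply by pw_8192 with pmulhrsw, while the SSE2 one has no pmulhrsw and loads pw_2 instead, presumably rounding with an add and an arithmetic shift. A minimal scalar sketch of why the two forms are interchangeable; the macro's exact SSE2 body is not shown in this hunk, so the add-2-shift-by-2 form is an assumption:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the two rounding paths PMULHRSW_8192 appears to select
     * between. The real code operates on packed 16-bit lanes. */
    static int16_t round_pmulhrsw_8192(int16_t a) {
        /* pmulhrsw: (((a * b) >> 14) + 1) >> 1, here with b = 8192.
         * Relies on arithmetic right shift of negatives, as x86 compilers do. */
        int32_t t = ((int32_t)a * 8192) >> 14;
        return (int16_t)((t + 1) >> 1);
    }

    static int16_t round_add2_shift(int16_t a) {
        /* assumed SSE2 fallback: paddw with pw_2, then psraw by 2 */
        return (int16_t)((a + 2) >> 2);
    }

    int main(void) {
        for (int a = -32768; a < 32768; a++)
            assert(round_pmulhrsw_8192((int16_t)a) == round_add2_shift((int16_t)a));
        puts("both roundings agree");
        return 0;
    }

Both expressions reduce to floor((x + 2) / 4), which is the rounding the SSSE3-only code previously obtained from pmulhrsw with pw_8192.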
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm b/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm
index f6787148392..756e19b4bb9 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm
@@ -157,7 +157,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov [t7+msac.rng], t2d
not t4
sub t1d, ecx
- jge .end ; no refill required
+ jae .end ; no refill required
; refill:
mov t2, [t7+msac.buf]
@@ -504,7 +504,7 @@ cglobal msac_decode_bool, 0, 6, 0
mov [t7+msac.rng], t2d
not t4
sub t5d, ecx
- jge %%end
+ jae %%end
mov t2, [t7+msac.buf]
mov rcx, [t7+msac.end]
%if UNIX64 == 0
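
Both jge and jae branch on "no refill required" after the sub, but they read different flags: jge is the signed test (SF == OF), jae the unsigned one (carry clear, i.e. no borrow). A small C illustration of where the two predicates diverge; the operand values are invented for the demonstration and are not taken from the decoder state:

    #include <stdint.h>
    #include <stdio.h>

    /* After "sub a, b": jge branches when (int32_t)a >= (int32_t)b,
     * jae branches when a >= b as unsigned values. */
    static void check(uint32_t a, uint32_t b) {
        printf("a=%#x b=%#x  jge:%d  jae:%d\n", (unsigned)a, (unsigned)b,
               (int32_t)a >= (int32_t)b,  /* signed test (SF == OF) */
               a >= b);                   /* unsigned test (CF == 0) */
    }

    int main(void) {
        check(16, 5);          /* both agree: branch taken */
        check(3, 7);           /* both agree: branch not taken */
        check(0xfffffff0u, 5); /* disagree: jge falls through, jae branches */
        return 0;
    }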
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
index a9dafc757ce..a634da27c4e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
@@ -28,6 +28,7 @@
#include "src/msac.h"
#include "src/x86/msac.h"
+#if ARCH_X86_64
void dav1d_msac_init_x86(MsacContext *const s) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -39,4 +40,4 @@ void dav1d_msac_init_x86(MsacContext *const s) {
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
}
}
-
+#endif
diff --git a/chromium/third_party/dav1d/libdav1d/tools/dav1d.c b/chromium/third_party/dav1d/libdav1d/tools/dav1d.c
index 97c78014695..4b97a9f20f3 100644
--- a/chromium/third_party/dav1d/libdav1d/tools/dav1d.c
+++ b/chromium/third_party/dav1d/libdav1d/tools/dav1d.c
@@ -63,7 +63,9 @@ static uint64_t get_time_nanos(void) {
QueryPerformanceFrequency(&frequency);
LARGE_INTEGER t;
QueryPerformanceCounter(&t);
- return 1000000000 * t.QuadPart / frequency.QuadPart;
+ uint64_t seconds = t.QuadPart / frequency.QuadPart;
+ uint64_t fractions = t.QuadPart % frequency.QuadPart;
+ return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart;
#elif defined(HAVE_CLOCK_GETTIME)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -245,7 +247,7 @@ int main(const int argc, char *const *const argv) {
if ((res = output_write(out, &p)) < 0)
break;
n_out++;
- if (nspf) {
+ if (nspf || !cli_settings.quiet) {
synchronize(cli_settings.realtime, cli_settings.realtime_cache,
n_out, nspf, tfirst, &elapsed, frametimes);
}
@@ -282,7 +284,7 @@ int main(const int argc, char *const *const argv) {
if ((res = output_write(out, &p)) < 0)
break;
n_out++;
- if (nspf) {
+ if (nspf || !cli_settings.quiet) {
synchronize(cli_settings.realtime, cli_settings.realtime_cache,
n_out, nspf, tfirst, &elapsed, frametimes);
}
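
The get_time_nanos() change avoids 64-bit overflow: multiplying the raw QueryPerformanceCounter value by 10^9 before dividing wraps once the tick count passes roughly 1.8e10, which at a typical 10 MHz tick rate is only about half an hour of uptime. Splitting the count into whole seconds and a sub-second remainder keeps every intermediate in range for any realistic frequency. A standalone sketch of the same conversion with invented tick/frequency values:

    #include <stdint.h>
    #include <stdio.h>

    /* Split the tick count so the multiplication by 1e9 never sees the full
     * magnitude of the counter. */
    static uint64_t ticks_to_ns(uint64_t ticks, uint64_t freq) {
        const uint64_t seconds   = ticks / freq;
        const uint64_t fractions = ticks % freq;
        return 1000000000 * seconds + 1000000000 * fractions / freq;
    }

    int main(void) {
        const uint64_t freq  = 10000000;             /* 10 MHz counter */
        const uint64_t ticks = 3000000000000000ull;  /* several years of uptime */
        /* naive 1000000000 * ticks would overflow uint64_t here */
        printf("%llu ns\n", (unsigned long long)ticks_to_ns(ticks, freq));
        return 0;
    }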
diff --git a/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c b/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c
index 4221feee077..f363033edae 100644
--- a/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c
+++ b/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c
@@ -118,7 +118,7 @@ static void usage(const char *const app, const char *const reason, ...) {
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n"
- " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n"
+ " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n"
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
" --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n"
" --verify $md5: verify decoded md5. implies --muxer md5, no output\n"
diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/input.c b/chromium/third_party/dav1d/libdav1d/tools/input/input.c
index d8a56c1822f..3ed6983acee 100644
--- a/chromium/third_party/dav1d/libdav1d/tools/input/input.c
+++ b/chromium/third_party/dav1d/libdav1d/tools/input/input.c
@@ -82,6 +82,10 @@ int input_open(DemuxerContext **const c_out,
return DAV1D_ERR(ENOMEM);
}
FILE *f = fopen(filename, "rb");
+ if (!f) {
+ fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno));
+ return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
+ }
res = !!fread(probe_data, 1, probe_sz, f);
fclose(f);
if (!res) {
diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c b/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c
index 746391d4c12..7b572ee73c5 100644
--- a/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c
+++ b/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c
@@ -28,6 +28,7 @@
#include "config.h"
#include <errno.h>
+#include <limits.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
@@ -92,8 +93,27 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
break; // EOF
fseeko(c->f, rl32(data) + 8, SEEK_CUR);
}
- fps[0] = timebase[0] * *num_frames;
- fps[1] = timebase[1] * duration;
+
+ uint64_t fps_num = (uint64_t) timebase[0] * *num_frames;
+ uint64_t fps_den = (uint64_t) timebase[1] * duration;
+ if (fps_num && fps_den) { /* Reduce fraction */
+ uint64_t gcd = fps_num;
+ for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b);
+ fps_num /= gcd;
+ fps_den /= gcd;
+
+ while ((fps_num | fps_den) > UINT_MAX) {
+ fps_num >>= 1;
+ fps_den >>= 1;
+ }
+ }
+ if (fps_num && fps_den) {
+ fps[0] = (unsigned) fps_num;
+ fps[1] = (unsigned) fps_den;
+ } else {
+ fps[0] = fps[1] = 0;
+ }
+
fseeko(c->f, 32, SEEK_SET);
return 0;
diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/parse.h b/chromium/third_party/dav1d/libdav1d/tools/input/parse.h
index bebea21daf7..f5805e8ca45 100644
--- a/chromium/third_party/dav1d/libdav1d/tools/input/parse.h
+++ b/chromium/third_party/dav1d/libdav1d/tools/input/parse.h
@@ -29,22 +29,24 @@
#ifndef DAV1D_INPUT_PARSE_H
#define DAV1D_INPUT_PARSE_H
+#include <limits.h>
+
#include "dav1d/headers.h"
static int leb128(FILE *const f, size_t *const len) {
+ uint64_t val = 0;
unsigned i = 0, more;
- *len = 0;
do {
- uint8_t byte;
- if (fread(&byte, 1, 1, f) < 1)
+ uint8_t v;
+ if (fread(&v, 1, 1, f) < 1)
return -1;
- more = byte & 0x80;
- const unsigned bits = byte & 0x7f;
- if (i <= 3 || (i == 4 && bits < (1 << 4)))
- *len |= bits << (i * 7);
- else if (bits) return -1;
- if (++i == 8 && more) return -1;
- } while (more);
+ more = v & 0x80;
+ val |= ((uint64_t) (v & 0x7F)) << (i * 7);
+ i++;
+ } while (more && i < 8);
+ if (val > UINT_MAX || more)
+ return -1;
+ *len = (size_t) val;
return i;
}
@@ -52,18 +54,18 @@ static int leb128(FILE *const f, size_t *const len) {
// with author's permission
static int leb(const uint8_t *ptr, int sz, size_t *const len) {
+ uint64_t val = 0;
unsigned i = 0, more;
- *len = 0;
do {
if (!sz--) return -1;
- const int byte = *ptr++;
- more = byte & 0x80;
- const unsigned bits = byte & 0x7f;
- if (i <= 3 || (i == 4 && bits < (1 << 4)))
- *len |= bits << (i * 7);
- else if (bits) return -1;
- if (++i == 8 && more) return -1;
- } while (more);
+ const int v = *ptr++;
+ more = v & 0x80;
+ val |= ((uint64_t) (v & 0x7F)) << (i * 7);
+ i++;
+ } while (more && i < 8);
+ if (val > UINT_MAX || more)
+ return -1;
+ *len = (size_t) val;
return i;
}
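
The rewritten helpers accumulate up to eight 7-bit groups into a 64-bit value and reject the result if the sequence is unterminated or exceeds UINT_MAX. A standalone copy of the buffer-based variant, exercised with the textbook LEB128 test vector (0xE5 0x8E 0x26 encodes 624485):

    #include <limits.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same logic as the rewritten leb() above, kept here only to show the
     * accumulation and range checks against a concrete input. */
    static int leb(const uint8_t *ptr, int sz, size_t *const len) {
        uint64_t val = 0;
        unsigned i = 0, more;
        do {
            if (!sz--) return -1;
            const int v = *ptr++;
            more = v & 0x80;
            val |= ((uint64_t)(v & 0x7F)) << (i * 7);
            i++;
        } while (more && i < 8);
        if (val > UINT_MAX || more)
            return -1;
        *len = (size_t)val;
        return i;
    }

    int main(void) {
        const uint8_t buf[] = { 0xE5, 0x8E, 0x26 };
        size_t len;
        const int used = leb(buf, (int)sizeof(buf), &len);
        printf("consumed %d bytes, value %zu\n", used, len); /* 3, 624485 */
        return 0;
    }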