Diffstat (limited to 'chromium/third_party/dav1d')
52 files changed, 12595 insertions, 1611 deletions
diff --git a/chromium/third_party/dav1d/dav1d_generated.gni b/chromium/third_party/dav1d/dav1d_generated.gni index 8becac02308..9e15c31fc28 100644 --- a/chromium/third_party/dav1d/dav1d_generated.gni +++ b/chromium/third_party/dav1d/dav1d_generated.gni @@ -20,7 +20,7 @@ x86_asm_sources = [ "libdav1d/src/x86/looprestoration.asm", "libdav1d/src/x86/looprestoration_ssse3.asm", "libdav1d/src/x86/mc.asm", - "libdav1d/src/x86/mc_ssse3.asm", + "libdav1d/src/x86/mc_sse.asm", "libdav1d/src/x86/msac.asm", ] @@ -37,9 +37,11 @@ x86_template_sources = [ arm32_asm_sources = [ "libdav1d/src/arm/32/cdef.S", "libdav1d/src/arm/32/ipred.S", + "libdav1d/src/arm/32/itx.S", "libdav1d/src/arm/32/loopfilter.S", "libdav1d/src/arm/32/looprestoration.S", "libdav1d/src/arm/32/mc.S", + "libdav1d/src/arm/32/msac.S", "libdav1d/src/arm/32/util.S", ] @@ -50,6 +52,7 @@ arm64_asm_sources = [ "libdav1d/src/arm/64/ipred.S", "libdav1d/src/arm/64/ipred16.S", "libdav1d/src/arm/64/itx.S", + "libdav1d/src/arm/64/itx16.S", "libdav1d/src/arm/64/loopfilter.S", "libdav1d/src/arm/64/loopfilter16.S", "libdav1d/src/arm/64/looprestoration.S", diff --git a/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml b/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml index bdef928a40d..c921b6a122f 100644 --- a/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml +++ b/chromium/third_party/dav1d/libdav1d/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: - test .debian-amd64-common: - image: registry.videolan.org/dav1d-debian-unstable:20200306210534 + image: registry.videolan.org/dav1d-debian-unstable:20200602183013 stage: build tags: - docker @@ -52,6 +52,7 @@ stages: - docker - amd64 + style-check: extends: .debian-amd64-common stage: style @@ -80,6 +81,7 @@ style-check: fi; done + build-debian: extends: .debian-amd64-common tags: @@ -91,6 +93,10 @@ build-debian: --werror - ninja -C build - cd build && meson test -v + artifacts: + paths: + - build/ + expire_in: 1 day build-debian-static: extends: .debian-amd64-common @@ -110,6 +116,10 @@ build-debian32: --cross-file package/crossfiles/i686-linux32.meson - ninja -C build - cd build && meson test -v + artifacts: + paths: + - build/ + expire_in: 1 day build-debian-examples: extends: .debian-amd64-common @@ -331,6 +341,7 @@ build-debian-ppc64le: - ninja -C build - cd build && meson test -v + .test-common: stage: test cache: @@ -344,6 +355,25 @@ build-debian-ppc64le: - git clone cache/dav1d-test-data.git tests/dav1d-test-data dependencies: [] +.test-asm-common: + extends: + - .debian-amd64-common + - .test-common + tags: + - docker + - amd64 + - avx2 + script: + - meson configure build -Dtestdata_tests=true + - cd build + - exit_code=0 + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2" || exit_code=$((exit_code + $?)) + - if [ $exit_code -ne 0 ]; then exit $exit_code; fi + test-debian: extends: - .debian-amd64-common @@ -353,8 +383,48 @@ test-debian: - meson build --buildtype release 
-Dtestdata_tests=true -Dlogging=false + -Db_coverage=true - ninja -C build - cd build && time meson test -v + - ninja coverage-html + - mv meson-logs/coveragereport ../coverage + - ninja coverage-xml + - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 | + grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } ' + coverage: '/^coverage: (\d+.\d+)$/' + artifacts: + expose_as: 'Coverage HTML report' + paths: + - coverage/ + reports: + cobertura: build/meson-logs/coverage.xml + +test-debian-asm: + extends: + - .test-asm-common + needs: ["build-debian"] + dependencies: ["build-debian"] + +test-debian32-asm: + extends: + - .test-asm-common + needs: ["build-debian32"] + dependencies: ["build-debian32"] + +test-debian-mt: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + dependencies: ["build-debian"] + script: + - meson configure build -Dtestdata_tests=true + - cd build + - exit_code=0 + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?)) + - if [ $exit_code -ne 0 ]; then exit $exit_code; fi test-debian-unaligned-stack: extends: @@ -482,6 +552,7 @@ test-debian-armv7-clang-5: - ninja -C build - cd build && time meson test -v + .pages-common: extends: .debian-amd64-common script: diff --git a/chromium/third_party/dav1d/libdav1d/NEWS b/chromium/third_party/dav1d/libdav1d/NEWS index 46695fd7ea2..1294dc52caf 100644 --- a/chromium/third_party/dav1d/libdav1d/NEWS +++ b/chromium/third_party/dav1d/libdav1d/NEWS @@ -1,3 +1,33 @@ +Changes for 0.7.1 'Frigatebird': +------------------------------ + +0.7.1 is a minor update on 0.7.0: + - ARM32 NEON optimizations for itxfm, which can give up to 28% speedup, and MSAC + - SSE2 optimizations for prep_bilin and prep_8tap + - AVX2 optimizations for MC scaled + - Fix a clamping issue in motion vector projection + - Fix an issue on some specific Haswell CPU on ipred_z AVX2 functions + - Improvements on the dav1dplay utility player to support resizing + + +Changes for 0.7.0 'Frigatebird': +------------------------------ + +0.7.0 is a major release for dav1d: + - Faster refmv implementation gaining up to 12% speed while -25% of RAM (Single Thread) + - 10b/12b ARM64 optimizations are mostly complete: + - ipred (paeth, smooth, dc, pal, filter, cfl) + - itxfm (only 10b) + - AVX2/SSSE3 for non-4:2:0 film grain and for mc.resize + - AVX2 for cfl4:4:4 + - AVX-512 CDEF filter + - ARM64 8b improvements for cfl_ac and itxfm + - ARM64 implementation for emu_edge in 8b/10b/12b + - ARM32 implementation for emu_edge in 8b + - Improvements on the dav1dplay utility player to support 10 bit, + non-4:2:0 pixel formats and film grain on the GPU + + Changes for 0.6.0 'Gyrfalcon': ------------------------------ diff --git a/chromium/third_party/dav1d/libdav1d/doc/meson.build b/chromium/third_party/dav1d/libdav1d/doc/meson.build index 4badbf6ea91..0ef7123448a 100644 --- a/chromium/third_party/dav1d/libdav1d/doc/meson.build +++ b/chromium/third_party/dav1d/libdav1d/doc/meson.build @@ -27,8 +27,8 @@ dot = find_program('dot', required: false) if doxygen.found() and dot.found() conf_data = configuration_data() - 
conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d')) - conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include')) + conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d')) + conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include')) conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir()) doxyfile = configure_file(input: 'Doxyfile.in', output: 'Doxyfile', diff --git a/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c b/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c index bcd4835b320..d6bb262b56c 100644 --- a/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c +++ b/chromium/third_party/dav1d/libdav1d/examples/dav1dplay.c @@ -29,687 +29,18 @@ #include <getopt.h> #include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> #include <SDL.h> -#include "common/attributes.h" - #include "dav1d/dav1d.h" +#include "common/attributes.h" #include "tools/input/input.h" +#include "dp_fifo.h" +#include "dp_renderer.h" -/** - * Settings structure - * Hold all settings available for the player, - * this is usually filled by parsing arguments - * from the console. - */ -typedef struct { - const char *inputfile; - int highquality; - int untimed; - int zerocopy; -} Dav1dPlaySettings; - -#define WINDOW_WIDTH 910 -#define WINDOW_HEIGHT 512 - -#define DAV1D_EVENT_NEW_FRAME 1 -#define DAV1D_EVENT_DEC_QUIT 2 - -/* - * Fifo helper functions - */ -typedef struct dp_fifo -{ - SDL_mutex *lock; - SDL_cond *cond_change; - size_t capacity; - size_t count; - void **entries; -} Dav1dPlayPtrFifo; - -static void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo) -{ - assert(fifo->count == 0); - SDL_DestroyMutex(fifo->lock); - SDL_DestroyCond(fifo->cond_change); - free(fifo->entries); - free(fifo); -} - -static Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity) -{ - Dav1dPlayPtrFifo *fifo; - - assert(capacity > 0); - if (capacity <= 0) - return NULL; - - fifo = malloc(sizeof(*fifo)); - if (fifo == NULL) - return NULL; - - fifo->capacity = capacity; - fifo->count = 0; - - fifo->lock = SDL_CreateMutex(); - if (fifo->lock == NULL) { - free(fifo); - return NULL; - } - fifo->cond_change = SDL_CreateCond(); - if (fifo->cond_change == NULL) { - SDL_DestroyMutex(fifo->lock); - free(fifo); - return NULL; - } - - fifo->entries = calloc(capacity, sizeof(void*)); - if (fifo->entries == NULL) { - dp_fifo_destroy(fifo); - return NULL; - } - - return fifo; -} - -static void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element) -{ - SDL_LockMutex(fifo->lock); - while (fifo->count == fifo->capacity) - SDL_CondWait(fifo->cond_change, fifo->lock); - fifo->entries[fifo->count++] = element; - if (fifo->count == 1) - SDL_CondSignal(fifo->cond_change); - SDL_UnlockMutex(fifo->lock); -} - -static void *dp_fifo_array_shift(void **arr, size_t len) -{ - void *shifted_element = arr[0]; - for (size_t i = 1; i < len; ++i) - arr[i-1] = arr[i]; - return shifted_element; -} - -static void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo) -{ - SDL_LockMutex(fifo->lock); - while (fifo->count == 0) - SDL_CondWait(fifo->cond_change, fifo->lock); - void *res = dp_fifo_array_shift(fifo->entries, fifo->count--); - if (fifo->count == fifo->capacity - 1) - SDL_CondSignal(fifo->cond_change); - SDL_UnlockMutex(fifo->lock); - return res; -} - -/** - * Renderer info - */ -typedef struct rdr_info -{ - // Cookie passed to the renderer implementation callbacks - void *cookie; - // Callback to create the renderer - void* (*create_renderer)(void *data); - // Callback to destroy 
the renderer - void (*destroy_renderer)(void *cookie); - // Callback to the render function that renders a prevously sent frame - void (*render)(void *cookie, const Dav1dPlaySettings *settings); - // Callback to the send frame function - int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings); - // Callback for alloc/release pictures (optional) - int (*alloc_pic)(Dav1dPicture *pic, void *cookie); - void (*release_pic)(Dav1dPicture *pic, void *cookie); -} Dav1dPlayRenderInfo; - -#ifdef HAVE_PLACEBO_VULKAN - -#include <libplacebo/renderer.h> -#include <libplacebo/utils/upload.h> -#include <libplacebo/vulkan.h> -#include <SDL_vulkan.h> - - -/** - * Renderer context for libplacebo - */ -typedef struct renderer_priv_ctx -{ - // Placebo context - struct pl_context *ctx; - // Placebo renderer - struct pl_renderer *renderer; - // Placebo Vulkan handle - const struct pl_vulkan *vk; - // Placebo Vulkan instance - const struct pl_vk_inst *vk_inst; - // Vulkan surface - VkSurfaceKHR surf; - // Placebo swapchain - const struct pl_swapchain *swapchain; - // Lock protecting access to the texture - SDL_mutex *lock; - // Planes to render - struct pl_plane y_plane; - struct pl_plane u_plane; - struct pl_plane v_plane; - // Textures to render - const struct pl_tex *y_tex; - const struct pl_tex *u_tex; - const struct pl_tex *v_tex; -} Dav1dPlayRendererPrivateContext; - -static void *placebo_renderer_create(void *data) -{ - // Alloc - Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); - if (rd_priv_ctx == NULL) { - return NULL; - } - - // Init libplacebo - rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) { - .log_cb = pl_log_color, -#ifndef NDEBUG - .log_level = PL_LOG_DEBUG, -#else - .log_level = PL_LOG_WARN, -#endif - }); - if (rd_priv_ctx->ctx == NULL) { - free(rd_priv_ctx); - return NULL; - } - - // Create Mutex - rd_priv_ctx->lock = SDL_CreateMutex(); - if (rd_priv_ctx->lock == NULL) { - fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); - pl_context_destroy(&(rd_priv_ctx->ctx)); - free(rd_priv_ctx); - return NULL; - } - - // Init Vulkan - struct pl_vk_inst_params iparams = pl_vk_inst_default_params; - - SDL_Window *sdlwin = data; - - unsigned num = 0; - if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) { - fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError()); - exit(1); - } - - iparams.extensions = malloc(num * sizeof(const char *)); - iparams.num_extensions = num; - assert(iparams.extensions); - - SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, iparams.extensions); - if (!ok) { - fprintf(stderr, "Failed getting Vk instance extensions\n"); - exit(1); - } - - if (num > 0) { - printf("Requesting %d additional Vulkan extensions:\n", num); - for (unsigned i = 0; i < num; i++) - printf(" %s\n", iparams.extensions[i]); - } - - rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams); - if (!rd_priv_ctx->vk_inst) { - fprintf(stderr, "Failed creating Vulkan instance!\n"); - exit(1); - } - free(iparams.extensions); - - if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) { - fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError()); - exit(1); - } - - struct pl_vulkan_params params = pl_vulkan_default_params; - params.instance = rd_priv_ctx->vk_inst->instance; - params.surface = rd_priv_ctx->surf; - params.allow_software = true; - - rd_priv_ctx->vk = 
pl_vulkan_create(rd_priv_ctx->ctx, &params); - if (!rd_priv_ctx->vk) { - fprintf(stderr, "Failed creating vulkan device!\n"); - exit(2); - } - - // Create swapchain - rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk, - &(struct pl_vulkan_swapchain_params) { - .surface = rd_priv_ctx->surf, - .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR, - }); - - if (!rd_priv_ctx->swapchain) { - fprintf(stderr, "Failed creating vulkan swapchain!\n"); - exit(2); - } - - int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; - if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { - fprintf(stderr, "Failed resizing vulkan swapchain!\n"); - exit(2); - } - - if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) - printf("Note: window dimensions differ (got %dx%d)\n", w, h); - - rd_priv_ctx->y_tex = NULL; - rd_priv_ctx->u_tex = NULL; - rd_priv_ctx->v_tex = NULL; - - rd_priv_ctx->renderer = NULL; - - return rd_priv_ctx; -} - -static void placebo_renderer_destroy(void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - pl_renderer_destroy(&(rd_priv_ctx->renderer)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_tex)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_tex)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_tex)); - pl_swapchain_destroy(&(rd_priv_ctx->swapchain)); - pl_vulkan_destroy(&(rd_priv_ctx->vk)); - vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); - pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); - pl_context_destroy(&(rd_priv_ctx->ctx)); -} - -static void placebo_render(void *cookie, const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - if (rd_priv_ctx->y_tex == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - // Prepare rendering - if (rd_priv_ctx->renderer == NULL) { - rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->vk->gpu); - } - - struct pl_swapchain_frame frame; - bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame); - if (!ok) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - const struct pl_tex *img = rd_priv_ctx->y_plane.texture; - struct pl_image image = { - .num_planes = 3, - .planes = { rd_priv_ctx->y_plane, rd_priv_ctx->u_plane, rd_priv_ctx->v_plane }, - .repr = pl_color_repr_hdtv, - .color = pl_color_space_unknown, - .width = img->params.w, - .height = img->params.h, - }; - - struct pl_render_params render_params = {0}; - if (settings->highquality) - render_params = pl_render_default_params; - - struct pl_render_target target; - pl_render_target_from_swapchain(&target, &frame); - target.profile = (struct pl_icc_profile) { - .data = NULL, - .len = 0, - }; - - if (!pl_render_image(rd_priv_ctx->renderer, &image, &target, &render_params)) { - fprintf(stderr, "Failed rendering frame!\n"); - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); - if (!ok) { - fprintf(stderr, "Failed submitting frame!\n"); - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - pl_swapchain_swap_buffers(rd_priv_ctx->swapchain); - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (dav1d_pic == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return
0; - } - - int width = dav1d_pic->p.w; - int height = dav1d_pic->p.h; - - enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; - - if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { - fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); - exit(50); - } - - struct pl_plane_data data_y = { - .type = PL_FMT_UNORM, - .width = width, - .height = height, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[0], - .component_size = {8}, - .component_map = {0}, - }; - - struct pl_plane_data data_u = { - .type = PL_FMT_UNORM, - .width = width/2, - .height = height/2, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[1], - .component_size = {8}, - .component_map = {1}, - }; - - struct pl_plane_data data_v = { - .type = PL_FMT_UNORM, - .width = width/2, - .height = height/2, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[1], - .component_size = {8}, - .component_map = {2}, - }; - - if (settings->zerocopy) { - const struct pl_buf *buf = dav1d_pic->allocator_data; - assert(buf); - data_y.buf = data_u.buf = data_v.buf = buf; - data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data; - data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data; - data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data; - } else { - data_y.pixels = dav1d_pic->data[0]; - data_u.pixels = dav1d_pic->data[1]; - data_v.pixels = dav1d_pic->data[2]; - } - - bool ok = true; - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y); - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u); - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_plane), &(rd_priv_ctx->v_tex), &data_v); - - pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->u_plane.shift_x, &rd_priv_ctx->u_plane.shift_y); - pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->v_plane.shift_x, &rd_priv_ctx->v_plane.shift_y); - - if (!ok) { - fprintf(stderr, "Failed uploading planes!\n"); - } - - SDL_UnlockMutex(rd_priv_ctx->lock); - return !ok; -} - -// Align to power of 2 -#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) - -static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - SDL_LockMutex(rd_priv_ctx->lock); - - const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu; - int ret = DAV1D_ERR(ENOMEM); - - // Copied from dav1d_default_picture_alloc - const int hbd = p->p.bpc > 8; - const int aligned_w = ALIGN2(p->p.w, 128); - const int aligned_h = ALIGN2(p->p.h, 128); - const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; - const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; - p->stride[0] = aligned_w << hbd; - p->stride[1] = has_chroma ? 
(aligned_w >> ss_hor) << hbd : 0; - - // Align strides up to multiples of the GPU performance hints - p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride); - p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride); - - // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2) - size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4); - const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align); - const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); - - // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, - // even in the case that the driver gives us insane alignments - const size_t pic_size = y_sz + 2 * uv_sz; - const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; - - // Validate size limitations - if (total_size > gpu->limits.max_xfer_size) { - printf("alloc of %zu bytes exceeds limits\n", total_size); - goto err; - } - - const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) { - .type = PL_BUF_TEX_TRANSFER, - .host_mapped = true, - .size = total_size, - .memory_type = PL_BUF_MEM_HOST, - .user_data = p, - }); - - if (!buf) { - printf("alloc of GPU mapped buffer failed\n"); - goto err; - } - - assert(buf->data); - uintptr_t base = (uintptr_t) buf->data, data[3]; - data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); - data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); - data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); - - // Sanity check offset alignment for the sake of debugging - if (data[0] - base != ALIGN2(data[0] - base, off_align) || - data[1] - base != ALIGN2(data[1] - base, off_align) || - data[2] - base != ALIGN2(data[2] - base, off_align)) - { - printf("GPU buffer horribly misaligned, expect slowdown!\n"); - } - - p->allocator_data = (void *) buf; - p->data[0] = (void *) data[0]; - p->data[1] = (void *) data[1]; - p->data[2] = (void *) data[2]; - ret = 0; - - // fall through -err: - SDL_UnlockMutex(rd_priv_ctx->lock); - return ret; -} - -static void placebo_release_pic(Dav1dPicture *pic, void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - assert(pic->allocator_data); - - SDL_LockMutex(rd_priv_ctx->lock); - const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu; - pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static const Dav1dPlayRenderInfo renderer_info = { - .create_renderer = placebo_renderer_create, - .destroy_renderer = placebo_renderer_destroy, - .render = placebo_render, - .update_frame = placebo_upload_planes, - .alloc_pic = placebo_alloc_pic, - .release_pic = placebo_release_pic, -}; - -#else - -/** - * Renderer context for SDL - */ -typedef struct renderer_priv_ctx -{ - // SDL renderer - SDL_Renderer *renderer; - // Lock protecting access to the texture - SDL_mutex *lock; - // Texture to render - SDL_Texture *tex; -} Dav1dPlayRendererPrivateContext; - -static void *sdl_renderer_create(void *data) -{ - SDL_Window *win = data; - - // Alloc - Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); - if (rd_priv_ctx == NULL) { - return NULL; - } - - // Create renderer - rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED); - // Set scale quality - SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear"); - - // Create Mutex - rd_priv_ctx->lock = SDL_CreateMutex(); - if (rd_priv_ctx->lock == NULL) { - fprintf(stderr, "SDL_CreateMutex failed: %s\n", 
SDL_GetError()); - free(rd_priv_ctx); - return NULL; - } - - rd_priv_ctx->tex = NULL; - - return rd_priv_ctx; -} - -static void sdl_renderer_destroy(void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_DestroyRenderer(rd_priv_ctx->renderer); - SDL_DestroyMutex(rd_priv_ctx->lock); - free(rd_priv_ctx); -} - -static void sdl_render(void *cookie, const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (rd_priv_ctx->tex == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - // Display the frame - SDL_RenderClear(rd_priv_ctx->renderer); - SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL); - SDL_RenderPresent(rd_priv_ctx->renderer); - - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (dav1d_pic == NULL) { - rd_priv_ctx->tex = NULL; - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; - } - - int width = dav1d_pic->p.w; - int height = dav1d_pic->p.h; - int tex_w = width; - int tex_h = height; - - enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; - - if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { - fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); - exit(50); - } - - SDL_Texture *texture = rd_priv_ctx->tex; - if (texture != NULL) { - SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h); - if (tex_w != width || tex_h != height) { - SDL_DestroyTexture(texture); - texture = NULL; - } - } - - if (texture == NULL) { - texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, - SDL_TEXTUREACCESS_STREAMING, width, height); - } - - SDL_UpdateYUVTexture(texture, NULL, - dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y - dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U - dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V - ); - - rd_priv_ctx->tex = texture; - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; -} - -static const Dav1dPlayRenderInfo renderer_info = { - .create_renderer = sdl_renderer_create, - .destroy_renderer = sdl_renderer_destroy, - .render = sdl_render, - .update_frame = sdl_update_texture -}; - -#endif +// Selected renderer callbacks and cookie +static const Dav1dPlayRenderInfo *renderer_info = { NULL }; /** * Render context structure @@ -722,8 +53,6 @@ typedef struct render_context Dav1dPlaySettings settings; Dav1dSettings lib_settings; - // Renderer callbacks - Dav1dPlayRenderInfo *renderer_info; // Renderer private data (passed to callbacks) void *rd_priv; @@ -768,7 +97,9 @@ static void dp_settings_print_usage(const char *const app, " --tilethreads $num: number of tile threads (default: 1)\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" - " --version/-v: print version and exit\n"); + " --gpugrain/-g: enable GPU grain synthesis\n" + " --version/-v: print version and exit\n" + " --renderer/-r: select renderer backend (default: auto)\n"); exit(1); } @@ -791,7 +122,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, Dav1dSettings *lib_settings = &rd_ctx->lib_settings; // Short options - static const char short_opts[] = "i:vuz"; + static const char short_opts[] = "i:vuzgr:"; enum { ARG_FRAME_THREADS = 256, @@ 
-808,6 +139,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, + { "gpugrain", 0, NULL, 'g' }, + { "renderer", 0, NULL, 'r'}, { NULL, 0, NULL, 0 }, }; @@ -824,15 +157,15 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, break; case ARG_HIGH_QUALITY: settings->highquality = true; -#ifndef HAVE_PLACEBO_VULKAN - fprintf(stderr, "warning: --highquality requires libplacebo\n"); -#endif break; case 'z': settings->zerocopy = true; -#ifndef HAVE_PLACEBO_VULKAN - fprintf(stderr, "warning: --zerocopy requires libplacebo\n"); -#endif + break; + case 'g': + settings->gpugrain = true; + break; + case 'r': + settings->renderer_name = optarg; break; case ARG_FRAME_THREADS: lib_settings->n_frame_threads = @@ -852,6 +185,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, "Extra/unused arguments found, e.g. '%s'\n", argv[optind]); if (!settings->inputfile) dp_settings_print_usage(argv[0], "Input file (-i/--input) is required"); + if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0) + settings->renderer_name = NULL; } /** @@ -861,7 +196,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx) { assert(rd_ctx != NULL); - renderer_info.destroy_renderer(rd_ctx->rd_priv); + renderer_info->destroy_renderer(rd_ctx->rd_priv); dp_fifo_destroy(rd_ctx->fifo); SDL_DestroyMutex(rd_ctx->lock); free(rd_ctx); @@ -873,7 +208,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx) * \note The Dav1dPlayRenderContext must be destroyed * again by using dp_rd_ctx_destroy. */ -static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) +static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv) { Dav1dPlayRenderContext *rd_ctx; @@ -907,7 +242,22 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) return NULL; } - rd_ctx->rd_priv = renderer_info.create_renderer(rd_data); + // Parse and validate arguments + dav1d_default_settings(&rd_ctx->lib_settings); + memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); + dp_rd_ctx_parse_args(rd_ctx, argc, argv); + + // Select renderer + renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name); + + if (renderer_info == NULL) { + printf("No suitable rendered matching %s found.\n", + (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto"); + } else { + printf("Using %s renderer\n", renderer_info->name); + } + + rd_ctx->rd_priv = (renderer_info) ? 
renderer_info->create_renderer() : NULL; if (rd_ctx->rd_priv == NULL) { SDL_DestroyMutex(rd_ctx->lock); dp_fifo_destroy(rd_ctx->fifo); @@ -915,9 +265,6 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) return NULL; } - dav1d_default_settings(&rd_ctx->lib_settings); - memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); - rd_ctx->last_pts = 0; rd_ctx->last_ticks = 0; rd_ctx->current_pts = 0; @@ -949,7 +296,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code) static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx, Dav1dPicture *dav1d_pic) { - renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); + renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); rd_ctx->current_pts = dav1d_pic->m.timestamp; } @@ -1004,7 +351,7 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx) fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000); } - renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings); + renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings); rd_ctx->last_ticks = SDL_GetTicks(); } @@ -1152,7 +499,6 @@ cleanup: int main(int argc, char **argv) { SDL_Thread *decoder_thread; - SDL_Window *win = NULL; // Check for version mismatch between library and tool const char *version = dav1d_version(); @@ -1166,34 +512,30 @@ int main(int argc, char **argv) if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) return 10; - // Create Window and Renderer - int window_flags = SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI; -#ifdef HAVE_PLACEBO_VULKAN - window_flags |= SDL_WINDOW_VULKAN; -#endif - win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, - WINDOW_WIDTH, WINDOW_HEIGHT, window_flags); - SDL_SetWindowResizable(win, SDL_TRUE); - // Create render context - Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(win); + Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv); if (rd_ctx == NULL) { fprintf(stderr, "Failed creating render context\n"); return 5; } - // Parse and validate arguments - dp_rd_ctx_parse_args(rd_ctx, argc, argv); - if (rd_ctx->settings.zerocopy) { - if (renderer_info.alloc_pic) { + if (renderer_info->alloc_pic) { rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) { .cookie = rd_ctx->rd_priv, - .alloc_picture_callback = renderer_info.alloc_pic, - .release_picture_callback = renderer_info.release_pic, + .alloc_picture_callback = renderer_info->alloc_pic, + .release_picture_callback = renderer_info->release_pic, }; } else { - fprintf(stderr, "--zerocopy unsupported by compiled renderer\n"); + fprintf(stderr, "--zerocopy unsupported by selected renderer\n"); + } + } + + if (rd_ctx->settings.gpugrain) { + if (renderer_info->supports_gpu_grain) { + rd_ctx->lib_settings.apply_grain = 0; + } else { + fprintf(stderr, "--gpugrain unsupported by selected renderer\n"); } } @@ -1207,6 +549,10 @@ int main(int argc, char **argv) if (SDL_WaitEvent(&e)) { if (e.type == SDL_QUIT) { dp_rd_ctx_request_shutdown(rd_ctx); + } else if (e.type == SDL_WINDOWEVENT) { + if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { + // TODO: Handle window resizes + } } else if (e.type == rd_ctx->renderer_event_type) { if (e.user.code == DAV1D_EVENT_NEW_FRAME) { // Dequeue frame and update the render context with it @@ -1232,7 +578,6 @@ int main(int argc, char **argv) SDL_WaitThread(decoder_thread, &decoder_ret); dp_rd_ctx_destroy(rd_ctx); - SDL_DestroyWindow(win); return decoder_ret; } diff --git 
a/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c new file mode 100644 index 00000000000..243d2e933bc --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.c @@ -0,0 +1,123 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <SDL.h> +#include <assert.h> + +#include "dp_fifo.h" + +// FIFO structure +struct dp_fifo +{ + SDL_mutex *lock; + SDL_cond *cond_change; + size_t capacity; + size_t count; + void **entries; +}; + + +Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity) +{ + Dav1dPlayPtrFifo *fifo; + + assert(capacity > 0); + if (capacity <= 0) + return NULL; + + fifo = malloc(sizeof(*fifo)); + if (fifo == NULL) + return NULL; + + fifo->capacity = capacity; + fifo->count = 0; + + fifo->lock = SDL_CreateMutex(); + if (fifo->lock == NULL) { + free(fifo); + return NULL; + } + fifo->cond_change = SDL_CreateCond(); + if (fifo->cond_change == NULL) { + SDL_DestroyMutex(fifo->lock); + free(fifo); + return NULL; + } + + fifo->entries = calloc(capacity, sizeof(void*)); + if (fifo->entries == NULL) { + dp_fifo_destroy(fifo); + return NULL; + } + + return fifo; +} + +// Destroy FIFO +void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo) +{ + assert(fifo->count == 0); + SDL_DestroyMutex(fifo->lock); + SDL_DestroyCond(fifo->cond_change); + free(fifo->entries); + free(fifo); +} + +// Push to FIFO +void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element) +{ + SDL_LockMutex(fifo->lock); + while (fifo->count == fifo->capacity) + SDL_CondWait(fifo->cond_change, fifo->lock); + fifo->entries[fifo->count++] = element; + if (fifo->count == 1) + SDL_CondSignal(fifo->cond_change); + SDL_UnlockMutex(fifo->lock); +} + +// Helper that shifts the FIFO array +static void *dp_fifo_array_shift(void **arr, size_t len) +{ + void *shifted_element = arr[0]; + for (size_t i = 1; i < len; ++i) + arr[i-1] = arr[i]; + return shifted_element; +} + +// Get item from FIFO +void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo) +{ + SDL_LockMutex(fifo->lock); + while (fifo->count == 0) + SDL_CondWait(fifo->cond_change, fifo->lock); + void *res = dp_fifo_array_shift(fifo->entries, fifo->count--); + if (fifo->count == fifo->capacity - 1) 
+ SDL_CondSignal(fifo->cond_change); + SDL_UnlockMutex(fifo->lock); + return res; +} + + diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h new file mode 100644 index 00000000000..a94b089b20c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/examples/dp_fifo.h @@ -0,0 +1,61 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Dav1dPlay FIFO helper + */ + +typedef struct dp_fifo Dav1dPlayPtrFifo; + +/* Create a FIFO + * + * Creates a FIFO with the given capacity. + * If the capacity is reached, new inserts into the FIFO + * will block until enough space is available again. + */ +Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity); + +/* Destroy a FIFO + * + * The FIFO must be empty before it is destroyed! + */ +void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo); + +/* Shift FIFO + * + * Return the first item from the FIFO, thereby removing it from + * the FIFO and making room for new entries. + */ +void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo); + +/* Push to FIFO + * + * Add an item to the end of the FIFO. + * If the FIFO is full, this call will block until there is again enough + * space in the FIFO, so calling this from the "consumer" thread if no + * other thread will call dp_fifo_shift will lead to a deadlock. + */ +void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element); diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h new file mode 100644 index 00000000000..4c6f2954f7a --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer.h @@ -0,0 +1,132 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <inttypes.h> +#include <string.h> + +#include "dav1d/dav1d.h" + +#include <SDL.h> +#ifdef HAVE_PLACEBO +# include <libplacebo/config.h> +#endif + +// Check libplacebo Vulkan rendering +#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN) +# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN +# define HAVE_RENDERER_PLACEBO +# define HAVE_PLACEBO_VULKAN +# endif +#endif + +// Check libplacebo OpenGL rendering +#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL +# define HAVE_RENDERER_PLACEBO +# define HAVE_PLACEBO_OPENGL +#endif + +/** + * Settings structure + * Hold all settings available for the player, + * this is usually filled by parsing arguments + * from the console. + */ +typedef struct { + const char *inputfile; + const char *renderer_name; + int highquality; + int untimed; + int zerocopy; + int gpugrain; +} Dav1dPlaySettings; + +#define WINDOW_WIDTH 910 +#define WINDOW_HEIGHT 512 + +#define DAV1D_EVENT_NEW_FRAME 1 +#define DAV1D_EVENT_DEC_QUIT 2 + +/** + * Renderer info + */ +typedef struct rdr_info +{ + // Renderer name + const char *name; + // Cookie passed to the renderer implementation callbacks + void *cookie; + // Callback to create the renderer + void* (*create_renderer)(); + // Callback to destroy the renderer + void (*destroy_renderer)(void *cookie); + // Callback to the render function that renders a prevously sent frame + void (*render)(void *cookie, const Dav1dPlaySettings *settings); + // Callback to the send frame function + int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings); + // Callback for alloc/release pictures (optional) + int (*alloc_pic)(Dav1dPicture *pic, void *cookie); + void (*release_pic)(Dav1dPicture *pic, void *cookie); + // Whether or not this renderer can apply on-GPU film grain synthesis + int supports_gpu_grain; +} Dav1dPlayRenderInfo; + +extern const Dav1dPlayRenderInfo rdr_placebo_vk; +extern const Dav1dPlayRenderInfo rdr_placebo_gl; +extern const Dav1dPlayRenderInfo rdr_sdl; + +// Available renderes ordered by priority +static const Dav1dPlayRenderInfo* const dp_renderers[] = { + &rdr_placebo_vk, + &rdr_placebo_gl, + &rdr_sdl, +}; + +static inline const Dav1dPlayRenderInfo *dp_get_renderer(const char *name) +{ + for (size_t i = 0; i < (sizeof(dp_renderers)/sizeof(*dp_renderers)); ++i) + { + if (dp_renderers[i]->name == NULL) + continue; + + if (name == NULL || strcmp(name, dp_renderers[i]->name) == 0) { + return dp_renderers[i]; + } + } + return NULL; +} + +static inline SDL_Window 
*dp_create_sdl_window(int window_flags) +{ + SDL_Window *win; + window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI; + + win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, + WINDOW_WIDTH, WINDOW_HEIGHT, window_flags); + SDL_SetWindowResizable(win, SDL_TRUE); + + return win; +} diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c new file mode 100644 index 00000000000..beb1d42ad72 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_placebo.c @@ -0,0 +1,723 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "dp_renderer.h" + +#ifdef HAVE_RENDERER_PLACEBO +#include <assert.h> + +#include <libplacebo/renderer.h> +#include <libplacebo/utils/upload.h> + +#ifdef HAVE_PLACEBO_VULKAN +# include <libplacebo/vulkan.h> +# include <SDL_vulkan.h> +#endif +#ifdef HAVE_PLACEBO_OPENGL +# include <libplacebo/opengl.h> +# include <SDL_opengl.h> +#endif + + +/** + * Renderer context for libplacebo + */ +typedef struct renderer_priv_ctx +{ + // SDL window + SDL_Window *win; + // Placebo context + struct pl_context *ctx; + // Placebo renderer + struct pl_renderer *renderer; +#ifdef HAVE_PLACEBO_VULKAN + // Placebo Vulkan handle + const struct pl_vulkan *vk; + // Placebo Vulkan instance + const struct pl_vk_inst *vk_inst; + // Vulkan surface + VkSurfaceKHR surf; +#endif +#ifdef HAVE_PLACEBO_OPENGL + // Placebo OpenGL handle + const struct pl_opengl *gl; +#endif + // Placebo GPU + const struct pl_gpu *gpu; + // Placebo swapchain + const struct pl_swapchain *swapchain; + // Lock protecting access to the texture + SDL_mutex *lock; + // Image to render, and planes backing them + struct pl_image image; + const struct pl_tex *plane_tex[3]; +} Dav1dPlayRendererPrivateContext; + +static Dav1dPlayRendererPrivateContext* + placebo_renderer_create_common(int window_flags) +{ + // Create Window + SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE); + if (sdlwin == NULL) + return NULL; + + // Alloc + Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); + if (rd_priv_ctx == NULL) { + return NULL; + } + + *rd_priv_ctx = (Dav1dPlayRendererPrivateContext) {0}; + rd_priv_ctx->win = sdlwin; + + // Init libplacebo + rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) { + .log_cb = pl_log_color, +#ifndef NDEBUG + .log_level = PL_LOG_DEBUG, +#else + .log_level = PL_LOG_WARN, +#endif + }); + if (rd_priv_ctx->ctx == NULL) { + free(rd_priv_ctx); + return NULL; + } + + // Create Mutex + rd_priv_ctx->lock = SDL_CreateMutex(); + if (rd_priv_ctx->lock == NULL) { + fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); + pl_context_destroy(&(rd_priv_ctx->ctx)); + free(rd_priv_ctx); + return NULL; + } + + return rd_priv_ctx; +} + +#ifdef HAVE_PLACEBO_OPENGL +static void *placebo_renderer_create_gl() +{ + SDL_Window *sdlwin = NULL; + SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG); + + // Common init + Dav1dPlayRendererPrivateContext *rd_priv_ctx = + placebo_renderer_create_common(SDL_WINDOW_OPENGL); + + if (rd_priv_ctx == NULL) + return NULL; + sdlwin = rd_priv_ctx->win; + + // Init OpenGL + struct pl_opengl_params params = pl_opengl_default_params; +# ifndef NDEBUG + params.debug = true; +# endif + + SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin); + SDL_GL_MakeCurrent(sdlwin, glcontext); + + rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->ctx, &params); + if (!rd_priv_ctx->gl) { + fprintf(stderr, "Failed creating opengl device!\n"); + exit(2); + } + + rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl, + &(struct pl_opengl_swapchain_params) { + .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow, + .priv = sdlwin, + }); + + if (!rd_priv_ctx->swapchain) { + fprintf(stderr, "Failed creating opengl swapchain!\n"); + exit(2); + } + + int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; + SDL_GL_GetDrawableSize(sdlwin, &w, &h); + + if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { + fprintf(stderr, "Failed resizing vulkan swapchain!\n"); + exit(2); + } + + rd_priv_ctx->gpu =
rd_priv_ctx->gl->gpu; + + if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) + printf("Note: window dimensions differ (got %dx%d)\n", w, h); + + return rd_priv_ctx; +} +#endif + +#ifdef HAVE_PLACEBO_VULKAN +static void *placebo_renderer_create_vk() +{ + SDL_Window *sdlwin = NULL; + + // Common init + Dav1dPlayRendererPrivateContext *rd_priv_ctx = + placebo_renderer_create_common(SDL_WINDOW_VULKAN); + + if (rd_priv_ctx == NULL) + return NULL; + sdlwin = rd_priv_ctx->win; + + // Init Vulkan + unsigned num = 0; + if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) { + fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError()); + exit(1); + } + + const char **extensions = malloc(num * sizeof(const char *)); + assert(extensions); + + SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions); + if (!ok) { + fprintf(stderr, "Failed getting Vk instance extensions\n"); + exit(1); + } + + if (num > 0) { + printf("Requesting %d additional Vulkan extensions:\n", num); + for (unsigned i = 0; i < num; i++) + printf(" %s\n", extensions[i]); + } + + struct pl_vk_inst_params iparams = pl_vk_inst_default_params; + iparams.extensions = extensions; + iparams.num_extensions = num; + + rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams); + if (!rd_priv_ctx->vk_inst) { + fprintf(stderr, "Failed creating Vulkan instance!\n"); + exit(1); + } + free(extensions); + + if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) { + fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError()); + exit(1); + } + + struct pl_vulkan_params params = pl_vulkan_default_params; + params.instance = rd_priv_ctx->vk_inst->instance; + params.surface = rd_priv_ctx->surf; + params.allow_software = true; + + rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params); + if (!rd_priv_ctx->vk) { + fprintf(stderr, "Failed creating vulkan device!\n"); + exit(2); + } + + // Create swapchain + rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk, + &(struct pl_vulkan_swapchain_params) { + .surface = rd_priv_ctx->surf, + .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR, + }); + + if (!rd_priv_ctx->swapchain) { + fprintf(stderr, "Failed creating vulkan swapchain!\n"); + exit(2); + } + + int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; + if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { + fprintf(stderr, "Failed resizing vulkan swapchain!\n"); + exit(2); + } + + rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu; + + if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) + printf("Note: window dimensions differ (got %dx%d)\n", w, h); + + return rd_priv_ctx; +} +#endif + +static void placebo_renderer_destroy(void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + pl_renderer_destroy(&(rd_priv_ctx->renderer)); + pl_swapchain_destroy(&(rd_priv_ctx->swapchain)); + for (int i = 0; i < 3; i++) + pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i])); + +#ifdef HAVE_PLACEBO_VULKAN + if (rd_priv_ctx->vk) { + pl_vulkan_destroy(&(rd_priv_ctx->vk)); + vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); + pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); + } +#endif +#ifdef HAVE_PLACEBO_OPENGL + if (rd_priv_ctx->gl) + pl_opengl_destroy(&(rd_priv_ctx->gl)); +#endif + + SDL_DestroyWindow(rd_priv_ctx->win); + + pl_context_destroy(&(rd_priv_ctx->ctx)); +} + +static void placebo_render(void *cookie, const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; +
assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + if (!rd_priv_ctx->image.num_planes) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + // Prepare rendering + if (rd_priv_ctx->renderer == NULL) { + rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->gpu); + } + + struct pl_swapchain_frame frame; + bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame); + if (!ok) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + struct pl_render_params render_params = {0}; + if (settings->highquality) + render_params = pl_render_default_params; + + struct pl_render_target target; + pl_render_target_from_swapchain(&target, &frame); + target.profile = (struct pl_icc_profile) { + .data = NULL, + .len = 0, + }; + +#if PL_API_VER >= 66 + pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0); + if (pl_render_target_partial(&target)) + pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 }); +#endif + + if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) { + fprintf(stderr, "Failed rendering frame!\n"); + pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 }); + } + + ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); + if (!ok) { + fprintf(stderr, "Failed submitting frame!\n"); + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + pl_swapchain_swap_buffers(rd_priv_ctx->swapchain); + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (dav1d_pic == NULL) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; + } + + int width = dav1d_pic->p.w; + int height = dav1d_pic->p.h; + int sub_x = 0, sub_y = 0; + int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up + enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN; + + struct pl_image *image = &rd_priv_ctx->image; + *image = (struct pl_image) { + .num_planes = 3, + .width = width, + .height = height, + .src_rect = {0, 0, width, height}, + + .repr = { + .bits = { + .sample_depth = bytes * 8, + .color_depth = dav1d_pic->p.bpc, + }, + }, + }; + + // Figure out the correct plane dimensions/count + switch (dav1d_pic->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + image->num_planes = 1; + break; + case DAV1D_PIXEL_LAYOUT_I420: + sub_x = sub_y = 1; + break; + case DAV1D_PIXEL_LAYOUT_I422: + sub_x = 1; + break; + case DAV1D_PIXEL_LAYOUT_I444: + break; + } + + // Set the right colorspace metadata etc. + switch (dav1d_pic->seq_hdr->pri) { + case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break; + case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break; + case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break; + case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; + case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; + case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break; + + case DAV1D_COLOR_PRI_XYZ: + // Handled below + assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY); + break; + + default: + printf("warning: unknown dav1d color primaries %d.. 
ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->pri); + break; + } + + switch (dav1d_pic->seq_hdr->trc) { + case DAV1D_TRC_BT709: + case DAV1D_TRC_BT470M: + case DAV1D_TRC_BT470BG: + case DAV1D_TRC_BT601: + case DAV1D_TRC_SMPTE240: + case DAV1D_TRC_BT2020_10BIT: + case DAV1D_TRC_BT2020_12BIT: + // These all map to the effective "SDR" CRT-based EOTF, BT.1886 + image->color.transfer = PL_COLOR_TRC_BT_1886; + break; + + case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break; + case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break; + case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break; + case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break; + case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break; + + default: + printf("warning: unknown dav1d color transfer %d.. ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->trc); + break; + } + + switch (dav1d_pic->seq_hdr->mtrx) { + case DAV1D_MC_IDENTITY: + // This is going to be either RGB or XYZ + if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) { + image->repr.sys = PL_COLOR_SYSTEM_XYZ; + } else { + image->repr.sys = PL_COLOR_SYSTEM_RGB; + } + break; + + case DAV1D_MC_UNKNOWN: + // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one + image->repr.sys = pl_color_system_guess_ycbcr(width, height); + break; + + case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break; + case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break; + case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break; + case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break; + case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break; + case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break; + + case DAV1D_MC_ICTCP: + // This one is split up based on the actual HDR curve in use + if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) { + image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + } else { + image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ; + } + break; + + default: + printf("warning: unknown dav1d color matrix %d.. 
ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx); + break; + } + + if (dav1d_pic->seq_hdr->color_range) { + image->repr.levels = PL_COLOR_LEVELS_PC; + } else { + image->repr.levels = PL_COLOR_LEVELS_TV; + } + + switch (dav1d_pic->seq_hdr->chr) { + case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break; + case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break; + case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break; + } + +#if PL_API_VER >= 63 + if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) { + Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data; + struct pl_av1_grain_data *dst = &image->av1_grain; + *dst = (struct pl_av1_grain_data) { + .grain_seed = src->seed, + .num_points_y = src->num_y_points, + .chroma_scaling_from_luma = src->chroma_scaling_from_luma, + .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] }, + .scaling_shift = src->scaling_shift, + .ar_coeff_lag = src->ar_coeff_lag, + .ar_coeff_shift = src->ar_coeff_shift, + .grain_scale_shift = src->grain_scale_shift, + .uv_mult = { src->uv_mult[0], src->uv_mult[1] }, + .uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] }, + .uv_offset = { src->uv_offset[0], src->uv_offset[1] }, + .overlap = src->overlap_flag, + }; + + assert(sizeof(dst->points_y) == sizeof(src->y_points)); + assert(sizeof(dst->points_uv) == sizeof(src->uv_points)); + assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y)); + memcpy(dst->points_y, src->y_points, sizeof(src->y_points)); + memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points)); + memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y)); + + // this one has different row sizes for alignment + for (int c = 0; c < 2; c++) { + for (int i = 0; i < 25; i++) + dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i]; + } + } +#endif + + // Upload the actual planes + struct pl_plane_data data[3] = { + { + // Y plane + .type = PL_FMT_UNORM, + .width = width, + .height = height, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[0], + .component_size = {bytes * 8}, + .component_map = {0}, + }, { + // U plane + .type = PL_FMT_UNORM, + .width = width >> sub_x, + .height = height >> sub_y, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[1], + .component_size = {bytes * 8}, + .component_map = {1}, + }, { + // V plane + .type = PL_FMT_UNORM, + .width = width >> sub_x, + .height = height >> sub_y, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[1], + .component_size = {bytes * 8}, + .component_map = {2}, + }, + }; + + bool ok = true; + + for (int i = 0; i < image->num_planes; i++) { + if (settings->zerocopy) { + const struct pl_buf *buf = dav1d_pic->allocator_data; + assert(buf); + data[i].buf = buf; + data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data; + } else { + data[i].pixels = dav1d_pic->data[i]; + } + + ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]); + } + + // Apply the correct chroma plane shift. 
This has to be done after pl_upload_plane +#if PL_API_VER >= 67 + pl_image_set_chroma_location(image, chroma_loc); +#else + pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y); + pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y); +#endif + + if (!ok) { + fprintf(stderr, "Failed uploading planes!\n"); + *image = (struct pl_image) {0}; + } + + SDL_UnlockMutex(rd_priv_ctx->lock); + return !ok; +} + +// Align to power of 2 +#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + SDL_LockMutex(rd_priv_ctx->lock); + + const struct pl_gpu *gpu = rd_priv_ctx->gpu; + int ret = DAV1D_ERR(ENOMEM); + + // Copied from dav1d_default_picture_alloc + const int hbd = p->p.bpc > 8; + const int aligned_w = ALIGN2(p->p.w, 128); + const int aligned_h = ALIGN2(p->p.h, 128); + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + p->stride[0] = aligned_w << hbd; + p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0; + + // Align strides up to multiples of the GPU performance hints + p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride); + p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride); + + // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2) + size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4); + const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align); + const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); + + // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, + // even in the case that the driver gives us insane alignments + const size_t pic_size = y_sz + 2 * uv_sz; + const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; + + // Validate size limitations + if (total_size > gpu->limits.max_xfer_size) { + printf("alloc of %zu bytes exceeds limits\n", total_size); + goto err; + } + + const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .type = PL_BUF_TEX_TRANSFER, + .host_mapped = true, + .size = total_size, + .memory_type = PL_BUF_MEM_HOST, + .user_data = p, + }); + + if (!buf) { + printf("alloc of GPU mapped buffer failed\n"); + goto err; + } + + assert(buf->data); + uintptr_t base = (uintptr_t) buf->data, data[3]; + data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); + data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); + data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); + + // Sanity check offset alignment for the sake of debugging + if (data[0] - base != ALIGN2(data[0] - base, off_align) || + data[1] - base != ALIGN2(data[1] - base, off_align) || + data[2] - base != ALIGN2(data[2] - base, off_align)) + { + printf("GPU buffer horribly misaligned, expect slowdown!\n"); + } + + p->allocator_data = (void *) buf; + p->data[0] = (void *) data[0]; + p->data[1] = (void *) data[1]; + p->data[2] = (void *) data[2]; + ret = 0; + + // fall through +err: + SDL_UnlockMutex(rd_priv_ctx->lock); + return ret; +} + +static void placebo_release_pic(Dav1dPicture *pic, void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + assert(pic->allocator_data); + + SDL_LockMutex(rd_priv_ctx->lock); + const struct pl_gpu *gpu = 
rd_priv_ctx->gpu; + pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +#ifdef HAVE_PLACEBO_VULKAN +const Dav1dPlayRenderInfo rdr_placebo_vk = { + .name = "placebo-vk", + .create_renderer = placebo_renderer_create_vk, + .destroy_renderer = placebo_renderer_destroy, + .render = placebo_render, + .update_frame = placebo_upload_image, + .alloc_pic = placebo_alloc_pic, + .release_pic = placebo_release_pic, + +# if PL_API_VER >= 63 + .supports_gpu_grain = 1, +# endif +}; +#else +const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; +#endif + +#ifdef HAVE_PLACEBO_OPENGL +const Dav1dPlayRenderInfo rdr_placebo_gl = { + .name = "placebo-gl", + .create_renderer = placebo_renderer_create_gl, + .destroy_renderer = placebo_renderer_destroy, + .render = placebo_render, + .update_frame = placebo_upload_image, + .alloc_pic = placebo_alloc_pic, + .release_pic = placebo_release_pic, + +# if PL_API_VER >= 63 + .supports_gpu_grain = 1, +# endif +}; +#else +const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL }; +#endif + +#else +const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; +const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL }; +#endif diff --git a/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c new file mode 100644 index 00000000000..078d6134921 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/examples/dp_renderer_sdl.c @@ -0,0 +1,164 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
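
placebo_alloc_pic() above carves all three picture planes out of a single host-mapped GPU buffer so that placebo_upload_image() can later take the zero-copy path (the Dav1dPicture's allocator_data keeps the pl_buf). The bookkeeping is plain power-of-two alignment arithmetic; a scalar sketch of just that layout step, with malloc standing in for pl_buf_create() and the alignment constants chosen for illustration only:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))

    int main(void)
    {
        // Example parameters; dav1dplay derives these from the picture and GPU limits.
        const int w = 1920, h = 1080, hbd = 0;   // 8 bpc
        const int ss_hor = 1, ss_ver = 1;        // 4:2:0
        const size_t stride_align = 256;         // stand-in for align_tex_xfer_stride
        const size_t pic_align = 64;             // stand-in for DAV1D_PICTURE_ALIGNMENT

        const int aligned_w = ALIGN2(w, 128), aligned_h = ALIGN2(h, 128);
        const size_t stride_y  = ALIGN2((size_t)aligned_w << hbd, stride_align);
        const size_t stride_uv = ALIGN2((size_t)(aligned_w >> ss_hor) << hbd, stride_align);
        // Offset alignment reduced to 4 here; the real code also honours
        // the GPU's align_tex_xfer_offset hint.
        const size_t y_sz  = ALIGN2(stride_y * aligned_h, 4);
        const size_t uv_sz = ALIGN2(stride_uv * (aligned_h >> ss_ver), 4);

        // One backing allocation, three aligned plane pointers (Y, U, V).
        uint8_t *base = malloc(y_sz + 2 * uv_sz + 4 * pic_align);
        if (!base) return 1;
        const uintptr_t p0 = ALIGN2((uintptr_t)base, pic_align);
        const uintptr_t p1 = ALIGN2(p0 + y_sz, pic_align);
        const uintptr_t p2 = ALIGN2(p1 + uv_sz, pic_align);
        printf("Y at +%zu, U at +%zu, V at +%zu\n",
               (size_t)(p0 - (uintptr_t)base),
               (size_t)(p1 - (uintptr_t)base),
               (size_t)(p2 - (uintptr_t)base));
        free(base);
        return 0;
    }
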
+ */ + +#include "dp_renderer.h" + +#include <assert.h> + +/** + * Renderer context for SDL + */ +typedef struct renderer_priv_ctx +{ + // SDL window + SDL_Window *win; + // SDL renderer + SDL_Renderer *renderer; + // Lock protecting access to the texture + SDL_mutex *lock; + // Texture to render + SDL_Texture *tex; +} Dav1dPlayRendererPrivateContext; + +static void *sdl_renderer_create() +{ + SDL_Window *win = dp_create_sdl_window(0); + if (win == NULL) + return NULL; + + // Alloc + Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); + if (rd_priv_ctx == NULL) { + return NULL; + } + rd_priv_ctx->win = win; + + // Create renderer + rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED); + // Set scale quality + SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear"); + + // Create Mutex + rd_priv_ctx->lock = SDL_CreateMutex(); + if (rd_priv_ctx->lock == NULL) { + fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); + free(rd_priv_ctx); + return NULL; + } + + rd_priv_ctx->tex = NULL; + + return rd_priv_ctx; +} + +static void sdl_renderer_destroy(void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_DestroyRenderer(rd_priv_ctx->renderer); + SDL_DestroyMutex(rd_priv_ctx->lock); + free(rd_priv_ctx); +} + +static void sdl_render(void *cookie, const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (rd_priv_ctx->tex == NULL) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + // Display the frame + SDL_RenderClear(rd_priv_ctx->renderer); + SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL); + SDL_RenderPresent(rd_priv_ctx->renderer); + + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (dav1d_pic == NULL) { + rd_priv_ctx->tex = NULL; + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; + } + + int width = dav1d_pic->p.w; + int height = dav1d_pic->p.h; + int tex_w = width; + int tex_h = height; + + enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; + + if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { + fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); + exit(50); + } + + SDL_Texture *texture = rd_priv_ctx->tex; + if (texture != NULL) { + SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h); + if (tex_w != width || tex_h != height) { + SDL_DestroyTexture(texture); + texture = NULL; + } + } + + if (texture == NULL) { + texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, + SDL_TEXTUREACCESS_STREAMING, width, height); + } + + SDL_UpdateYUVTexture(texture, NULL, + dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y + dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U + dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V + ); + + rd_priv_ctx->tex = texture; + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; +} + +const Dav1dPlayRenderInfo rdr_sdl = { + .name = "sdl", + .create_renderer = sdl_renderer_create, + .destroy_renderer = sdl_renderer_destroy, + .render = sdl_render, + .update_frame = sdl_update_texture +}; diff --git a/chromium/third_party/dav1d/libdav1d/examples/meson.build 
b/chromium/third_party/dav1d/libdav1d/examples/meson.build index bad1d902ed3..50e097a8df6 100644 --- a/chromium/third_party/dav1d/libdav1d/examples/meson.build +++ b/chromium/third_party/dav1d/libdav1d/examples/meson.build @@ -35,28 +35,40 @@ endif # dav1d player sources dav1dplay_sources = files( 'dav1dplay.c', + 'dp_fifo.c', + 'dp_renderer_placebo.c', + 'dp_renderer_sdl.c', ) sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true) if sdl2_dependency.found() + dav1dplay_deps = [sdl2_dependency] + dav1dplay_cflags = [] + placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false) - vulkan_dependency = dependency('vulkan', required: false) - sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency]) - cflag_placebo = [] - deps_placebo = [] - if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan - cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1' - deps_placebo = [vulkan_dependency, placebo_dependency] + + if placebo_dependency.found() + dav1dplay_deps += placebo_dependency + dav1dplay_cflags += '-DHAVE_PLACEBO' + + # If libplacebo is found, we might be able to use Vulkan + # with it, in which case we need the Vulkan library too. + vulkan_dependency = dependency('vulkan', required: false) + if vulkan_dependency.found() + dav1dplay_deps += vulkan_dependency + dav1dplay_cflags += '-DHAVE_VULKAN' + endif endif + dav1dplay = executable('dav1dplay', dav1dplay_sources, rev_target, link_with : [libdav1d, dav1d_input_objs], include_directories : [dav1d_inc_dirs], - dependencies : [getopt_dependency, sdl2_dependency, deps_placebo], + dependencies : [getopt_dependency, dav1dplay_deps], install : true, - c_args : cflag_placebo, + c_args : dav1dplay_cflags, ) endif diff --git a/chromium/third_party/dav1d/libdav1d/gcovr.cfg b/chromium/third_party/dav1d/libdav1d/gcovr.cfg new file mode 100644 index 00000000000..f768de8a656 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/gcovr.cfg @@ -0,0 +1,3 @@ +exclude = .*/tests/.* +exclude = .*/tools/.* +exclude = .*/include/common/dump.h diff --git a/chromium/third_party/dav1d/libdav1d/meson.build b/chromium/third_party/dav1d/libdav1d/meson.build index b575601e556..d5366f9a7c4 100644 --- a/chromium/third_party/dav1d/libdav1d/meson.build +++ b/chromium/third_party/dav1d/libdav1d/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2018-2019, VideoLAN and dav1d authors +# Copyright © 2018-2020, VideoLAN and dav1d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
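
dp_renderer_sdl.c above keeps one streaming SDL_PIXELFORMAT_IYUV texture, recreates it only when the picture dimensions change, and pushes the three 8-bit planes with SDL_UpdateYUVTexture(). A condensed sketch of that reuse-or-recreate pattern (SDL2 only; upload_iyuv is an illustrative name):

    #include <stdint.h>
    #include <SDL.h>

    // Reuse `tex` if it already matches w x h, otherwise (re)create it, then
    // upload the three 8-bit planes. Returns the texture to keep for next time.
    static SDL_Texture *upload_iyuv(SDL_Renderer *r, SDL_Texture *tex, int w, int h,
                                    const uint8_t *y, int y_stride,
                                    const uint8_t *u, const uint8_t *v, int uv_stride)
    {
        int tw = 0, th = 0;
        if (tex && (SDL_QueryTexture(tex, NULL, NULL, &tw, &th) != 0 ||
                    tw != w || th != h)) {
            SDL_DestroyTexture(tex);
            tex = NULL;
        }
        if (!tex)
            tex = SDL_CreateTexture(r, SDL_PIXELFORMAT_IYUV,
                                    SDL_TEXTUREACCESS_STREAMING, w, h);
        if (tex)
            SDL_UpdateYUVTexture(tex, NULL, y, y_stride, u, uv_stride, v, uv_stride);
        return tex;
    }
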
project('dav1d', ['c'], - version: '0.6.0', + version: '0.7.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.47.0') -dav1d_soname_version = '4.0.0' +dav1d_soname_version = '4.0.2' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -196,10 +196,10 @@ else getopt_dependency = [] endif -if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args) - cdata.set('HAVE_POSIX_MEMALIGN', 1) -elif cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args) +if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args) cdata.set('HAVE_ALIGNED_MALLOC', 1) +elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args) + cdata.set('HAVE_POSIX_MEMALIGN', 1) elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args) cdata.set('HAVE_MEMALIGN', 1) endif @@ -415,7 +415,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') depfile: '@BASENAME@.obj.ndep', arguments: [ '-f', nasm_format, - '-I', '@0@/src/'.format(meson.current_source_dir()), + '-I', '@0@/src/'.format(dav1d_src_root), '-I', '@0@/'.format(meson.current_build_dir()), '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@', '@EXTRA_ARGS@', diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S new file mode 100644 index 00000000000..867eb194df9 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx.S @@ -0,0 +1,3386 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
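
The meson.build hunk above flips the detection order so that _aligned_malloc is preferred over posix_memalign when a toolchain exposes both (some Windows toolchains do, and _aligned_malloc allocations must be released with _aligned_free). A hedged sketch of the kind of wrapper these HAVE_* defines typically select between, not dav1d's actual implementation:

    #include <stdlib.h>
    #if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
    #include <malloc.h>
    #endif

    // Allocate `size` bytes aligned to `align` (a power of two), using whichever
    // primitive the build system detected.
    static void *alloc_aligned(size_t size, size_t align)
    {
    #if defined(HAVE_ALIGNED_MALLOC)
        return _aligned_malloc(size, align);            // release with _aligned_free()
    #elif defined(HAVE_POSIX_MEMALIGN)
        void *ptr;
        return posix_memalign(&ptr, align, size) ? NULL : ptr;  // release with free()
    #elif defined(HAVE_MEMALIGN)
        return memalign(align, size);                   // release with free()
    #else
        return NULL;                                    // no aligned allocator detected
    #endif
    }
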
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro vmull_vmlal d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlal.s16 \d1, \s3, \c1 +.endm + +.macro vmull_vmlsl d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlsl.s16 \d1, \s3, \c1 +.endm + +.macro vrshrn_8h d0, d1, s0, s1, shift + vrshrn.i32 \d0, \s0, \shift + vrshrn.i32 \d1, \s1, \shift +.endm + +.macro scale_input c, 
r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s16 \r0, \r0, \c + vqrdmulh.s16 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s16 \r2, \r2, \c + vqrdmulh.s16 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s16 \r4, \r4, \c + vqrdmulh.s16 \r5, \r5, \c + vqrdmulh.s16 \r6, \r6, \c + vqrdmulh.s16 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + vld1.8 {\load}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.8 {\store}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src, \shiftbits + load_add_store d3, q9, , , , , , \dst, \src, \shiftbits + load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits + load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits + load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits + load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits + load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits + load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits + load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits + load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits + load_add_store , , , , , , d3, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src + load_add_store d3, q9, , , , , , \dst, \src + load_add_store d4, q10, d2, q8, , , , \dst, \src + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src + load_add_store , , d4, q10, q9, d3, d2, \dst, \src + load_add_store , , d5, q11, q10, d4, d3, \dst, \src + load_add_store , , , , q11, d5, d4, \dst, \src + load_add_store , , , , , , d5, \dst, \src +.endm +.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + vld1.32 {\load[0]}, [\src, :32], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #4 +.endif +.ifnb \load + vld1.32 {\load[1]}, [\src, :32], r1 +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \store + vst1.32 {\store[0]}, [\dst, :32], r1 +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.32 {\store[1]}, [\dst, :32], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src + load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src + load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src + load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src + load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src + load_add_store4 , , , , q15, d7, d6, \dst, \src + load_add_store4 , , , , , , d7, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src + 
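
The load_add_store* macros above implement the reconstruction step shared by every block size: load a row of 8-bit destination pixels, add the rounded, down-shifted 16-bit residual (vrshr.s16 + vaddw.u8), and saturate back to 8 bits (vqmovun.s16) before storing. Per pixel, in scalar C (shift of 4 being the default used by load_add_store_8x8):

    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    // One row of the load/add/store step: rounding shift of the residual,
    // add to the existing pixel, clamp to [0, 255].
    static void add_residual_row(uint8_t *dst, const int16_t *res, int w, int shift)
    {
        for (int x = 0; x < w; x++) {
            const int r = (res[x] + (1 << (shift - 1))) >> shift;
            dst[x] = clip_u8(dst[x] + r);
        }
    }
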
load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src + load_add_store4 , , , , q11, d3, d2, \dst, \src + load_add_store4 , , , , , , d3, \dst, \src +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d0, r12 + vqrdmulh.s16 d16, d16, d0[0] + vst1.16 {d30[0]}, [r2, :16] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s16 d16, d16, d0[0] +.endif +.if \shift > 0 + vrshr.s16 d16, d16, #\shift +.endif + vqrdmulh.s16 d20, d16, d0[0] + mov r3, #\h + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + vld1.32 {d0[0]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[0]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + subs r3, r3, #4 + sub r0, r0, r1, lsl #2 + vaddw.u8 q10, q8, d0 + vqmovun.s16 d0, q10 + vaddw.u8 q11, q8, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q11 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon +1: + vld1.8 {d0}, [r0, :64], r1 + vld1.8 {d1}, [r0, :64], r1 + vld1.8 {d2}, [r0, :64], r1 + vaddw.u8 q10, q8, d0 + vld1.8 {d3}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + subs r3, r3, #4 + vaddw.u8 q11, q8, d1 + vqmovun.s16 d0, q10 + vaddw.u8 q12, q8, d2 + vqmovun.s16 d1, q11 + vaddw.u8 q13, q8, d3 + vst1.8 {d0}, [r0, :64], r1 + vqmovun.s16 d2, q12 + vst1.8 {d1}, [r0, :64], r1 + vqmovun.s16 d3, q13 + vst1.8 {d2}, [r0, :64], r1 + vst1.8 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon +1: + vld1.8 {q0}, [r0, :128], r1 + vld1.8 {q1}, [r0, :128], r1 + vld1.8 {q2}, [r0, :128], r1 + subs r3, r3, #4 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vld1.8 {q3}, [r0, :128], r1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #2 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q1}, [r0, :128], r1 + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon +1: + vld1.8 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.8 {q2, q3}, [r0, :128], r1 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #1 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #32 +1: + vld1.8 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.8 {q2, q3}, [r0, :128] + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, #32 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128]! 
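
idct_dc and the idct_dc_w*_neon helpers above form the eob == 0 fast path: when only the DC coefficient is coded, every output sample of the 2-D transform is the same, so it is computed once and added across the whole block. Following the constants in the macro (2896/4096 ~ 1/sqrt(2), with vqrdmulh written out as ordinary fixed-point and its saturation ignored), the value works out roughly as:

    #include <stddef.h>
    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    // Scalar rendition of the dc-only path. `shift` is the per-size
    // intermediate shift passed to the idct_dc macro (0, 1 or 2).
    static void dc_only_add(uint8_t *dst, ptrdiff_t stride, int w, int h,
                            int dc, int shift)
    {
        dc = (dc * 2896 + 2048) >> 12;               // scale by ~1/sqrt(2)
        if (w == 2 * h || h == 2 * w)
            dc = (dc * 2896 + 2048) >> 12;           // rectangular blocks scale twice
        if (shift > 0)
            dc = (dc + (1 << (shift - 1))) >> shift; // intermediate rounding shift
        dc = (dc * 2896 + 2048) >> 12;               // second-pass scale
        dc = (dc + 8) >> 4;                          // final rounding shift (#4)

        for (int y = 0; y < h; y++, dst += stride)
            for (int x = 0; x < w; x++)
                dst[x] = clip_u8(dst[x] + dc);
    }
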
+ vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i16 d16, d16, d17 + vsub.i16 d21, d18, d19 + vsub.i16 d20, d16, d21 + vshr.s16 d20, d20, #1 + vsub.i16 d18, d20, d17 + vsub.i16 d17, d20, d19 + vadd.i16 d19, d21, d18 + vsub.i16 d16, d16, d17 +.endm + +.macro idct_4h_x4 r0, r1, r2, r3 + vmull_vmlal q3, \r1, \r3, d0[3], d0[2] + vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] + vmull_vmlal q1, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d6, q3, #12 + vrshrn.i32 d7, q2, #12 + vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d2, q1, #12 + vrshrn.i32 d3, q2, #12 + vqadd.s16 \r0, d2, d6 + vqsub.s16 \r3, d2, d6 + vqadd.s16 \r1, d3, d7 + vqsub.s16 \r2, d3, d7 +.endm + +.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] + vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] + vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d12, d13, q6, q7, #12 + vrshrn_8h d14, d15, q4, q5, #12 + vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d4, d5, q2, q3, #12 + vrshrn_8h d6, d7, q4, q5, #12 + vqadd.s16 \q0, q2, q6 + vqsub.s16 \q3, q2, q6 + vqadd.s16 \q1, q3, q7 + vqsub.s16 \q2, q3, q7 +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_4h_x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q1, d16, d18 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmull.s16 q10, d17, d0[3] + vaddw.s16 q1, q1, d19 + vmull.s16 q3, d16, d0[2] + vmlsl.s16 q3, d18, d0[0] + vmlsl.s16 q3, d19, d0[1] + + vadd.s32 q11, q2, q3 + vmul.s32 q1, q1, d1[0] + vadd.s32 q2, q2, q10 + vadd.s32 q3, q3, q10 + vsub.s32 q11, q11, q10 + + vrshrn.i32 \o0, q2, #12 + vrshrn.i32 \o2, q1, #12 + vrshrn.i32 \o1, q3, #12 + vrshrn.i32 \o3, q11, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 d19, d18, d17, d16 + bx lr +endfunc + +.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q2, d16, d20 + vsubl.s16 q3, d17, d21 + vmull.s16 q4, d16, d0[0] + vmlal.s16 q4, d20, d0[1] + vmlal.s16 q4, d22, d0[2] + vmull.s16 q5, d17, d0[0] + vmlal.s16 q5, d21, d0[1] + vmlal.s16 q5, d23, d0[2] + vaddw.s16 q2, q2, d22 + vaddw.s16 q3, q3, d23 + vmull.s16 q6, d16, d0[2] + vmlsl.s16 q6, d20, d0[0] + vmlsl.s16 q6, d22, d0[1] + vmull.s16 q7, d17, d0[2] + vmlsl.s16 q7, d21, d0[0] + vmlsl.s16 q7, d23, d0[1] + + vmul.s32 q10, q2, d1[0] + vmul.s32 q11, q3, d1[0] + + vmull.s16 q2, d18, d0[3] + vmull.s16 q3, d19, d0[3] + + vadd.s32 q8, q4, q2 // out0 + vadd.s32 q9, q5, q3 + + vadd.s32 q4, q4, q6 // out3 + vadd.s32 q5, q5, q7 + + vadd.s32 q6, q6, q2 // out1 + vadd.s32 q7, q7, q3 + + vsub.s32 q4, q4, q2 // out3 + vsub.s32 q5, q5, q3 + + vrshrn.i32 d20, q10, #12 + vrshrn.i32 d21, q11, #12 + + vrshrn.i32 \o0, q8, #12 + vrshrn.i32 \o1, q9, #12 + +.ifc \o4, d18 + vmov q9, q10 +.endif + + vrshrn.i32 \o2, q6, #12 + vrshrn.i32 \o3, q7, #12 + + vrshrn.i32 \o6, q4, #12 + vrshrn.i32 \o7, q5, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 + iadst_8x4 d16, 
d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_identity_4h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q2, q8, d0[0] + vqrdmulh.s16 q3, q9, d0[0] + vqadd.s16 q8, q8, q2 + vqadd.s16 q9, q9, q3 + bx lr +endfunc + +function inv_identity_8h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q1, q8, d0[0] + vqrdmulh.s16 q2, q9, d0[0] + vqrdmulh.s16 q3, q10, d0[0] + vqadd.s16 q8, q8, q1 + vqrdmulh.s16 q1, q11, d0[0] + vqadd.s16 q9, q9, q2 + vqadd.s16 q10, q10, q3 + vqadd.s16 q11, q11, q1 + bx lr +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0, \r1, \r2, \r3 + vqrdmulh.s16 q1, \i, \c + vrhadd.s16 \i, \i, q1 +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + vshr.s16 q8, q8, #2 + vshr.s16 q9, q9, #2 + + iwht4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + iwht4 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + blx r4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + +L(itx_4x4_end): + sub r0, r0, r1, lsl #2 + vaddw.u8 q8, q8, d0 + vqmovun.s16 d0, q8 + vaddw.u8 q9, q9, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q9 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d4, r12 + vst1.16 {d30[0]}, [r2, :16] + vqrdmulh.s16 d16, d16, d4[0] + vld1.32 {d0[0]}, [r0, :32], r1 + vqrdmulh.s16 d20, d16, d4[0] + vld1.32 {d0[1]}, [r0, :32], r1 + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + vld1.32 {d1[0]}, [r0, :32], r1 + vmov q9, q8 + vld1.32 {d1[1]}, [r0, :32], r1 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4h_x4_neon + movrel_local r5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 + + vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a + vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a + vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a + 
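
idct_4h_x4 / idct_8h_x4 above are the 4-point DCT kernel every larger size builds on; the constants come from idct_coeffs (2896 ~ 4096*sqrt(2)/2, 1567 ~ 4096*cos(3*pi/8), 3784 ~ 4096*sin(3*pi/8)). Ignoring the 16-bit saturation of the vqadd/vqsub steps, the butterfly reads in scalar C roughly as:

    #include <stdint.h>

    // One 4-point inverse DCT, mirroring idct_4h_x4: vmull/vmlal with a
    // rounded narrowing shift by 12, then the add/sub butterfly.
    static void idct4_1d(int16_t c[4])
    {
        const int in0 = c[0], in1 = c[1], in2 = c[2], in3 = c[3];
        const int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
        const int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
        const int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
        const int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
        c[0] = (int16_t)(t0 + t3);
        c[1] = (int16_t)(t1 + t2);
        c[2] = (int16_t)(t1 - t2);
        c[3] = (int16_t)(t0 - t3);
    }
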
vrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a + vrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vrshrn_8h \r10, \r11, q2, q3, #12 // taa + + vqadd.s16 q2, \q1, \q3 // t4 + vqsub.s16 \q1, \q1, \q3 // t5a + vqadd.s16 q3, \q7, \q5 // t7 + vqsub.s16 \q3, \q7, \q5 // t6a + + vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 + vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 + vrshrn_8h d8, d9, q4, q5, #12 // t5 + vrshrn_8h d10, d11, q6, q7, #12 // t6 + + vqsub.s16 \q7, \q0, q3 // out7 + vqadd.s16 \q0, \q0, q3 // out0 + vqadd.s16 \q1, \q2, q5 // out1 + vqsub.s16 q6, \q2, q5 // out6 + vqadd.s16 \q2, \q4, q4 // out2 + vqsub.s16 \q5, \q4, q4 // out5 + vqadd.s16 \q3, \q6, q2 // out3 + vqsub.s16 \q4, \q6, q2 // out4 + vmov \q6, q6 // out6 +.endm + +.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4h_x4 \r0, \r2, \r4, \r6 + + vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a + vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a + vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a + vrshrn.i32 \r1, q1, #12 // t4a + vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a + vrshrn.i32 \r7, q2, #12 // t7a + vrshrn.i32 \r3, q3, #12 // t5a + vrshrn.i32 \r5, q1, #12 // taa + + vqadd.s16 d2, \r1, \r3 // t4 + vqsub.s16 \r1, \r1, \r3 // t5a + vqadd.s16 d3, \r7, \r5 // t7 + vqsub.s16 \r3, \r7, \r5 // t6a + + vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 + vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 + vrshrn.i32 d4, q2, #12 // t5 + vrshrn.i32 d5, q3, #12 // t6 + + vqsub.s16 \r7, \r0, d3 // out7 + vqadd.s16 \r0, \r0, d3 // out0 + vqadd.s16 \r1, \r2, d5 // out1 + vqsub.s16 d6, \r2, d5 // out6 + vqadd.s16 \r2, \r4, d4 // out2 + vqsub.s16 \r5, \r4, d4 // out5 + vqadd.s16 \r3, \r6, d2 // out3 + vqsub.s16 \r4, \r6, d2 // out4 + vmov \r6, d6 // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] + vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] + vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] + vrshrn_8h d16, d17, q2, q3, #12 // t0a + vrshrn_8h d30, d31, q4, q5, #12 // t1a + vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] + vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] + vrshrn_8h d20, d21, q6, q7, #12 // t2a + vrshrn_8h d26, d27, q2, q3, #12 // t3a + vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] + vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] + vrshrn_8h d24, d25, q4, q5, #12 // t4a + vrshrn_8h d22, d23, q6, q7, #12 // t5a + vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] + vrshrn_8h d28, d29, q2, q3, #12 // t6a + vrshrn_8h d18, d19, q4, q5, #12 // t7a + + vqadd.s16 q2, q8, q12 // t0 + vqsub.s16 q3, q8, q12 // t4 + vqadd.s16 q4, q15, q11 // t1 + vqsub.s16 q5, q15, q11 // t5 + vqadd.s16 q6, q10, q14 // t2 + vqsub.s16 q7, q10, q14 // t6 + vqadd.s16 q10, q13, q9 // t3 + vqsub.s16 q11, q13, q9 // t7 + + 
vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] + vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] + + vrshrn_8h d6, d7, q8, q9, #12 // t4a + vrshrn_8h d10, d11, q12, q13, #12 // t5a + + vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] + + vrshrn_8h d14, d15, q14, q15, #12 // t6a + vrshrn_8h d22, d23, q8, q9, #12 // t7a + + vqadd.s16 \q0, q2, q6 // out0 + vqsub.s16 q2, q2, q6 // t2 + vqadd.s16 \q7, q4, q10 // out7 + vqsub.s16 q4, q4, q10 // t3 + vqneg.s16 \q7, \q7 // out7 + + vqadd.s16 \q1, q3, q7 // out1 + vqsub.s16 q3, q3, q7 // t6 + vqadd.s16 \q6, q5, q11 // out6 + vqsub.s16 q5, q5, q11 // t7 + vqneg.s16 \q1, \q1 // out1 + + vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) + vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) + vrshrn_8h d4, d5, q10, q11, #12 // out3 + vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) + vrshrn_8h d6, d7, q12, q13, #12 // out5 + vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + + vqneg.s16 \q3, q2 // out3 + vqneg.s16 \q5, q3 // out5 +.endm + +.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal q2, d23, d16, d0[0], d0[1] + vmull_vmlsl q3, d23, d16, d0[1], d0[0] + vmull_vmlal q4, d21, d18, d0[2], d0[3] + vrshrn.i32 d16, q2, #12 // t0a + vrshrn.i32 d23, q3, #12 // t1a + vmull_vmlsl q5, d21, d18, d0[3], d0[2] + vmull_vmlal q6, d19, d20, d1[0], d1[1] + vrshrn.i32 d18, q4, #12 // t2a + vrshrn.i32 d21, q5, #12 // t3a + vmull_vmlsl q7, d19, d20, d1[1], d1[0] + vmull_vmlal q2, d17, d22, d1[2], d1[3] + vrshrn.i32 d20, q6, #12 // t4a + vrshrn.i32 d19, q7, #12 // t5a + vmull_vmlsl q3, d17, d22, d1[3], d1[2] + vrshrn.i32 d22, q2, #12 // t6a + vrshrn.i32 d17, q3, #12 // t7a + + vqadd.s16 d4, d16, d20 // t0 + vqsub.s16 d5, d16, d20 // t4 + vqadd.s16 d6, d23, d19 // t1 + vqsub.s16 d7, d23, d19 // t5 + vqadd.s16 d8, d18, d22 // t2 + vqsub.s16 d9, d18, d22 // t6 + vqadd.s16 d18, d21, d17 // t3 + vqsub.s16 d19, d21, d17 // t7 + + vmull_vmlal q8, d5, d7, d2[3], d2[2] + vmull_vmlsl q10, d5, d7, d2[2], d2[3] + vmull_vmlsl q11, d19, d9, d2[3], d2[2] + + vrshrn.i32 d5, q8, #12 // t4a + vrshrn.i32 d7, q10, #12 // t5a + + vmull_vmlal q8, d19, d9, d2[2], d2[3] + + vrshrn.i32 d9, q11, #12 // t6a + vrshrn.i32 d19, q8, #12 // t7a + + vqadd.s16 \r0, d4, d8 // out0 + vqsub.s16 d4, d4, d8 // t2 + vqadd.s16 \r7, d6, d18 // out7 + vqsub.s16 d6, d6, d18 // t3 + vqneg.s16 \r7, \r7 // out7 + + vqadd.s16 \r1, d5, d9 // out1 + vqsub.s16 d5, d5, d9 // t6 + vqadd.s16 \r6, d7, d19 // out6 + vqsub.s16 d7, d7, d19 // t7 + vqneg.s16 \r1, \r1 // out1 + + vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) + vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) + vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) + vrshrn.i32 d4, q9, #12 // out3 + vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) + vrshrn.i32 d5, q10, #12 // out5 + vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21) + vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19) + + vqneg.s16 \r3, d4 // out3 + vqneg.s16 \r5, d5 // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + 
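
Note that flipadst is not a separate transform here: the iadst_* macros take their output registers as arguments, so inv_flipadst_*_neon is generated by instantiating the same body with the destination list reversed. In scalar terms the relationship is simply the following (a hypothetical helper for illustration only):

    #include <stdint.h>

    // flipadst(N) == adst(N) with the outputs written back in reverse order,
    // which the assembly gets for free by reversing the register list passed
    // to the iadst_* macro.
    static void inv_flipadst_1d(int16_t *c, int n,
                                void (*inv_adst_1d)(int16_t *, int))
    {
        inv_adst_1d(c, n);
        for (int i = 0, j = n - 1; i < j; i++, j--) {
            const int16_t tmp = c[i];
            c[i] = c[j];
            c[j] = tmp;
        }
    }
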
+function inv_flipadst_8h_x8_neon, export=1 + iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_8h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + vqshl.s16 q12, q12, #1 + vqshl.s16 q13, q13, #1 + vqshl.s16 q14, q14, #1 + vqshl.s16 q15, q15, #1 + bx lr +endfunc + +function inv_identity_4h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + bx lr +endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128] + +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blx r4 + + vrshr.s16 q8, q8, #1 + vrshr.s16 q9, q9, #1 + vrshr.s16 q10, q10, #1 + vrshr.s16 q11, q11, #1 + vrshr.s16 q12, q12, #1 + vrshr.s16 q13, q13, #1 + vrshr.s16 q14, q14, #1 + vrshr.s16 q15, q15, #1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + movrel_local r4, inv_\txfm1\()_8h_x8_neon + b inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +function inv_txfm_add_4x8_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! 
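
The identity transforms above are pure rescales: the 8-point variant is an exact x2 (vqshl #1, which is why def_fn_8x8_base can note that the shift-left and the later srshr #1 cancel), while the 4-point variant multiplies by sqrt(2) ~ 5793/4096. Since vqrdmulh can only apply factors below one, the integer part is split off, hence the (5793-4096)*8 constant followed by an add. Per coefficient, ignoring saturation:

    #include <stdint.h>

    // Scalar sketch of the identity scale factors used above.
    static int16_t identity4_scale(int16_t x)  // ~ x * 5793/4096 ~= x * sqrt(2)
    {
        return (int16_t)(x + ((x * (5793 - 4096) + 2048) >> 12));
    }

    static int16_t identity8_scale(int16_t x)  // exact x * 2
    {
        return (int16_t)(x * 2);
    }
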
+ vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x8h q8, q9, q10, q11 + vswp d17, d20 + vswp d19, d21 + vswp d17, d18 + vswp d19, d22 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0, q1}, [r12, :128] + + vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a + vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a + vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a + vrshrn.i32 d17, q2, #12 // t8a + vrshrn.i32 d31, q3, #12 // t15a + vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a + vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a + vrshrn.i32 d23, q4, #12 // t9a + vrshrn.i32 d25, q2, #12 // t14a + vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a + vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a + vrshrn.i32 d21, q3, #12 // t10a + vrshrn.i32 d27, q4, #12 // t13a + vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a + vrshrn.i32 d19, q2, #12 // t11a + vrshrn.i32 d29, q3, #12 // t12a + + idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vqsub.s16 d4, d17, d23 // t9 + vqadd.s16 d17, d17, d23 // t8 + vqsub.s16 d5, d31, d25 // t14 + vqadd.s16 d31, d31, d25 // t15 + vqsub.s16 d23, d19, d21 // t10 + vqadd.s16 d19, d19, d21 // t11 + vqadd.s16 d25, d29, d27 // t12 + vqsub.s16 d29, d29, d27 // t13 + + vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a + vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a + vrshrn.i32 d21, q3, #12 // t9a + vrshrn.i32 d27, q4, #12 // t14a + + vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a + vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a + vrshrn.i32 d29, q3, #12 // t13a + vneg.s32 q4, q4 + vrshrn.i32 d23, q4, #12 // t10a + + vqsub.s16 d4, d17, d19 // t11a + vqadd.s16 d17, d17, d19 // t8a + vqsub.s16 d5, d31, d25 // t12a + vqadd.s16 d31, d31, d25 // t15a + vqadd.s16 d19, d21, d23 // t9 + vqsub.s16 d21, d21, d23 // t10 + vqsub.s16 d25, d27, d29 // t13 + vqadd.s16 d27, d27, d29 // t14 + + vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 + vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 + vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a + + vrshrn.i32 d6, q3, #12 // t11 + vrshrn.i32 d7, q4, #12 // t12 + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a + vrshrn.i32 d4, q2, #12 // t10a + vrshrn.i32 d5, q4, #12 // t13a + + vqadd.s16 d8, d16, d31 // out0 + vqsub.s16 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s16 d23, d30, d17 // out7 + vqsub.s16 d9, d30, d17 // out8 + vqadd.s16 d17, d18, d27 // out1 + vqsub.s16 d30, d18, d27 
// out14 + vqadd.s16 d18, d20, d5 // out2 + vqsub.s16 d29, d20, d5 // out13 + vqadd.s16 d5, d28, d19 // out6 + vqsub.s16 d25, d28, d19 // out9 + vqadd.s16 d19, d22, d7 // out3 + vqsub.s16 d28, d22, d7 // out12 + vqadd.s16 d20, d24, d6 // out4 + vqsub.s16 d27, d24, d6 // out11 + vqadd.s16 d21, d26, d4 // out5 + vqsub.s16 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.16 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + + vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 + vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 + vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 + vrshrn.i32 d16, q2, #12 // t0 + vrshrn.i32 d31, q3, #12 // t1 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 + vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 + vrshrn.i32 d18, q4, #12 // t2 + vrshrn.i32 d29, q2, #12 // t3 + vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 + vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 + vrshrn.i32 d20, q3, #12 // t4 + vrshrn.i32 d27, q4, #12 // t5 + vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 + vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 + vrshrn.i32 d22, q2, #12 // t6 + vrshrn.i32 d25, q3, #12 // t7 + vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 + vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 + vrshrn.i32 d23, q4, #12 // t8 + vrshrn.i32 d24, q2, #12 // t9 + vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 + vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 + vrshrn.i32 d21, q3, #12 // t10 + vrshrn.i32 d26, q4, #12 // t11 + vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 + vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 + vrshrn.i32 d19, q2, #12 // t12 + vrshrn.i32 d28, q3, #12 // t13 + vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 + vrshrn.i32 d17, q4, #12 // t14 + vrshrn.i32 d30, q2, #12 // t15 + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d23 // t8a + vqadd.s16 d16, d16, d23 // t0a + vqsub.s16 d3, d31, d24 // t9a + vqadd.s16 d31, d31, d24 // t1a + vqadd.s16 d23, d18, d21 // t2a + vqsub.s16 d18, d18, d21 // t10a + vqadd.s16 d24, d29, d26 // t3a + vqsub.s16 d29, d29, d26 // t11a + vqadd.s16 d21, d20, d19 // t4a + vqsub.s16 d20, d20, d19 // t12a + vqadd.s16 d26, d27, d28 // t5a + vqsub.s16 d27, d27, d28 // t13a + vqadd.s16 d19, d22, d17 // t6a + vqsub.s16 d22, d22, d17 // t14a + vqadd.s16 d28, d25, d30 // t7a + vqsub.s16 d25, d25, d30 // t15a + + vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 + vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 + vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 + vrshrn.i32 d17, q2, #12 // t8 + vrshrn.i32 d30, q3, #12 // t9 + vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 + vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 + vrshrn.i32 d18, q4, #12 // t10 + vrshrn.i32 d29, q2, #12 // t11 + vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 + vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 + vrshrn.i32 d27, q3, #12 // t12 + vrshrn.i32 d20, q4, #12 // t13 + vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 + vrshrn.i32 d25, q2, #12 // t14 + vrshrn.i32 d22, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t4 + vqadd.s16 d16, d16, d21 // t0 + vqsub.s16 d3, d31, d26 // t5 + vqadd.s16 d31, d31, d26 // t1 + vqadd.s16 d21, d23, d19 // t2 + vqsub.s16 d23, d23, d19 // t6 + vqadd.s16 d26, d24, d28 // t3 + vqsub.s16 d24, d24, d28 // t7 + vqadd.s16 d19, d17, d27 // t8a + vqsub.s16 d17, d17, d27 // t12a + vqadd.s16 d28, d30, d20 // t9a + vqsub.s16 d30, d30, d20 // t13a + vqadd.s16 d27, d18, d25 // 
t10a + vqsub.s16 d18, d18, d25 // t14a + vqadd.s16 d20, d29, d22 // t11a + vqsub.s16 d29, d29, d22 // t15a + + vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a + vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a + vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a + vrshrn.i32 d22, q2, #12 // t4a + vrshrn.i32 d25, q3, #12 // t5a + vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a + vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 + vrshrn.i32 d24, q4, #12 // t6a + vrshrn.i32 d23, q2, #12 // t7a + vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 + vrshrn.i32 d17, q3, #12 // t12 + vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 + vrshrn.i32 d29, q4, #12 // t13 + vrshrn.i32 d30, q2, #12 // t14 + vrshrn.i32 d18, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s16 \o0, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 +.else + vqadd.s16 d4, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s16 \o15, \o15 // out15 + + vqsub.s16 d3, d29, d18 // t15a + vqadd.s16 \o13,d29, d18 // out13 + vqadd.s16 \o2, d17, d30 // out2 + vqsub.s16 d26, d17, d30 // t14a + vqneg.s16 \o13,\o13 // out13 + + vqadd.s16 \o1, d19, d27 // out1 + vqsub.s16 d27, d19, d27 // t10 + vqadd.s16 \o14,d28, d20 // out14 + vqsub.s16 d20, d28, d20 // t11 + vqneg.s16 \o1, \o1 // out1 + + vqadd.s16 \o3, d22, d24 // out3 + vqsub.s16 d22, d22, d24 // t6 + vqadd.s16 \o12,d25, d23 // out12 + vqsub.s16 d23, d25, d23 // t7 + vqneg.s16 \o3, \o3 // out3 + + vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshrn.i32 d24, q12, #12 // out8 + vrshrn.i32 d4, q2, #12 // out7 + vrshrn.i32 d5, q3, #12 // out5 + vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshrn.i32 d26, q4, #12 // out10 + + vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshrn.i32 \o4, q1, #12 // out4 + vrshrn.i32 d7, q3, #12 // out9 + vrshrn.i32 d6, q4, #12 // out11 + vrshrn.i32 \o6, q11, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s16 \o7, d4 // out7 + vqneg.s16 \o5, d5 // out5 + vqneg.s16 \o11,d6 // out11 + vqneg.s16 \o9, d7 // out9 +.endm + +function inv_adst_4h_x16_neon, export=1 + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_4h_x16_neon, export=1 + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q1, \i, d0[0] + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_4x16_shift2 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vshr.s16 q2, q2, #1 + vrhadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_4x16_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vrshr.s16 q2, q2, #1 + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_8x8_shift1 c + identity_4x16_shift1 \c +.endm + +.macro identity_8x8 c +.irp i, q8, q9, q10, 
q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + push {lr} + vmov.i16 d7, #0 +.if \identity + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.endif +.if \scale + movw r12, #2896*8 + vdup.16 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif +.if \identity +.if \shift == -2 + identity_4x16_shift2 d0[0] +.else + identity_4x16_shift1 d0[0] +.endif +.else + blx r4 +.endif +.if \shift > 0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #\shift +.endr +.endif + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31 + vst1.16 {\i}, [r6, :64]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + blx r9 +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4} +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_identity_16x4_neon +.else + movrel_local r9, inv_txfm_horz_16x4_neon + movrel_local r4, inv_\txfm1\()_4h_x16_neon +.endif + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + +.ifc \variant, identity_ + vmov.i16 d4, #0 +.irp i, d16, d18, d20, d22 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d17, d19, d21, d23 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, d24, d26, d28, d30 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d25, d27, d29, d31 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + + identity_4x16_shift1 d0[0] +.else + vmov.i16 q2, #0 + vmov.i16 q3, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d24, d25, d26, d27}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d28, d29, d30, d31}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! 
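+ // All 16x4 coefficients are now held in q8-q15 and the coefficient buffer
+ // has been cleared; the blx r4 below runs the first-pass (4h x16)
+ // transform that def_fn_416 selected for this layout.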
+ + blx r4 + + vswp d17, d20 + vswp d19, d22 + vswp d18, d20 + vswp d19, d21 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + +.ifc \variant, identity_ + vmov q8, q12 + vmov q9, q13 + vmov q10, q14 + vmov q11, q15 +.else + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 + vrshr.s16 q8, q12, #1 + vrshr.s16 q9, q13, #1 + vrshr.s16 q10, q14, #1 + vrshr.s16 q11, q15, #1 +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + add r6, r0, #8 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_4x16_neon + vmov.i16 q2, #0 + + mov r11, #32 + cmp r3, r10 + blt 1f + + add r6, r2, #16 +.ifc \variant, identity_ +.irp i, q12, q13, q14, q15 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q12, q13, q14, q15, d0[0] +.else +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + blx r4 + vrshr.s16 q12, q8, #1 + vrshr.s16 q13, q9, #1 + vrshr.s16 q14, q10, #1 + vrshr.s16 q15, q11, #1 +.endif + transpose_4x8h q12, q13, q14, q15 + vswp d27, d29 + vswp d26, d28 + vswp d27, d30 + vswp d25, d28 + + b 2f +1: +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +2: + vmov.i16 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr +.ifc \variant, identity_ + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q8, q9, q10, q11, d0[0] +.else + blx r4 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + vswp d19, d21 + vswp d18, d20 + vswp d19, d22 + vswp d17, d20 + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon + mov r10, #\eob_half +.else + movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + sub_sp_align 256 + +.irp i, 0, 4 + add r6, sp, #(\i*16*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #8*2 + blx r9 +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_8x16_neon + sub_sp_align 256 + +.irp i, 0, 8 + add r6, sp, #(\i*8*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + + vmov.i16 q2, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128] + vst1.16 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.ifc \variant, identity_ + // The identity shl #1 and downshift vrshr #1 cancel out +.else + blx r4 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \j, \j, #1 +.endr +.endif + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + vst1.16 {q8, q9}, [r6, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr +2: + +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_8h_x8_neon + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.else +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon +.else + movrel_local r4, inv_\txfm1\()_4h_x16_neon + movrel_local r9, inv_txfm_horz_scale_16x4_neon +.endif + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.endif +.if \w == 8 + mov r10, #\eob_8x8 +.else + mov r10, #\eob_4x4 +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43, 10 +def_fn_816 \w, \h, identity, identity, 43, 10 +def_fn_816 \w, \h, dct, adst, 43, 10 +def_fn_816 \w, \h, dct, flipadst, 43, 10 +def_fn_816 \w, \h, dct, identity, 8, 4 +def_fn_816 \w, \h, adst, dct, 43, 10 +def_fn_816 \w, \h, adst, adst, 43, 10 +def_fn_816 \w, \h, adst, flipadst, 43, 10 +def_fn_816 \w, \h, flipadst, dct, 43, 10 +def_fn_816 \w, \h, flipadst, adst, 43, 10 +def_fn_816 \w, \h, flipadst, flipadst, 43, 10 +def_fn_816 \w, \h, identity, dct, 64, 4 +def_fn_816 \w, \h, adst, identity, 8, 4 +def_fn_816 \w, \h, flipadst, identity, 8, 4 +def_fn_816 \w, \h, identity, adst, 64, 4 +def_fn_816 \w, \h, identity, flipadst, 64, 4 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs, 2*16 + vld1.16 {q0, q1}, [r12, :128] + sub r12, r12, #2*16 + + vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a + vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a + vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a + vrshrn.i32 d16, q2, #12 // t16a + vrshrn.i32 d31, q3, #12 // t31a + vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a + vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a + vrshrn.i32 d24, q4, #12 // t17a + vrshrn.i32 d23, q2, #12 // t30a + vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a + 
vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a + vrshrn.i32 d20, q3, #12 // t18a + vrshrn.i32 d27, q4, #12 // t29a + vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a + vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a + vrshrn.i32 d28, q2, #12 // t19a + vrshrn.i32 d19, q3, #12 // t28a + vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a + vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a + vrshrn.i32 d18, q4, #12 // t20a + vrshrn.i32 d29, q2, #12 // t27a + vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a + vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a + vrshrn.i32 d26, q3, #12 // t21a + vrshrn.i32 d21, q4, #12 // t26a + vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a + vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a + vrshrn.i32 d22, q2, #12 // t22a + vrshrn.i32 d25, q3, #12 // t25a + vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a + vrshrn.i32 d30, q4, #12 // t23a + vrshrn.i32 d17, q2, #12 // t24a + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d24 // t17 + vqadd.s16 d16, d16, d24 // t16 + vqsub.s16 d3, d31, d23 // t30 + vqadd.s16 d31, d31, d23 // t31 + vqsub.s16 d24, d28, d20 // t18 + vqadd.s16 d28, d28, d20 // t19 + vqadd.s16 d23, d18, d26 // t20 + vqsub.s16 d18, d18, d26 // t21 + vqsub.s16 d20, d30, d22 // t22 + vqadd.s16 d30, d30, d22 // t23 + vqadd.s16 d26, d17, d25 // t24 + vqsub.s16 d17, d17, d25 // t25 + vqsub.s16 d22, d29, d21 // t26 + vqadd.s16 d29, d29, d21 // t27 + vqadd.s16 d25, d19, d27 // t28 + vqsub.s16 d19, d19, d27 // t29 + + vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a + vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a + vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a + vrshrn.i32 d21, q2, #12 // t17a + vrshrn.i32 d27, q3, #12 // t30a + vneg.s32 q4, q4 // -> t18a + vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a + vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a + vrshrn.i32 d19, q4, #12 // t18a + vrshrn.i32 d24, q1, #12 // t29a + vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a + vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a + vrshrn.i32 d22, q2, #12 // t21a + vrshrn.i32 d18, q3, #12 // t26a + vneg.s32 q4, q4 // -> t22a + vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a + vrshrn.i32 d17, q4, #12 // t22a + vrshrn.i32 d20, q1, #12 // t25a + + vqsub.s16 d2, d27, d24 // t29 + vqadd.s16 d27, d27, d24 // t30 + vqsub.s16 d3, d21, d19 // t18 + vqadd.s16 d21, d21, d19 // t17 + vqsub.s16 d24, d16, d28 // t19a + vqadd.s16 d16, d16, d28 // t16a + vqsub.s16 d19, d30, d23 // t20a + vqadd.s16 d30, d30, d23 // t23a + vqsub.s16 d28, d17, d22 // t21 + vqadd.s16 d17, d17, d22 // t22 + vqadd.s16 d23, d26, d29 // t24a + vqsub.s16 d26, d26, d29 // t27a + vqadd.s16 d22, d20, d18 // t25 + vqsub.s16 d20, d20, d18 // t26 + vqsub.s16 d29, d31, d25 // t28a + vqadd.s16 d31, d31, d25 // t31a + + vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a + vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a + vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 + vrshrn.i32 d18, q2, #12 // t18a + vrshrn.i32 d25, q3, #12 // t29a + vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 + vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 + vrshrn.i32 d29, q4, #12 // t19 + vrshrn.i32 d24, q1, #12 // t28 + vneg.s32 q2, q2 // -> t20 + vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 + vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a + vrshrn.i32 d26, q2, #12 // t20 + vrshrn.i32 d19, q3, #12 // t27 + vneg.s32 q4, q4 // -> t21a + vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a + vrshrn.i32 d20, q4, #12 // t21a + vrshrn.i32 d28, q1, #12 // t26a + + vqsub.s16 d2, d16, d30 // t23 + vqadd.s16 d16, d16, 
d30 // t16 = out16 + vqsub.s16 d3, d31, d23 // t24 + vqadd.s16 d31, d31, d23 // t31 = out31 + vqsub.s16 d23, d21, d17 // t22a + vqadd.s16 d17, d21, d17 // t17a = out17 + vqadd.s16 d30, d27, d22 // t30a = out30 + vqsub.s16 d21, d27, d22 // t25a + vqsub.s16 d27, d18, d20 // t21 + vqadd.s16 d18, d18, d20 // t18 = out18 + vqadd.s16 d4, d29, d26 // t19a = out19 + vqsub.s16 d26, d29, d26 // t20a + vqadd.s16 d29, d25, d28 // t29 = out29 + vqsub.s16 d25, d25, d28 // t26 + vqadd.s16 d28, d24, d19 // t28a = out28 + vqsub.s16 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 + vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 + vrshrn.i32 d20, q2, #12 // t20 + vrshrn.i32 d22, q3, #12 // t27 + + vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a + vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshrn.i32 d26, q2, #12 // t26a + + vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 + vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 + vrshrn.i32 d21, q3, #12 // t21a + vrshrn.i32 d22, q12, #12 // t22 + vrshrn.i32 d25, q2, #12 // t25 + + vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a + vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a + vrshrn.i32 d23, q2, #12 // t23a + vrshrn.i32 d24, q3, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + push {lr} + vmov.i16 d7, #0 + lsl r8, r8, #1 +.if \scale + movw r12, #2896*8 + vdup.16 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_4h_x16_neon + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! + vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! + add r6, r6, #32 +.endm + store1 d16, d20, d24, d28 + store1 d17, d21, d25, d29 + store1 d18, d22, d26, d30 + store1 d19, d23, d27, d31 +.purgem store1 + sub r6, r6, #64*4 + + vmov.i16 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_4h_x16_neon + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q13, q12, d27, d26, d25, d24 + transpose_4x4h q11, q10, d23, d22, d21, d20 + transpose_4x4h q9, q8, d19, d18, d17, d16 +.macro store2 r0, r1, r2, r3, shift + vld1.16 {q0, q1}, [r6, :128] + vqsub.s16 d7, d0, \r0 + vqadd.s16 d0, d0, \r0 + vqsub.s16 d6, d1, \r1 + vqadd.s16 d1, d1, \r1 + vqsub.s16 d5, d2, \r2 + vqadd.s16 d2, d2, \r2 + vqsub.s16 d4, d3, \r3 + vqadd.s16 d3, d3, \r3 + vrev64.16 q2, q2 + vrev64.16 q3, q3 + vrshr.s16 q0, q0, #\shift + vrshr.s16 q1, q1, #\shift + vrshr.s16 q2, q2, #\shift + vrshr.s16 q3, q3, #\shift + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! 
+.endm + + store2 d31, d27, d23, d19, \shift + store2 d30, d26, d22, d18, \shift + store2 d29, d25, d21, d17, \shift + store2 d28, d24, d20, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl inv_dct_4h_x16_neon + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl inv_dct32_odd_4h_x16_neon + + neg r9, r8 + mov r10, r6 +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.32 {d2[0]}, [r10, :32], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.32 {d2[1]}, [r10, :32], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.32 {d3[0]}, [r10, :32], r1 + \op\().s16 d5, d5, \r1 + vld1.32 {d3[1]}, [r10, :32], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vaddw.u8 q2, q2, d2 + \op\().s16 d7, d7, \r3 + vqmovun.s16 d2, q2 + vrshr.s16 q3, q3, #4 + vst1.32 {d2[0]}, [r6, :32], r1 + vaddw.u8 q3, q3, d3 + vst1.32 {d2[1]}, [r6, :32], r1 + vqmovun.s16 d3, q3 + vst1.32 {d3[0]}, [r6, :32], r1 + vst1.32 {d3[1]}, [r6, :32], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop {r10-r11,pc} +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + // Contrary to the others, this one is only ever used in increments of 8x8 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + push {r4-r7,lr} + vmov.i16 q0, #0 + movrel_local r5, eob_32x32, 2 + + mov r6, #2*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 2 +2: + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r6 +.endr + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + load_add_store_8x8 r0, r7, shiftbits=2 + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r7,lr} + movw r6, #2896*8 + movw r7, #2*(5793-4096)*8 + vdup.i16 d0, r6 + movrel_local r5, eob_16x32\hshort, 2 + vmov.16 d0[1], r7 + + mov r6, #2*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 2 +2: + 
vmov.i16 q1, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s16, 1 + identity_8x8 d0[1] +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + +.if \w == 16 + load_add_store_8x8 r0, r7, shiftbits=2 +.else + load_add_store_8x8 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q0, #0 + movrel_local r4, eob_8x32 + + mov r12, #2*\h +1: + ldrh lr, [r4], #2 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r12 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs vrshr.s16, 1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + cmp r3, lr +.if \w == 8 + load_add_store_8x8 r0, r5, shiftbits=2 +.else + load_add_store_8x8 r0, r5, shiftbits=3 +.endif + + blt 9f +.if \w == 8 + sub r2, r2, r12, lsl #3 + add r2, r2, #2*8 +.else + sub r0, r0, r1, lsl #3 + add r0, r0, #8 +.endif + b 1b + +9: + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r5, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32 + + mov r8, #2*32 + mov r9, #32 + mov r6, sp +1: + vmov.i16 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #2 + sub r2, r2, r8, lsl #3 + sub r9, r9, #8 + add r2, r2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #2 +.endr + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #8 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + +.irp i, 0, 4 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + cmp r3, #10 + blt 1f +.endif + mov r8, #8*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + +2: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl inv_dct_8h_x8_neon + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.16 {d0, d1, d2}, [r12, :64]! 
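+ // d0-d2 now hold the next 12 halfword constants from idct64_coeffs; the
+ // post-increment leaves r12 pointing at the group used by the next call.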
+ + vqrdmulh.s16 d23, d16, d0[1] // t63a + vqrdmulh.s16 d16, d16, d0[0] // t32a + vqrdmulh.s16 d22, d17, d0[2] // t62a + vqrdmulh.s16 d17, d17, d0[3] // t33a + vqrdmulh.s16 d21, d18, d1[1] // t61a + vqrdmulh.s16 d18, d18, d1[0] // t34a + vqrdmulh.s16 d20, d19, d1[2] // t60a + vqrdmulh.s16 d19, d19, d1[3] // t35a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t33 + vqsub.s16 d26, d19, d18 // t34 + vqadd.s16 d27, d19, d18 // t35 + vqadd.s16 d28, d20, d21 // t60 + vqsub.s16 d29, d20, d21 // t61 + vqsub.s16 d30, d23, d22 // t62 + vqadd.s16 d31, d23, d22 // t63 + + vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a + vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a + vneg.s32 q2, q2 // t34a + vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a + vrshrn.i32 d26, q2, #12 // t34a + vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a + vrshrn.i32 d29, q3, #12 // t61a + vrshrn.i32 d25, q4, #12 // t33a + vrshrn.i32 d30, q2, #12 // t62a + + vqadd.s16 d16, d24, d27 // t32a + vqsub.s16 d19, d24, d27 // t35a + vqadd.s16 d17, d25, d26 // t33 + vqsub.s16 d18, d25, d26 // t34 + vqsub.s16 d20, d31, d28 // t60a + vqadd.s16 d23, d31, d28 // t63a + vqsub.s16 d21, d30, d29 // t61 + vqadd.s16 d22, d30, d29 // t62 + + vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a + vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a + vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 + vrshrn.i32 d21, q2, #12 // t61a + vrshrn.i32 d18, q3, #12 // t34a + vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 + vrshrn.i32 d20, q4, #12 // t60 + vrshrn.i32 d19, q2, #12 // t35 + + vst1.16 {d16, d17, d18, d19}, [r6, :128]! + vst1.16 {d20, d21, d22, d23}, [r6, :128]! + + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #2*4*0] // t32a + vldr d17, [r9, #2*4*8] // t39a + vldr d18, [r9, #2*4*0] // t63a + vldr d19, [r6, #2*4*8] // t56a + vldr d20, [r6, #2*4*16] // t40a + vldr d21, [r9, #2*4*24] // t47a + vldr d22, [r9, #2*4*16] // t55a + vldr d23, [r6, #2*4*24] // t48a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t39 + vqadd.s16 d26, d18, d19 // t63 + vqsub.s16 d27, d18, d19 // t56 + vqsub.s16 d28, d21, d20 // t40 + vqadd.s16 d29, d21, d20 // t47 + vqadd.s16 d30, d23, d22 // t48 + vqsub.s16 d31, d23, d22 // t55 + + vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a + vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a + vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a + vrshrn.i32 d25, q2, #12 // t56a + vrshrn.i32 d27, q3, #12 // t39a + vneg.s32 q4, q4 // t40a + vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a + vrshrn.i32 d31, q4, #12 // t40a + vrshrn.i32 d28, q2, #12 // t55a + + vqadd.s16 d16, d24, d29 // t32a + vqsub.s16 d19, d24, d29 // t47a + vqadd.s16 d17, d27, d31 // t39 + vqsub.s16 d18, d27, d31 // t40 + vqsub.s16 d20, d26, d30 // t48a + vqadd.s16 d23, d26, d30 // t63a + vqsub.s16 d21, d25, d28 // t55 + vqadd.s16 d22, d25, d28 // t56 + + vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a + vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a + vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 + vrshrn.i32 d18, q2, #12 // t40a + vrshrn.i32 d21, q3, #12 // t55a + vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 + vrshrn.i32 d19, q4, #12 // t47 + vrshrn.i32 d20, q2, #12 // t48 + + vstr d16, [r6, #2*4*0] // t32a + vstr d17, [r9, #2*4*0] // t39 + vstr d18, [r6, #2*4*8] // t40a + vstr d19, [r9, #2*4*8] // t47 
+ vstr d20, [r6, #2*4*16] // t48 + vstr d21, [r9, #2*4*16] // t55a + vstr d22, [r6, #2*4*24] // t56 + vstr d23, [r9, #2*4*24] // t63a + + add r6, r6, #2*4 + sub r9, r9, #2*4 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.16 {\i}, [\src, :64] + vst1.16 {\zero}, [\src, :64], \strd +.else + vld1.16 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.16 {q8, q9}, [\dst, :128]! + vst1.16 {q10, q11}, [\dst, :128]! + vst1.16 {q12, q13}, [\dst, :128]! + vst1.16 {q14, q15}, [\dst, :128]! +.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i16 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + movw \gpr, \val + vdup.16 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.16 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_4h_x16_neon + + store16 r6 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_4h_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.16 {d2}, [r6, :64]! + vld1.16 {d3}, [r6, :64]! + vqadd.s16 d6, d2, \r0 + vqsub.s16 \r0, d2, \r0 + vld1.16 {d4}, [r6, :64]! + vqadd.s16 d7, d3, \r1 + vqsub.s16 \r1, d3, \r1 + vld1.16 {d5}, [r6, :64]! + vqadd.s16 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s16 \r2, d4, \r2 + vst1.16 {d6}, [r6, :64]! + vst1.16 {\r0}, [r10, :64], r9 + vqadd.s16 d3, d5, \r3 + vqsub.s16 \r3, d5, \r3 + vst1.16 {d7}, [r6, :64]! + vst1.16 {\r1}, [r10, :64], r9 + vst1.16 {d2}, [r6, :64]! + vst1.16 {\r2}, [r10, :64], r9 + vst1.16 {d3}, [r6, :64]! 
+ vst1.16 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.16 {d16}, [r7, :64] // in1 (offset 0) + vld1.16 {d17}, [r9, :64] // in31 (offset 15) + vld1.16 {d18}, [r10, :64] // in17 (offset 8) + vld1.16 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.16 {d16}, [r10, :64] // in7 (offset 3) + vld1.16 {d17}, [r11, :64] // in25 (offset 12) + vld1.16 {d18}, [r9, :64] // in23 (offset 11) + vld1.16 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.16 d16, [r10, :64] // in5 (offset 2) + vld1.16 d17, [r11, :64] // in27 (offset 13) + vld1.16 d18, [r9, :64] // in21 (offset 10) + vld1.16 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.16 d16, [r10, :64] // in3 (offset 1) + vld1.16 d17, [r11, :64] // in29 (offset 14) + vld1.16 d18, [r9, :64] // in19 (offset 9) + vld1.16 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x4_neon + vdup.16 q3, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q13, q12, d27, d26, d25, d24 + +.macro store_addsub src0, src1, src2, src3 + vqsub.s16 d3, \src0, \src1 + vqsub.s16 d2, \src2, \src3 + vqadd.s16 d0, \src0, \src1 + vqadd.s16 d1, \src2, \src3 + vrshl.s16 q1, q1, q3 + vrshl.s16 q0, q0, q3 + vrev64.16 q1, q1 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q1}, [r9, :128], r10 +.endm + store_addsub d16, d31, d20, d27 + store_addsub d17, d30, d21, d26 + store_addsub d18, d29, d22, d25 + store_addsub d19, d28, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #2 + sub r9, r9, r10, lsl #2 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! + vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + +.macro add_dest_addsub src0, src1, src2, src3 + vld1.32 {d0[0]}, [r6, :32], r1 + vld1.32 {d1[0]}, [r9, :32], r10 + vqadd.s16 d4, \src0, \src1 + vld1.32 {d0[1]}, [r6, :32] + vqadd.s16 d5, \src2, \src3 + vld1.32 {d1[1]}, [r9, :32] + vqsub.s16 d6, \src0, \src1 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vaddw.u8 q2, q2, d0 + vaddw.u8 q3, q3, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 + vst1.32 {d0[0]}, [r6, :32], r1 + vst1.32 {d1[0]}, [r9, :32], r10 + vst1.32 {d0[1]}, [r6, :32], r1 + vst1.32 {d1[1]}, [r9, :32], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_scale_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 4, 8, 12 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + movrel_local r5, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S new file mode 100644 index 00000000000..b06e109ddab --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/msac.S @@ -0,0 +1,575 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 4 +#define DIF 8 +#define RNG 12 +#define CNT 16 +#define ALLOW_UPDATE_CDF 20 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits, align=4 + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro vld1_align_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vld1.16 {\q0}, [\src, :128] +.else + vld1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vld1_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src] +.elseif \n == 8 + vld1.16 {\q0}, [\src] +.else + vld1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vst1_align_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vst1.16 {\q0}, [\src, :128] +.else + vst1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vst1_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src] +.elseif \n == 8 + vst1.16 {\q0}, [\src] +.else + vst1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshr.u16 \d0, \s0, \s3 +.else + vshr.u16 \d1, \s1, \s4 +.if \n == 16 + vshr.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vadd.i16 \d0, \s0, \s3 +.else + vadd.i16 \d1, \s1, \s4 +.if \n == 16 + vadd.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vsub.i16 \d0, \s0, \s3 +.else + vsub.i16 \d1, \s1, \s4 +.if \n == 16 + vsub.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vand \d0, \s0, \s3 +.else + vand \d1, \s1, \s4 +.if \n == 16 + vand \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vcge.u16 \d0, \s0, \s3 +.else + vcge.u16 \d1, \s1, \s4 +.if \n == 16 + vcge.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vrhadd.u16 \d0, \s0, \s3 +.else + vrhadd.u16 \d1, \s1, \s4 +.if \n == 16 + vrhadd.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshl.s16 \d0, \s0, \s3 +.else + vshl.s16 \d1, \s1, \s4 +.if \n == 16 + vshl.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vqdmulh.s16 \d0, \s0, \s3 +.else + vqdmulh.s16 \d1, \s1, \s4 +.if \n == 16 + vqdmulh.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, +// size_t n_symbols); + +function msac_decode_symbol_adapt4_neon, export=1 +.macro decode_update n + push {r4-r10,lr} + sub sp, sp, #48 + add r8, r0, #RNG + + vld1_align_n d0, q0, q1, r1, \n // cdf + vld1.16 {d16[]}, [r8, :16] // rng + movrel_local r9, coeffs, 30 + vmov.i16 d30, #0x7f00 // 0x7f00 + sub r9, r9, r2, lsl #1 + vmvn.i16 q14, #0x3f // 0xffc0 + add r8, sp, #14 + vand d22, d16, d30 // rng & 0x7f00 + vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng + vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 +.if \n > 4 + vmov d23, d22 +.endif + + vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) + vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r8, r0, #DIF + 2 + + vadd_n d16, q8, q9, 
d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) +.if \n == 4 + vmov.i16 d17, #0 +.endif + vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + + add r9, sp, #16 + vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) + movrel_local r8, bits + vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access + + vmov d21, d20 + vld1_align_n q12, q12, q13, r8, \n +.if \n == 16 + vmov q11, q10 +.endif + + vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v + + vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask +.if \n == 16 + vadd.i16 q10, q10, q11 +.endif + vadd.i16 d20, d20, d21 // Aggregate mask bits + ldr r4, [r0, #ALLOW_UPDATE_CDF] + vpadd.i16 d20, d20, d20 + lsl r10, r2, #1 + vpadd.i16 d20, d20, d20 + vmov.u16 r3, d20[0] + cmp r4, #0 + rbit r3, r3 + clz lr, r3 // ret + + beq L(renorm) + // update_cdf + ldrh r3, [r1, r10] // count = cdf[n_symbols] + vmov.i8 q10, #0xff +.if \n == 16 + mov r4, #-5 +.else + mvn r12, r2 + mov r4, #-4 + cmn r12, #3 // set C if n_symbols <= 2 +.endif + vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 +.if \n == 16 + sub r4, r4, r3, lsr #4 // -((count >> 4) + 5) +.else + lsr r12, r3, #4 // count >> 4 + sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) +.endif + vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) +.if \n == 4 + vdup.16 d20, r4 // -rate +.else + vdup.16 q10, r4 // -rate +.endif + + sub r3, r3, r3, lsr #5 // count - (count == 32) + vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 1 : 0) + vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate + add r3, r3, #1 // count + (count < 32) + vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate + vst1_align_n d0, q0, q1, r1, \n + strh r3, [r1, r10] +.endm + + decode_update 4 + +L(renorm): + add r8, sp, #16 + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) +L(renorm2): + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + str r6, [r0, #CNT] + str r7, [r0, #DIF] + + mov r0, lr + add sp, sp, #48 + + pop {r4-r10,pc} +endfunc + +function msac_decode_symbol_adapt8_neon, export=1 + decode_update 8 + b L(renorm) +endfunc + +function 
msac_decode_symbol_adapt16_neon, export=1 + decode_update 16 + b L(renorm) +endfunc + +function msac_decode_hi_tok_neon, export=1 + push {r4-r10,lr} + vld1.16 {d0}, [r1, :64] // cdf + add r4, r0, #RNG + vmov.i16 d31, #0x7f00 // 0x7f00 + movrel_local r5, coeffs, 30-2*3 + vmvn.i16 d30, #0x3f // 0xffc0 + ldrh r9, [r1, #6] // count = cdf[n_symbols] + vld1.16 {d1[]}, [r4, :16] // rng + movrel_local r4, bits + vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) + add r5, r0, #DIF + 2 + vld1.16 {q8}, [r4, :128] + mov r2, #-24 + vand d20, d0, d30 // cdf & 0xffc0 + ldr r10, [r0, #ALLOW_UPDATE_CDF] + vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + vmov d3, d2 +1: + vand d23, d1, d31 // rng & 0x7f00 + vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r12, sp, #14 + vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) + vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + vmov.i16 d7, #0 + vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng + add r12, sp, #16 + vcge.u16 q2, q1, q3 // c >= v + vst1.16 {q3}, [r12] // store v values to allow indexed access + vand q9, q2, q8 // One bit per halfword set in the mask + + vadd.i16 d18, d18, d19 // Aggregate mask bits + vpadd.i16 d18, d18, d18 + vpadd.i16 d18, d18, d18 + vmov.u16 r3, d18[0] + cmp r10, #0 + add r2, r2, #5 + rbit r3, r3 + add r8, sp, #16 + clz lr, r3 // ret + + beq 2f + // update_cdf + vmov.i8 d22, #0xff + mov r4, #-5 + vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768 + sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) + vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) + vdup.16 d18, r4 // -rate + + sub r9, r9, r9, lsr #5 // count - (count == 32) + vsub.i16 d0, d0, d4 // cdf + (i >= val ? 
1 : 0) + vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate + add r9, r9, #1 // count + (count < 32) + vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate + vst1.16 {d0}, [r1, :64] + vand d20, d0, d30 // cdf & 0xffc0 + strh r9, [r1, #6] + +2: + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + vdup.16 d1, r4 + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 40 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 40 - c + +9: + lsl lr, lr, #1 + sub lr, lr, #5 + lsr r12, r7, #16 + adds r2, r2, lr // carry = tok_br < 3 || tok == 15 + vdup.16 q1, r12 + bcc 1b // loop if !carry + add r2, r2, #30 + str r6, [r0, #CNT] + add sp, sp, #48 + str r7, [r0, #DIF] + lsr r0, r2, #1 + pop {r4-r10,pc} +endfunc + +function msac_decode_bool_equi_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + bic r4, r5, #0xff // r &= 0xff00 + add r4, r4, #8 + mov r2, #0 + subs r8, r7, r4, lsl #15 // dif - vw + lsr r4, r4, #1 // v + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + bic r1, r1, #0x3f // f &= ~63 + mul r4, r4, r1 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + push {r4-r10,lr} + ldr r9, [r1] // cdf[0-1] + ldr r5, [r0, #RNG] + movw lr, #0xffc0 + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + and r2, r9, lr // f &= ~63 + mul r4, r4, r2 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + ldr r10, [r0, #ALLOW_UPDATE_CDF] + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + cmp r10, #0 + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + + beq L(renorm2) + + 
lsr r2, r9, #16 // count = cdf[1] + uxth r9, r9 // cdf[0] + + sub r3, r2, r2, lsr #5 // count - (count >= 32) + lsr r2, r2, #4 // count >> 4 + add r10, r3, #1 // count + (count < 32) + add r2, r2, #4 // rate = (count >> 4) | 4 + + sub r9, r9, lr // cdf[0] -= bit + sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} + asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate + sub r9, r9, r3 // cdf[0] + + strh r9, [r1] + strh r10, [r1, #2] + + b L(renorm2) +endfunc diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S index ea4afc38d6b..6af0158e09b 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/util.S @@ -84,6 +84,23 @@ vtrn.8 \r6, \r7 .endm +.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 + vswp \d0, \d4 + vswp \d1, \d5 + vswp \d2, \d6 + vswp \d3, \d7 + + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm + .macro transpose_4x8b q0, q1, r0, r1, r2, r3 vtrn.16 \q0, \q1 @@ -91,4 +108,19 @@ vtrn.8 \r2, \r3 .endm +.macro transpose_4x4h q0, q1, r0, r1, r2, r3 + vtrn.32 \q0, \q1 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +.macro transpose_4x8h r0, r1, r2, r3 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + #endif /* DAV1D_SRC_ARM_32_UTIL_S */ diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S index b6c0c14aab8..245af0e786e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S @@ -58,7 +58,6 @@ // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. -// - Special case functions for e.g. more combinations with identity. 
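The header comment above explains that a small eob means most input coefficients are zero, so for idct16 and larger a whole slice of the first pass can be skipped. A minimal C sketch of that kind of gating, with a hypothetical threshold table and slice helper (not dav1d's actual code), looks like this:

    #include <stdint.h>

    /* Hypothetical sketch: run the first pass in 8-column slices and stop
     * once eob drops below the per-slice threshold, since the remaining
     * slices then contain only zero coefficients. */
    typedef void (*slice_fn)(int16_t *tmp, const int16_t *coeff, int h);

    static void first_pass_gated(int16_t *const tmp, const int16_t *const coeff,
                                 const int w, const int h, const int eob,
                                 const uint16_t *const eob_thresholds,
                                 const slice_fn do_slice)
    {
        for (int i = 0; i < w; i += 8) {
            if (i > 0 && eob < eob_thresholds[i / 8 - 1])
                break;                        // rest of the input is zero
            do_slice(tmp + i, coeff + i, h);  // transform one 8-column slice
        }
    }
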
const idct_coeffs, align=4 // idct4 @@ -106,7 +105,7 @@ const iadst8_coeffs, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 // idct_coeffs - .short 2896, 2896*8, 1567, 3784, 0, 0, 0, 0 + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst const iadst16_coeffs, align=4 @@ -134,13 +133,6 @@ endconst .endif .endm -.macro smull_sz d0, d1, s0, c, sz - smull \d0\().4s, \s0\().4h, \c -.ifc \sz, .8h - smull2 \d1\().4s, \s0\().8h, \c -.endif -.endm - .macro rshrn_sz d0, s0, s1, shift, sz rshrn \d0\().4h, \s0\().4s, \shift .ifc \sz, .8h @@ -457,14 +449,14 @@ endfunc sqsub \r2\sz, v3\sz, v7\sz .endm -function inv_dct_4x4_neon +function inv_dct_4h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .4h ret endfunc -function inv_dct_8x4_neon +function inv_dct_8h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .8h @@ -497,12 +489,12 @@ endfunc rshrn \o3\().4h, \o3\().4s, #12 .endm -function inv_adst_4x4_neon +function inv_adst_4h_x4_neon, export=1 iadst_4x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_4x4_neon +function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 v19, v18, v17, v16 ret endfunc @@ -563,17 +555,17 @@ endfunc rshrn2 \o3\().8h, v5.4s, #12 .endm -function inv_adst_8x4_neon +function inv_adst_8h_x4_neon, export=1 iadst_8x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_8x4_neon +function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 v19, v18, v17, v16 ret endfunc -function inv_identity_4x4_neon +function inv_identity_4h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.4h, v16.4h, v0.h[0] @@ -587,7 +579,7 @@ function inv_identity_4x4_neon ret endfunc -function inv_identity_8x4_neon +function inv_identity_8h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.8h, v16.8h, v0.h[0] @@ -608,7 +600,7 @@ endfunc .endr .endm -function inv_txfm_add_wht_wht_4x4_neon, export=1 +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 mov x15, x30 movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] @@ -672,7 +664,7 @@ L(itx_4x4_end): endfunc .macro def_fn_4x4 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct @@ -692,8 +684,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4x4_neon - adr x5, inv_\txfm2\()_4x4_neon + adr x4, inv_\txfm1\()_4h_x4_neon + adr x5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm @@ -749,14 +741,14 @@ def_fn_4x4 identity, flipadst mov \r6\szb, v6\szb // out6 .endm -function inv_dct_8x8_neon +function inv_dct_8h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b ret endfunc -function inv_dct_4x8_neon +function inv_dct_4h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b @@ -830,27 +822,27 @@ endfunc sqneg \o5\()\sz, v3\sz // out5 .endm -function inv_adst_8x8_neon +function inv_adst_8h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h ret endfunc -function inv_flipadst_8x8_neon +function inv_flipadst_8h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h ret endfunc -function inv_adst_4x8_neon +function inv_adst_4h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h ret endfunc -function inv_flipadst_4x8_neon +function 
inv_flipadst_4h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h ret endfunc -function inv_identity_8x8_neon +function inv_identity_8h_x8_neon, export=1 sqshl v16.8h, v16.8h, #1 sqshl v17.8h, v17.8h, #1 sqshl v18.8h, v18.8h, #1 @@ -862,7 +854,7 @@ function inv_identity_8x8_neon ret endfunc -function inv_identity_4x8_neon +function inv_identity_4h_x8_neon, export=1 sqshl v16.4h, v16.4h, #1 sqshl v17.4h, v17.4h, #1 sqshl v18.4h, v18.4h, #1 @@ -913,17 +905,17 @@ def_fn_8x8_base def_fn_8x8_base identity_ .macro def_fn_8x8 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif - adr x5, inv_\txfm2\()_8x8_neon + adr x5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else - adr x4, inv_\txfm1\()_8x8_neon + adr x4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc @@ -1000,14 +992,14 @@ function inv_txfm_add_4x8_neon endfunc .macro def_fn_48 w, h, txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif - adr x4, inv_\txfm1\()_\h\()x\w\()_neon - adr x5, inv_\txfm2\()_\w\()x\h\()_neon + adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon + adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm @@ -1118,14 +1110,14 @@ def_fns_48 8, 4 mov v22\szb, v3\szb .endm -function inv_dct_8x16_neon +function inv_dct_8h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .8h, .16b ret endfunc -function inv_dct_4x16_neon +function inv_dct_4h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .4h, .8b @@ -1302,27 +1294,27 @@ endfunc sqneg \o9\sz, v7\sz // out9 .endm -function inv_adst_8x16_neon +function inv_adst_8h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b ret endfunc -function inv_flipadst_8x16_neon +function inv_flipadst_8h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b ret endfunc -function inv_adst_4x16_neon +function inv_adst_4h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b ret endfunc -function inv_flipadst_4x16_neon +function inv_flipadst_4h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b ret endfunc -function inv_identity_8x16_neon +function inv_identity_8h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1333,7 +1325,7 @@ function inv_identity_8x16_neon ret endfunc -function inv_identity_4x16_neon +function inv_identity_4h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1376,71 +1368,49 @@ endfunc .endr .endm -function inv_txfm_horz_16x8_neon +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x8_neon mov x14, x30 movi v7.8h, #0 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] - st1 {v7.8h}, [x7], x8 -.endr - blr x4 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - srshr v\i\().8h, 
v\i\().8h, #2 -.endr - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 - transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 - -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 -.endr - - br x14 -endfunc - -function inv_txfm_horz_identity_16x8_neon - mov x14, x30 - movi v7.8h, #0 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] - st1 {v7.8h}, [x7], x8 -.endr +.if \identity mov w16, #2*(5793-4096)*8 dup v0.4h, w16 - identity_8x16_shift2 v0.h[0] - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 - transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 - -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 -.endr - - br x14 -endfunc - -function inv_txfm_horz_scale_16x8_neon - mov x14, x30 - movi v7.8h, #0 +.elseif \scale mov w16, #2896*8 dup v0.4h, w16 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] +.endif +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr +.if \scale scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif +.if \identity + identity_8x16_shift2 v0.h[0] +.else blr x4 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - srshr v\i\().8h, v\i\().8h, #1 +.endif +.if \shift > 0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + srshr \i, \i, #\shift .endr +.endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 +.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h + st1 {\i}, [x6], #16 .endr br x14 endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=0, suffix=_identity function inv_txfm_add_vert_8x16_neon mov x14, x30 @@ -1487,7 +1457,7 @@ function inv_txfm_add_16x16_neon endfunc .macro def_fn_16x16 txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif @@ -1495,9 +1465,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1 adr x9, inv_txfm_horz_identity_16x8_neon .else adr x9, inv_txfm_horz_16x8_neon - adr x4, inv_\txfm1\()_8x16_neon + adr x4, inv_\txfm1\()_8h_x16_neon .endif - adr x5, inv_\txfm2\()_8x16_neon + adr x5, inv_\txfm2\()_8h_x16_neon mov x13, #\eob_half b inv_txfm_add_16x16_neon endfunc @@ -1659,17 +1629,17 @@ def_fn_416_base def_fn_416_base identity_ .macro def_fn_416 w, h, txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_4x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else - adr x4, 
inv_\txfm1\()_4x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_4h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon @@ -1842,12 +1812,12 @@ def_fn_816_base def_fn_816_base identity_ .macro def_fn_816 w, h, txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half .endif @@ -1881,7 +1851,7 @@ def_fn_816 \w, \h, identity, flipadst, 64 def_fns_816 8, 16 def_fns_816 16, 8 -function inv_dct32_odd_8x16_neon +function inv_dct32_odd_8h_x16_neon, export=1 movrel x16, idct_coeffs, 2*16 ld1 {v0.8h, v1.8h}, [x16] sub x16, x16, #2*16 @@ -2059,7 +2029,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -2089,15 +2059,13 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 .macro store2 r0, r1, shift - ld1 {v4.8h}, [x6], #16 - ld1 {v5.8h}, [x6] + ld1 {v4.8h, v5.8h}, [x6] sqsub v7.8h, v4.8h, \r0 sqsub v6.8h, v5.8h, \r1 - sub x6, x6, #16 sqadd v4.8h, v4.8h, \r0 sqadd v5.8h, v5.8h, \r1 rev64 v6.8h, v6.8h @@ -2106,12 +2074,10 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon srshr v5.8h, v5.8h, #\shift srshr v6.8h, v6.8h, #\shift srshr v7.8h, v7.8h, #\shift - st1 {v4.8h}, [x6], #16 ext v6.16b, v6.16b, v6.16b, #8 - st1 {v5.8h}, [x6], #16 + st1 {v4.8h, v5.8h}, [x6], #32 ext v7.16b, v7.16b, v7.16b, #8 - st1 {v6.8h}, [x6], #16 - st1 {v7.8h}, [x6], #16 + st1 {v6.8h, v7.8h}, [x6], #32 .endm store2 v31.8h, v23.8h, \shift @@ -2139,7 +2105,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 @@ -2152,7 +2118,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon neg x9, x8 mov x10, x6 @@ -2216,7 +2182,7 @@ const eob_8x32 .short 43, 107, 171, 256 endconst -function inv_txfm_add_identity_identity_32x32_neon, export=1 +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_32x32 @@ -2259,7 +2225,7 @@ endfunc .endm .macro def_identity_1632 w, h, wshort, hshort -function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 mov w16, #2896*8 mov w17, #2*(5793-4096)*8 dup v1.4h, w16 @@ -2285,7 +2251,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 .else // 32x16 shift_8_regs sqshl, 1 - identity_8x8 v1.h[1] + identity_8x8 v1.h[1] .endif transpose_8x8h v16, v17, v18, 
v19, v20, v21, v22, v23, v4, v5 @@ -2319,12 +2285,13 @@ def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h -function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_8x32 mov w8, #2*\h 1: + ldrh w12, [x13], #2 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v0.8h}, [x2], x8 @@ -2337,14 +2304,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + cmp w3, w12 .if \w == 8 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=3 .endif - ldrh w12, [x13], #2 - cmp w3, w12 b.lt 9f .if \w == 8 sub x2, x2, x8, lsl #3 @@ -2363,7 +2329,7 @@ endfunc def_identity_832 8, 32 def_identity_832 32, 8 -function inv_txfm_add_dct_dct_32x32_neon, export=1 +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 idct_dc 32, 32, 2 mov x15, x30 @@ -2411,14 +2377,14 @@ function inv_txfm_add_dct_dct_32x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_16x32_neon, export=1 +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 idct_dc 16, 32, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, sp, #(\i*16*2) @@ -2460,13 +2426,13 @@ function inv_txfm_add_dct_dct_16x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x16_neon, export=1 +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 idct_dc 32, 16, 1 mov x15, x30 sub sp, sp, #1024 - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8 add x6, sp, #(\i*32*2) @@ -2505,7 +2471,7 @@ function inv_txfm_add_dct_dct_32x16_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_8x32_neon, export=1 +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 idct_dc 8, 32, 2 mov x15, x30 @@ -2517,18 +2483,17 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 mov x8, #2*32 mov w9, #32 mov x6, sp - mov x7, x2 1: .irp i, 16, 17, 18, 19, 20, 21, 22, 23 - ld1 {v\i\().8h}, [x7] - st1 {v28.8h}, [x7], x8 + ld1 {v\i\().8h}, [x2] + st1 {v28.8h}, [x2], x8 .endr ldrh w12, [x13], #2 + sub x2, x2, x8, lsl #3 sub w9, w9, #8 - sub x7, x7, x8, lsl #3 - add x7, x7, #2*8 + add x2, x2, #2*8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23 srshr v\i\().8h, v\i\().8h, #2 @@ -2536,10 +2501,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 cmp w3, w12 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23 - st1 {v\i\().8h}, [x6], #16 -.endr + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 b.ge 1b cbz w9, 3f @@ -2564,7 +2528,7 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x8_neon, export=1 +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 idct_dc 32, 8, 2 mov x15, x30 @@ -2586,7 +2550,7 @@ function inv_txfm_add_dct_dct_32x8_neon, export=1 .endr add w9, w9, #8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon cmp w9, #32 @@ -2791,7 +2755,7 @@ endfunc .endm .macro def_dct64_func suffix, clear=0, scale=0 -function inv_txfm_dct\suffix\()_8x64_neon +function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 mov x14, x30 mov x6, sp lsl x8, x8, #2 @@ -2804,7 +2768,7 @@ function inv_txfm_dct\suffix\()_8x64_neon add x7, x7, x8, lsr #1 scale_if 
\scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon store16 x6 @@ -2817,7 +2781,7 @@ function inv_txfm_dct\suffix\()_8x64_neon sub x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 @@ -3040,7 +3004,11 @@ endfunc .macro sub_sp space #ifdef _WIN32 -.if \space > 4096 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 sub x16, sp, #4096 ldr xzr, [x16] sub sp, x16, #(\space - 4096) @@ -3050,16 +3018,14 @@ endfunc #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 +.endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif -.else - sub sp, sp, #\space -.endif #endif .endm -function inv_txfm_add_dct_dct_64x64_neon, export=1 +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 @@ -3079,7 +3045,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3104,7 +3070,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3113,7 +3079,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_64x32_neon, export=1 +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 @@ -3133,7 +3099,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-1 // shift - bl inv_txfm_dct_clear_scale_8x64_neon + bl inv_txfm_dct_clear_scale_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3166,7 +3132,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x64_neon, export=1 +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 @@ -3207,7 +3173,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1 .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3216,7 +3182,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_64x16_neon, export=1 +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 @@ -3232,14 +3198,16 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f - ldrh w12, [x13], #2 .endif add x7, x2, #(\i*2) mov x8, #16*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon +.if \i < 8 + ldrh w12, [x13], #2 +.endif .endr b 3f @@ -3256,7 +3224,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 b.gt 2b 3: - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) @@ -3268,7 +3236,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_16x64_neon, export=1 +function 
inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 @@ -3279,7 +3247,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, x5, #(\i*16*2) .if \i > 0 @@ -3310,7 +3278,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1 .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S new file mode 100644 index 00000000000..266f57e36ee --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S @@ -0,0 +1,3526 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, +// int bitdepth_max); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro mul_mla d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mla \d\().4s, \s1\().4s, \c1 +.endm + +.macro mul_mls d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mls \d\().4s, \s1\().4s, \c1 +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , , 
\dst, \src + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src + load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src + load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src + load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src + load_add_store , , , , , v31.8h, v30.8h, \dst, \src + load_add_store , , , , , , v31.8h, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits + load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits + load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] 
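All of the load_add_store helpers in this area finish with the same round-add-clamp epilogue (srshr, sqadd, then smax/smin against zero and the 0x3ff constant loaded above). A rough per-pixel C sketch of that step, with an illustrative helper name and the 10-bit ceiling these macros hard-code:

    #include <stdint.h>

    /* Sketch of the per-pixel store epilogue: rounding-shift the transform
     * output, add it to the destination pixel, clamp to [0, pixel_max]
     * (0x3ff, i.e. 10 bits, in the macros above). Illustrative only. */
    static inline uint16_t add_coef_clamp(const uint16_t dst, const int32_t c,
                                          const int shift, const int pixel_max)
    {
        int v = dst + ((c + (1 << (shift - 1))) >> shift); // srshr + sqadd
        if (v < 0)         v = 0;                          // smax with #0
        if (v > pixel_max) v = pixel_max;                  // smin with max
        return (uint16_t)v;
    }
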
+.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src + load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src + load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src + load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src + load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src + load_add_store4 , , , , , , , , v30.d, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src + load_add_store4 , , , , , , , , v22.d, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v0.2s, w16 + sqrdmulh v20.4s, v16.4s, v0.s[0] + str wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v20.4s, v20.4s, v0.s[0] +.endif +.if \shift > 0 + sqrshrn v16.4h, v20.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift +.else + sqxtn v16.4h, v20.4s + sqxtn2 v16.8h, v20.4s +.endif + sqrdmulh v16.8h, v16.8h, v0.h[1] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + subs w4, w4, #4 + ld1 {v1.d}[1], [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sub x0, x0, x1, lsl #2 + sqadd v1.8h, v1.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h}, [x0], x1 + subs w4, w4, #4 + ld1 {v1.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v2.8h}, [x0], x1 + sqadd v1.8h, v1.8h, v16.8h + ld1 {v3.8h}, [x0], x1 + sqadd 
v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sub x0, x0, x1, lsl #2 + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + st1 {v0.8h}, [x0], x1 + smin v2.8h, v2.8h, v31.8h + st1 {v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h}, [x0], x1 + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, x1, lsl #1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + st1 {v0.8h, v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w32_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x1, x1, #64 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, #64 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v16.8h + sqadd v6.8h, v6.8h, v16.8h + sqadd v7.8h, v7.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smax v6.8h, v6.8h, v30.8h + smax v7.8h, v7.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + smin v6.8h, v6.8h, v31.8h + smin v7.8h, v7.8h, v31.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4s, v16.4s, v17.4s + sub v21.4s, v18.4s, v19.4s + sub v20.4s, v16.4s, v21.4s + sshr v20.4s, v20.4s, #1 + sub v18.4s, v20.4s, v17.4s + sub v17.4s, v20.4s, v19.4s + add v19.4s, v21.4s, v18.4s + sub v16.4s, v16.4s, v17.4s +.endm + +.macro idct_4 r0, r1, r2, r3 + mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] + mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] + srshr v6.4s, v6.4s, #12 + srshr v7.4s, v4.4s, #12 + srshr v2.4s, v2.4s, #12 + srshr v3.4s, v3.4s, #12 + sqadd \r0\().4s, v2.4s, v6.4s + sqsub \r3\().4s, v2.4s, v6.4s + sqadd \r1\().4s, v3.4s, v7.4s + sqsub \r2\().4s, v3.4s, v7.4s +.endm + +function inv_dct_4s_x4_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] + idct_4 v16, v17, v18, v19 + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.4s}, [x16] + + sub v3.4s, v16.4s, v18.4s + mul 
v4.4s, v16.4s, v0.s[0] + mla v4.4s, v18.4s, v0.s[1] + mla v4.4s, v19.4s, v0.s[2] + mul v7.4s, v17.4s, v0.s[3] + add v3.4s, v3.4s, v19.4s + mul v5.4s, v16.4s, v0.s[2] + mls v5.4s, v18.4s, v0.s[0] + mls v5.4s, v19.4s, v0.s[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[3] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + srshr \o0\().4s, \o0\().4s, #12 + srshr \o2\().4s, \o2\().4s, #12 + srshr \o1\().4s, \o1\().4s, #12 + srshr \o3\().4s, \o3\().4s, #12 +.endm + +function inv_adst_4s_x4_neon + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x4_neon + movz w16, #(5793-4096)*8, lsl #16 + dup v0.2s, w16 + sqrdmulh v4.4s, v16.4s, v0.s[0] + sqrdmulh v5.4s, v17.4s, v0.s[0] + sqrdmulh v6.4s, v18.4s, v0.s[0] + sqrdmulh v7.4s, v19.4s, v0.s[0] + sqadd v16.4s, v16.4s, v4.4s + sqadd v17.4s, v17.4s, v5.4s + sqadd v18.4s, v18.4s, v6.4s + sqadd v19.4s, v19.4s, v7.4s + ret +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + mov x15, x30 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + sshr v16.4s, v16.4s, #2 + sshr v17.4s, v17.4s, #2 + sshr v18.4s, v18.4s, #2 + sshr v19.4s, v19.4s, #2 + + iwht4 + + st1 {v30.4s, v31.4s}, [x2], #32 + transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.d}[0], [x0], x1 + sqxtn v16.4h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqxtn2 v16.8h, v17.4s + ld1 {v1.d}[0], [x0], x1 + sqxtn v18.4h, v18.4s + ld1 {v1.d}[1], [x0], x1 + sqxtn2 v18.8h, v19.4s + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + blr x4 + + st1 {v30.4s, v31.4s}, [x2], #32 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.d}[0], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x0, x0, x1, lsl #2 + sqadd v16.8h, v16.8h, v0.8h + sqadd v18.8h, v18.8h, v1.8h + smax v16.8h, v16.8h, v30.8h + smax v18.8h, v18.8h, v30.8h + smin v16.8h, v16.8h, v31.8h + st1 {v16.d}[0], [x0], x1 + smin v18.8h, v18.8h, v31.8h + st1 {v16.d}[1], [x0], x1 + st1 {v18.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + + br x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v4.2s, w16 + str wzr, [x2] + sqrdmulh v16.4s, v16.4s, v4.s[0] + ld1 {v0.d}[0], [x0], x1 + sqxtn v20.4h, v16.4s + sqxtn2 v20.8h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqrdmulh v20.8h, v20.8h, v4.h[1] + ld1 {v1.d}[0], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.d}[1], [x0], x1 + srshr v18.8h, v20.8h, #4 + movi v30.8h, #0 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4s_x4_neon + movrel x5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct 
+def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4 \r0, \r2, \r4, \r6 + + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a + mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a + mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a + srshr \r1\().4s, v2.4s, #12 // t4a + srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r3\().4s, v6.4s, #12 // t5a + srshr \r5\().4s, v7.4s, #12 // taa + + sqadd v2.4s, \r1\().4s, \r3\().4s // t4 + sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a + sqadd v3.4s, \r7\().4s, \r5\().4s // t7 + sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a + + mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 + mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 + srshr v4.4s, v4.4s, #12 // t5 + srshr v5.4s, v6.4s, #12 // t6 + + sqsub \r7\().4s, \r0\().4s, v3.4s // out7 + sqadd \r0\().4s, \r0\().4s, v3.4s // out0 + sqadd \r1\().4s, \r2\().4s, v5.4s // out1 + sqsub v6.4s, \r2\().4s, v5.4s // out6 + sqadd \r2\().4s, \r4\().4s, v4.4s // out2 + sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r3\().4s, \r6\().4s, v2.4s // out3 + sqsub \r4\().4s, \r6\().4s, v2.4s // out4 + mov \r6\().16b, v6.16b // out6 +.endm + +function inv_dct_4s_x8_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 + movrel x16, iadst8_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v23, v16, v0.s[0], v0.s[1] + mul_mls v4, v23, v16, v0.s[1], v0.s[0] + mul_mla v6, v21, v18, v0.s[2], v0.s[3] + srshr v16.4s, v2.4s, #12 // t0a + srshr v23.4s, v4.4s, #12 // t1a + mul_mls v2, v21, v18, v0.s[3], v0.s[2] + mul_mla v4, v19, v20, v1.s[0], v1.s[1] + srshr v18.4s, v6.4s, #12 // t2a + srshr v21.4s, v2.4s, #12 // t3a + mul_mls v6, v19, v20, v1.s[1], v1.s[0] + mul_mla v2, v17, v22, v1.s[2], v1.s[3] + srshr v20.4s, v4.4s, #12 // t4a + srshr v19.4s, v6.4s, #12 // t5a + mul_mls v4, v17, v22, v1.s[3], v1.s[2] + srshr v22.4s, v2.4s, #12 // t6a + srshr v17.4s, v4.4s, #12 // t7a + + ld1 {v0.4s}, [x16] + + sqadd v2.4s, v16.4s, v20.4s // t0 + sqsub v3.4s, v16.4s, v20.4s // t4 + sqadd v4.4s, v23.4s, v19.4s // t1 + sqsub v5.4s, v23.4s, v19.4s // t5 + sqadd v6.4s, v18.4s, v22.4s // t2 + sqsub v7.4s, v18.4s, v22.4s // t6 + sqadd v18.4s, v21.4s, v17.4s // t3 + sqsub v19.4s, v21.4s, v17.4s // t7 + + mul_mla v16, v3, v5, v0.s[3], v0.s[2] + mul_mls v20, v3, v5, v0.s[2], v0.s[3] + mul_mls v22, v19, v7, v0.s[3], v0.s[2] + + srshr v3.4s, v16.4s, #12 // t4a + srshr v5.4s, v20.4s, #12 // t5a + + mul_mla v16, v19, v7, v0.s[2], v0.s[3] + + srshr v7.4s, v22.4s, #12 // t6a + srshr v19.4s, v16.4s, #12 // t7a + + sqadd \o0\().4s, v2.4s, v6.4s // out0 + sqsub v2.4s, v2.4s, v6.4s // t2 + sqadd \o7\().4s, v4.4s, v18.4s // out7 + sqsub v4.4s, v4.4s, v18.4s // t3 + sqneg \o7\().4s, \o7\().4s // out7 + + sqadd \o1\().4s, v3.4s, v7.4s // out1 + sqsub v3.4s, v3.4s, v7.4s // t6 + sqadd \o6\().4s, v5.4s, v19.4s // out6 + sqsub v5.4s, v5.4s, v19.4s // t7 + sqneg \o1\().4s, \o1\().4s // out1 + + mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) + mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) + mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) + srshr v2.4s, v18.4s, #12 // out3 + mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) + srshr v3.4s, v20.4s, #12 // out5 
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) + srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) + + sqneg \o3\().4s, v2.4s // out3 + sqneg \o5\().4s, v3.4s // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +function inv_flipadst_4s_x8_neon + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x8_neon + sqshl v16.4s, v16.4s, #1 + sqshl v17.4s, v17.4s, #1 + sqshl v18.4s, v18.4s, #1 + sqshl v19.4s, v19.4s, #1 + sqshl v20.4s, v20.4s, #1 + sqshl v21.4s, v21.4s, #1 + sqshl v22.4s, v22.4s, #1 + sqshl v23.4s, v23.4s, #1 + ret +endfunc + +function inv_txfm_add_8x8_neon + movi v31.4s, #0 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + + blr x5 + + load_add_store_8x8 x0, x7 + br x15 +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + movrel x5, X(inv_\txfm2\()_8h_x8_neon) + mov w13, #\eob_half + adr x4, inv_\txfm1\()_4s_x8_neon + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + sqxtn v20.4h, v20.4s + sqxtn v21.4h, v21.4s + sqxtn v22.4h, v22.4s + sqxtn v23.4h, v23.4s + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + br x15 +endfunc + +function inv_txfm_add_4x8_neon + movz 
w16, #2896*8, lsl #16 + movi v31.4s, #0 + dup v30.2s, w16 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v20.4h, v16.4s + sqxtn v21.4h, v17.4s + sqxtn v22.4h, v18.4s + sqxtn v23.4h, v19.4s + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f + +1: +.irp i, v20, v21, v22, v23 + movi \i\().4h, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x8 x0, x7 + br x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov w13, #\eob_half +.endif + movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +function inv_dct_4s_x16_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #32 + + mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a + mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a + srshr v17.4s, v2.4s, #12 // t8a + srshr v31.4s, v4.4s, #12 // t15a + mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a + mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + srshr v23.4s, v6.4s, #12 // t9a + srshr v25.4s, v2.4s, #12 // t14a + mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a + mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a + srshr v21.4s, v4.4s, #12 // t10a + srshr v27.4s, v6.4s, #12 // t13a + mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + srshr v19.4s, v2.4s, #12 // t11a + srshr v29.4s, v4.4s, #12 // t12a + + ld1 {v0.4s}, [x16] + + sqsub v2.4s, v17.4s, v23.4s // t9 + sqadd v17.4s, v17.4s, v23.4s // t8 + sqsub v3.4s, v31.4s, v25.4s // t14 + sqadd v31.4s, v31.4s, v25.4s // t15 + sqsub v23.4s, v19.4s, v21.4s // t10 + sqadd v19.4s, v19.4s, v21.4s // t11 + sqadd v25.4s, v29.4s, v27.4s // t12 + sqsub v29.4s, v29.4s, v27.4s // t13 + + mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a + mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a + srshr v21.4s, v4.4s, #12 // t9a + srshr v27.4s, v6.4s, #12 // t14a + + mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a + srshr v29.4s, v4.4s, #12 // t13a + neg v6.4s, v6.4s + srshr v23.4s, v6.4s, #12 // t10a + + sqsub v2.4s, v17.4s, v19.4s // t11a + sqadd v17.4s, v17.4s, v19.4s // t8a + sqsub 
v3.4s, v31.4s, v25.4s // t12a + sqadd v31.4s, v31.4s, v25.4s // t15a + sqadd v19.4s, v21.4s, v23.4s // t9 + sqsub v21.4s, v21.4s, v23.4s // t10 + sqsub v25.4s, v27.4s, v29.4s // t13 + sqadd v27.4s, v27.4s, v29.4s // t14 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 + mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a + + srshr v4.4s, v4.4s, #12 // t11 + srshr v5.4s, v6.4s, #12 // t12 + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a + srshr v2.4s, v2.4s, #12 // t10a + srshr v3.4s, v6.4s, #12 // t13a + + sqadd v6.4s, v16.4s, v31.4s // out0 + sqsub v31.4s, v16.4s, v31.4s // out15 + mov v16.16b, v6.16b + sqadd v23.4s, v30.4s, v17.4s // out7 + sqsub v7.4s, v30.4s, v17.4s // out8 + sqadd v17.4s, v18.4s, v27.4s // out1 + sqsub v30.4s, v18.4s, v27.4s // out14 + sqadd v18.4s, v20.4s, v3.4s // out2 + sqsub v29.4s, v20.4s, v3.4s // out13 + sqadd v3.4s, v28.4s, v19.4s // out6 + sqsub v25.4s, v28.4s, v19.4s // out9 + sqadd v19.4s, v22.4s, v5.4s // out3 + sqsub v28.4s, v22.4s, v5.4s // out12 + sqadd v20.4s, v24.4s, v4.4s // out4 + sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v21.4s, v26.4s, v2.4s // out5 + sqsub v26.4s, v26.4s, v2.4s // out10 + mov v24.16b, v7.16b + mov v22.16b, v3.16b + + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel x16, iadst16_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 + mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 + srshr v16.4s, v2.4s, #12 // t0 + srshr v31.4s, v4.4s, #12 // t1 + mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 + mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 + srshr v18.4s, v6.4s, #12 // t2 + srshr v29.4s, v2.4s, #12 // t3 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 + mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 + srshr v20.4s, v4.4s, #12 // t4 + srshr v27.4s, v6.4s, #12 // t5 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 + ld1 {v0.4s, v1.4s}, [x16] + movrel x16, idct_coeffs + mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 + srshr v22.4s, v2.4s, #12 // t6 + srshr v25.4s, v4.4s, #12 // t7 + mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 + mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 + srshr v23.4s, v6.4s, #12 // t8 + srshr v24.4s, v2.4s, #12 // t9 + mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 + mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 + srshr v21.4s, v4.4s, #12 // t10 + srshr v26.4s, v6.4s, #12 // t11 + mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 + mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 + srshr v19.4s, v2.4s, #12 // t12 + srshr v28.4s, v4.4s, #12 // t13 + mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 + srshr v17.4s, v6.4s, #12 // t14 + srshr v30.4s, v2.4s, #12 // t15 + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v23.4s // t8a + sqadd v16.4s, v16.4s, v23.4s // t0a + sqsub v3.4s, v31.4s, v24.4s // t9a + sqadd v31.4s, v31.4s, v24.4s // t1a + sqadd v23.4s, v18.4s, v21.4s // t2a + sqsub v18.4s, v18.4s, v21.4s // t10a + sqadd v24.4s, v29.4s, v26.4s // t3a + sqsub v29.4s, v29.4s, v26.4s // t11a + sqadd v21.4s, v20.4s, v19.4s // t4a + sqsub v20.4s, v20.4s, v19.4s // t12a + sqadd v26.4s, v27.4s, v28.4s // t5a + sqsub v27.4s, v27.4s, v28.4s // t13a + sqadd v19.4s, v22.4s, v17.4s // t6a + sqsub v22.4s, v22.4s, v17.4s // t14a + sqadd v28.4s, v25.4s, v30.4s // t7a + sqsub v25.4s, v25.4s, v30.4s // t15a + + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 + mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 + 
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 + srshr v17.4s, v4.4s, #12 // t8 + srshr v30.4s, v6.4s, #12 // t9 + mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 + srshr v18.4s, v2.4s, #12 // t10 + srshr v29.4s, v4.4s, #12 // t11 + mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 + srshr v27.4s, v6.4s, #12 // t12 + srshr v20.4s, v2.4s, #12 // t13 + mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 + srshr v25.4s, v4.4s, #12 // t14 + srshr v22.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t4 + sqadd v16.4s, v16.4s, v21.4s // t0 + sqsub v3.4s, v31.4s, v26.4s // t5 + sqadd v31.4s, v31.4s, v26.4s // t1 + sqadd v21.4s, v23.4s, v19.4s // t2 + sqsub v23.4s, v23.4s, v19.4s // t6 + sqadd v26.4s, v24.4s, v28.4s // t3 + sqsub v24.4s, v24.4s, v28.4s // t7 + sqadd v19.4s, v17.4s, v27.4s // t8a + sqsub v17.4s, v17.4s, v27.4s // t12a + sqadd v28.4s, v30.4s, v20.4s // t9a + sqsub v30.4s, v30.4s, v20.4s // t13a + sqadd v27.4s, v18.4s, v25.4s // t10a + sqsub v18.4s, v18.4s, v25.4s // t14a + sqadd v20.4s, v29.4s, v22.4s // t11a + sqsub v29.4s, v29.4s, v22.4s // t15a + + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a + mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a + mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a + srshr v22.4s, v4.4s, #12 // t4a + srshr v25.4s, v6.4s, #12 // t5a + mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a + mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 + srshr v24.4s, v2.4s, #12 // t6a + srshr v23.4s, v4.4s, #12 // t7a + mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 + mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 + srshr v17.4s, v6.4s, #12 // t12 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 + srshr v29.4s, v2.4s, #12 // t13 + srshr v30.4s, v4.4s, #12 // t14 + srshr v18.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t2a +.ifc \o0, v16 + sqadd \o0\().4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 +.else + sqadd v4.4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 + mov \o0\().16b, v4.16b +.endif + sqneg \o15\().4s, \o15\().4s // out15 + + sqsub v3.4s, v29.4s, v18.4s // t15a + sqadd \o13\().4s, v29.4s, v18.4s // out13 + sqadd \o2\().4s, v17.4s, v30.4s // out2 + sqsub v26.4s, v17.4s, v30.4s // t14a + sqneg \o13\().4s, \o13\().4s // out13 + + sqadd \o1\().4s, v19.4s, v27.4s // out1 + sqsub v27.4s, v19.4s, v27.4s // t10 + sqadd \o14\().4s, v28.4s, v20.4s // out14 + sqsub v20.4s, v28.4s, v20.4s // t11 + sqneg \o1\().4s, \o1\().4s // out1 + + sqadd \o3\().4s, v22.4s, v24.4s // out3 + sqsub v22.4s, v22.4s, v24.4s // t6 + sqadd \o12\().4s, v25.4s, v23.4s // out12 + sqsub v23.4s, v25.4s, v23.4s // t7 + sqneg \o3\().4s, \o3\().4s // out3 + + mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) + mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) + mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) + + srshr v24.4s, v24.4s, #12 // out8 + srshr v4.4s, v4.4s, #12 // out7 + srshr v5.4s, v6.4s, #12 // out5 + mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) + mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) + srshr v26.4s, v6.4s, #12 // out10 + + mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) + mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) + mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) + + srshr \o4\().4s, v2.4s, #12 // out4 + 
srshr v6.4s, v6.4s, #12 // out11 + srshr v7.4s, v21.4s, #12 // out9 + srshr \o6\().4s, v22.4s, #12 // out6 + +.ifc \o8, v23 + mov \o8\().16b, v24.16b + mov \o10\().16b, v26.16b +.endif + + sqneg \o7\().4s, v4.4s // out7 + sqneg \o5\().4s, v5.4s // out5 + sqneg \o11\().4s, v6.4s // out11 + sqneg \o9\().4s, v7.4s // out9 +.endm + +function inv_adst_4s_x16_neon + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + ret +endfunc + +function inv_flipadst_4s_x16_neon + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x16_neon + movz w16, #2*(5793-4096)*8, lsl #16 + dup v0.2s, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4s, v\i\().4s, v0.s[0] + sqadd v\i\().4s, v\i\().4s, v\i\().4s + sqadd v\i\().4s, v\i\().4s, v2.4s +.endr + ret +endfunc + +.macro identity_4x16_shift1 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + srshr v3.4s, v3.4s, #1 + sqadd \i, \i, v3.4s +.endr +.endm + +.macro identity_4x16 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v3.4s +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + mov x14, x30 + movi v7.4s, #0 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + blr x4 + sqrshrn v16.4h, v16.4s, #\shift + sqrshrn v17.4h, v17.4s, #\shift + sqrshrn v18.4h, v18.4s, #\shift + sqrshrn v19.4h, v19.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift + sqrshrn2 v17.8h, v21.4s, #\shift + sqrshrn2 v18.8h, v22.4s, #\shift + sqrshrn2 v19.8h, v23.4s, #\shift + sqrshrn v20.4h, v24.4s, #\shift + sqrshrn v21.4h, v25.4s, #\shift + sqrshrn v22.4h, v26.4s, #\shift + sqrshrn v23.4h, v27.4s, #\shift + sqrshrn2 v20.8h, v28.4s, #\shift + sqrshrn2 v21.8h, v29.4s, #\shift + sqrshrn2 v22.8h, v30.4s, #\shift + sqrshrn2 v23.8h, v31.4s, #\shift + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 + +.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h + st1 {\i}, [x6], #16 +.endr + + br x14 +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + br x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 + ldrh w12, [x13], #2 +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*16*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) 
+ add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + br x15 +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + adr x4, inv_\txfm1\()_4s_x16_neon + movrel x5, X(inv_\txfm2\()_8h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_16x16 +.else + movrel x13, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_16x16_identity +.else + movrel x13, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + mov x15, x30 + movi v4.4s, #0 + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], #16 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + sqrshrn v16.4h, v24.4s, #1 + sqrshrn v17.4h, v25.4s, #1 + sqrshrn v18.4h, v26.4s, #1 + sqrshrn v19.4h, v27.4s, #1 + sqrshrn2 v16.8h, v28.4s, #1 + sqrshrn2 v17.8h, v29.4s, #1 + sqrshrn2 v18.8h, v30.4s, #1 + sqrshrn2 v19.8h, v31.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #16 + load_add_store_8x4 x6, x7 + + br x15 +endfunc + +function inv_txfm_add_4x16_neon + ldrh w12, [x13, #4] + mov x15, x30 + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v28.4h, v16.4s, #1 + rshrn v29.4h, v17.4s, #1 + rshrn v30.4h, v18.4s, #1 + rshrn v31.4h, v19.4s, #1 + transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 + + b 2f +1: +.irp i, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v24.4h, v16.4s, #1 + rshrn v25.4h, v17.4s, #1 + rshrn v26.4h, v18.4s, #1 + rshrn v27.4h, v19.4s, #1 + transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v20.4h, v16.4s, #1 + rshrn v21.4h, v17.4s, #1 + rshrn v22.4h, v18.4s, #1 + rshrn v23.4h, v19.4s, #1 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f +1: +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + movi \i, #0 +.endr +2: + + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v2.4s}, [x2], x11 +.endr + blr x4 + rshrn v16.4h, v16.4s, #1 + rshrn v17.4h, v17.4s, #1 + rshrn 
v18.4h, v18.4s, #1 + rshrn v19.4h, v19.4s, #1 + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x16 x0, x6 + + br x15 +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_4x16 +.else + movrel x13, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_4x16_identity2 +.else + movrel x13, eob_4x16 +.endif +.endif +.else + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct +def_fn_416 \w, \h, identity, identity +def_fn_416 \w, \h, dct, adst +def_fn_416 \w, \h, dct, flipadst +def_fn_416 \w, \h, dct, identity +def_fn_416 \w, \h, adst, dct +def_fn_416 \w, \h, adst, adst +def_fn_416 \w, \h, adst, flipadst +def_fn_416 \w, \h, flipadst, dct +def_fn_416 \w, \h, flipadst, adst +def_fn_416 \w, \h, flipadst, flipadst +def_fn_416 \w, \h, identity, dct +def_fn_416 \w, \h, adst, identity +def_fn_416 \w, \h, flipadst, identity +def_fn_416 \w, \h, identity, adst +def_fn_416 \w, \h, identity, flipadst +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +function inv_txfm_add_16x8_neon + mov x15, x30 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + sqrshrn v12.4h, v24.4s, #1 + sqrshrn v13.4h, v25.4s, #1 + sqrshrn v14.4h, v26.4s, #1 + sqrshrn v15.4h, v27.4s, #1 + sqrshrn2 v12.8h, v28.4s, #1 + sqrshrn2 v13.8h, v29.4s, #1 + sqrshrn2 v14.8h, v30.4s, #1 + sqrshrn2 v15.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 + + b 2f +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h + movi \i, #0 +.endr +2: + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + movi v4.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + mov 
v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + sqrshrn v8.4h, v24.4s, #1 + sqrshrn v9.4h, v25.4s, #1 + sqrshrn v10.4h, v26.4s, #1 + sqrshrn v11.4h, v27.4s, #1 + sqrshrn2 v8.8h, v28.4s, #1 + sqrshrn2 v9.8h, v29.4s, #1 + sqrshrn2 v10.8h, v30.4s, #1 + sqrshrn2 v11.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + + mov v16.16b, v8.16b + mov v17.16b, v9.16b + mov v18.16b, v10.16b + mov v19.16b, v11.16b + mov v20.16b, v12.16b + mov v21.16b, v13.16b + mov v22.16b, v14.16b + mov v23.16b, v15.16b + + blr x5 + + add x0, x0, #16 + load_add_store_8x8 x0, x7 + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x15 +endfunc + +function inv_txfm_add_8x16_neon + mov x15, x30 + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] + ldrh w12, [x13, #4] + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + sqrshrn2 v28.8h, v20.4s, #1 + sqrshrn2 v29.8h, v21.4s, #1 + sqrshrn2 v30.8h, v22.4s, #1 + sqrshrn2 v31.8h, v23.4s, #1 + transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h + movi \i, #0 +.endr + +2: + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, 
v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + blr x5 + + load_add_store_8x16 x0, x6 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x20 + + br x15 +endfunc + +const eob_8x16 + .short 10, 43, 75, 128 +endconst + +const eob_8x16_identity1 + .short 4, 64, 96, 128 +endconst + +const eob_8x16_identity2 + .short 4, 8, 12, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_8x16 +.else + movrel x13, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_8x16_identity2 +.else + movrel x13, eob_8x16 +.endif +.endif +.if \h == 8 + ldrh w13, [x13] +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4s_x16_neon + movrel x16, idct_coeffs, 4*16 + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a + mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a + mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a + srshr v16.4s, v2.4s, #12 // t16a + srshr v31.4s, v4.4s, #12 // t31a + mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a + mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a + srshr v24.4s, v6.4s, #12 // t17a + srshr v23.4s, v2.4s, #12 // t30a + mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a + mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a + srshr v20.4s, v4.4s, #12 // t18a + srshr v27.4s, v6.4s, #12 // t29a + mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #4*24 + mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a + srshr v28.4s, v2.4s, #12 // t19a + srshr v19.4s, v4.4s, #12 // t28a + mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a + mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a + srshr v18.4s, v6.4s, #12 // t20a + srshr v29.4s, v2.4s, #12 // t27a + mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a + mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a + srshr v26.4s, v4.4s, #12 // t21a + srshr v21.4s, v6.4s, #12 // t26a + mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a + mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a + srshr v22.4s, v2.4s, #12 // t22a + srshr v25.4s, v4.4s, #12 // t25a + mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a + srshr v30.4s, v6.4s, #12 // t23a + srshr v17.4s, v2.4s, #12 // t24a + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v24.4s // t17 + sqadd v16.4s, v16.4s, v24.4s // t16 + sqsub v3.4s, v31.4s, v23.4s // t30 + sqadd v31.4s, v31.4s, v23.4s // t31 + sqsub v24.4s, v28.4s, v20.4s // t18 + sqadd v28.4s, v28.4s, v20.4s // t19 + sqadd v23.4s, v18.4s, v26.4s // t20 + sqsub v18.4s, v18.4s, v26.4s // t21 + sqsub v20.4s, v30.4s, 
v22.4s // t22 + sqadd v30.4s, v30.4s, v22.4s // t23 + sqadd v26.4s, v17.4s, v25.4s // t24 + sqsub v17.4s, v17.4s, v25.4s // t25 + sqsub v22.4s, v29.4s, v21.4s // t26 + sqadd v29.4s, v29.4s, v21.4s // t27 + sqadd v25.4s, v19.4s, v27.4s // t28 + sqsub v19.4s, v19.4s, v27.4s // t29 + + mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a + mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a + mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a + srshr v21.4s, v4.4s, #12 // t17a + srshr v27.4s, v6.4s, #12 // t30a + neg v2.4s, v2.4s // -> t18a + mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a + srshr v19.4s, v2.4s, #12 // t18a + srshr v24.4s, v4.4s, #12 // t29a + mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a + mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + srshr v22.4s, v6.4s, #12 // t21a + srshr v18.4s, v2.4s, #12 // t26a + neg v4.4s, v4.4s // -> t22a + mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a + srshr v17.4s, v4.4s, #12 // t22a + srshr v20.4s, v6.4s, #12 // t25a + + sqsub v2.4s, v27.4s, v24.4s // t29 + sqadd v27.4s, v27.4s, v24.4s // t30 + sqsub v3.4s, v21.4s, v19.4s // t18 + sqadd v21.4s, v21.4s, v19.4s // t17 + sqsub v24.4s, v16.4s, v28.4s // t19a + sqadd v16.4s, v16.4s, v28.4s // t16a + sqsub v19.4s, v30.4s, v23.4s // t20a + sqadd v30.4s, v30.4s, v23.4s // t23a + sqsub v28.4s, v17.4s, v22.4s // t21 + sqadd v17.4s, v17.4s, v22.4s // t22 + sqadd v23.4s, v26.4s, v29.4s // t24a + sqsub v26.4s, v26.4s, v29.4s // t27a + sqadd v22.4s, v20.4s, v18.4s // t25 + sqsub v20.4s, v20.4s, v18.4s // t26 + sqsub v29.4s, v31.4s, v25.4s // t28a + sqadd v31.4s, v31.4s, v25.4s // t31a + + mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a + mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a + mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 + srshr v18.4s, v4.4s, #12 // t18a + srshr v25.4s, v6.4s, #12 // t29a + mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 + srshr v29.4s, v2.4s, #12 // t19 + srshr v24.4s, v4.4s, #12 // t28 + neg v6.4s, v6.4s // -> t20 + mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 + mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + srshr v26.4s, v6.4s, #12 // t20 + srshr v19.4s, v2.4s, #12 // t27 + neg v4.4s, v4.4s // -> t21a + mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a + srshr v20.4s, v4.4s, #12 // t21a + srshr v28.4s, v6.4s, #12 // t26a + + sqsub v2.4s, v16.4s, v30.4s // t23 + sqadd v16.4s, v16.4s, v30.4s // t16 = out16 + sqsub v3.4s, v31.4s, v23.4s // t24 + sqadd v31.4s, v31.4s, v23.4s // t31 = out31 + sqsub v23.4s, v21.4s, v17.4s // t22a + sqadd v17.4s, v21.4s, v17.4s // t17a = out17 + sqadd v30.4s, v27.4s, v22.4s // t30a = out30 + sqsub v21.4s, v27.4s, v22.4s // t25a + sqsub v27.4s, v18.4s, v20.4s // t21 + sqadd v18.4s, v18.4s, v20.4s // t18 = out18 + sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqsub v26.4s, v29.4s, v26.4s // t20a + sqadd v29.4s, v25.4s, v28.4s // t29 = out29 + sqsub v25.4s, v25.4s, v28.4s // t26 + sqadd v28.4s, v24.4s, v19.4s // t28a = out28 + sqsub v24.4s, v24.4s, v19.4s // t27a + mov v19.16b, v4.16b // out19 + + mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 + mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 + srshr v20.4s, v4.4s, #12 // t20 + srshr v22.4s, v6.4s, #12 // t27 + + mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a + mov v27.16b, v22.16b // t27 + srshr v26.4s, v4.4s, #12 // t26a + + mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 + mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + 
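        // Illustrative aside (editorial, not part of the upstream patch): in
        // this last stage both multiplier lanes are v0.s[0], loaded from
        // idct_coeffs, which appears to hold 2896 (~ 4096/sqrt(2)). Each
        // mul_mls/mul_mla pair plus srshr #12 then evaluates roughly
        //   (a - b) * 2896 >> 12   and   (a + b) * 2896 >> 12,
        // i.e. the usual (a - b)/sqrt(2) and (a + b)/sqrt(2) half-butterflies
        // in Q12 arithmetic.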
srshr v21.4s, v6.4s, #12 // t21a + srshr v22.4s, v24.4s, #12 // t22 + srshr v25.4s, v4.4s, #12 // t25 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a + srshr v23.4s, v4.4s, #12 // t23a + srshr v24.4s, v6.4s, #12 // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + mov x14, x30 + movi v7.4s, #0 + lsl x8, x8, #1 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_4s_x16_neon + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 + transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 + +.macro store1 r0, r1, r2, r3 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + st1 {\r2}, [x6], #16 + st1 {\r3}, [x6], #16 +.endm + store1 v16.4s, v20.4s, v24.4s, v28.4s + store1 v17.4s, v21.4s, v25.4s, v29.4s + store1 v18.4s, v22.4s, v26.4s, v30.4s + store1 v19.4s, v23.4s, v27.4s, v31.4s +.purgem store1 + sub x6, x6, #64*4 + + movi v7.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.s[1] + scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_4s_x16_neon + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 + transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 +.macro store2 r0, r1, r2, r3, shift + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] + sqsub v4.4s, v0.4s, \r0 + sqadd v0.4s, v0.4s, \r0 + sqsub v5.4s, v1.4s, \r1 + sqadd v1.4s, v1.4s, \r1 + sqsub v6.4s, v2.4s, \r2 + sqadd v2.4s, v2.4s, \r2 + sqsub v7.4s, v3.4s, \r3 + sqadd v3.4s, v3.4s, \r3 + sqrshrn v0.4h, v0.4s, #\shift + sqrshrn2 v0.8h, v1.4s, #\shift + sqrshrn v1.4h, v2.4s, #\shift + sqrshrn2 v1.8h, v3.4s, #\shift + sqrshrn v2.4h, v7.4s, #\shift + sqrshrn2 v2.8h, v6.4s, #\shift + sqrshrn v3.4h, v5.4s, #\shift + sqrshrn2 v3.8h, v4.4s, #\shift + st1 {v0.8h, v1.8h}, [x6], #32 + rev64 v2.8h, v2.8h + rev64 v3.8h, v3.8h + st1 {v2.8h, v3.8h}, [x6], #32 +.endm + + store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift + store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift + store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift + store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift +.purgem store2 + br x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl X(inv_dct_8h_x16_neon) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl X(inv_dct32_odd_8h_x16_neon) + + neg x9, x8 + mov x10, x6 + movi v0.8h, #0 + mvni v1.8h, #0xfc, lsl #8 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8h}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8h}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8h}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + sqadd v5.8h, v5.8h, v2.8h + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + smax v2.8h, v5.8h, v0.8h + ld1 {v5.8h}, [x7], \stride + sqadd v6.8h, v6.8h, v3.8h + smin v2.8h, v2.8h, v1.8h + srshr v7.8h, v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8h}, [x6], x1 + ld1 {v2.8h}, [x10], x1 + smax v3.8h, v6.8h, v0.8h + sqadd v7.8h, v7.8h, v4.8h + smin v3.8h, v3.8h, v1.8h + srshr v5.8h, v5.8h, #4 + st1 {v3.8h}, [x6], x1 + smax v4.8h, v7.8h, v0.8h + sqadd v5.8h, v5.8h, v2.8h + smin v4.8h, v4.8h, v1.8h + st1 {v4.8h}, [x6], x1 + smax v2.8h, v5.8h, v0.8h + smin v2.8h, v2.8h, v1.8h + st1 {v2.8h}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + br x14 +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + .short 10, 43, 75, 107, 139, 171, 203, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + movi v0.8h, #0 + movi v1.8h, #0 + movrel x13, eob_32x32, 2 + + mov x8, #4*32 +1: + mov w9, #0 + movrel x12, eob_32x32, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc + +.macro shift_16_regs op, shift +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort 
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movz w16, #2896*8, lsl #16 + movz w17, #2*(5793-4096)*8, lsl #16 + movi v0.4s, #0 + movi v1.4s, #0 + movrel x13, eob_16x32\hshort, 2 + + mov x8, #4*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + dup v2.2s, w16 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + mov v2.s[1], w17 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + +.if \w == 16 + // 16x32 + identity_4x16_shift1 v2.s[1] +.else + // 32x16 + shift_16_regs sqshl, 1 + identity_4x16 v2.s[1] +.endif + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #16 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movi v0.4s, #0 + movi v1.4s, #0 + // Working on 8x8 blocks, read every other entry from eob_8x32 + movrel x13, eob_8x32, 2 + + mov w8, #4*\h +1: + // Working on 8x8 blocks, read every other entry from eob_8x32 + ldrh w12, [x13], #4 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + +.if \w == 8 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn2 v16.8h, v17.4s, #1 + sqrshrn v17.4h, v18.4s, #1 + sqrshrn2 v17.8h, v19.4s, #1 + sqrshrn v18.4h, v20.4s, #1 + sqrshrn2 v18.8h, v21.4s, #1 + sqrshrn v19.4h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + sqrshrn v20.4h, v24.4s, #1 + sqrshrn2 v20.8h, v25.4s, #1 + sqrshrn v21.4h, v26.4s, #1 + sqrshrn2 v21.8h, v27.4s, #1 + sqrshrn v22.4h, v28.4s, #1 + sqrshrn2 v22.8h, v29.4s, #1 + sqrshrn v23.4h, v30.4s, #1 + sqrshrn2 v23.8h, v31.4s, #1 +.else + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s 
+ sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #4*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_4s_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + movrel x13, eob_16x32 + movrel x5, X(inv_dct_8h_x16_neon) + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + mov x8, #4*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.4s, #0 + mov x8, #4*32 + mov w9, #32 + mov x6, sp + mov x7, x2 +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().4s}, [x7] + st1 {v28.4s}, [x7], x8 +.endr + ldrh w12, [x13], #2 + sub w9, w9, #4 + sub x7, x7, x8, lsl #3 + add x7, x7, #4*4 + + bl inv_dct_4s_x8_neon + + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + sqrshrn2 v16.8h, v20.4s, #2 + sqrshrn2 v17.8h, v21.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 + sqrshrn2 v19.8h, v23.4s, #2 + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, 
v5 + + cmp w3, w12 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #4 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + +.irp i, 0, 4 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + cmp w3, #10 + b.lt 1f +.endif + mov x8, #8*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + +2: + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9, lsl #1 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + br x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.4s, v1.4s}, [x17], #32 + + sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a + sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a + sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a + sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a + sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a + sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a + sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a + sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a + + ld1 {v0.4s}, [x17], #16 + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t33 + sqsub v26.4s, v19.4s, v18.4s // t34 + sqadd v27.4s, v19.4s, v18.4s // t35 + sqadd v28.4s, v20.4s, v21.4s // t60 + sqsub v29.4s, v20.4s, v21.4s // t61 + sqsub v30.4s, v23.4s, v22.4s // t62 + sqadd v31.4s, v23.4s, v22.4s // t63 + + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a + mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + neg v2.4s, v2.4s // t34a + mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a + srshr v26.4s, v2.4s, #12 // t34a + mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a + srshr v29.4s, v4.4s, #12 // t61a + srshr v25.4s, v6.4s, #12 // t33a + srshr v30.4s, v2.4s, #12 // t62a + + sqadd v16.4s, v24.4s, v27.4s // t32a + sqsub v19.4s, v24.4s, v27.4s // t35a + sqadd v17.4s, v25.4s, v26.4s // t33 + sqsub v18.4s, v25.4s, v26.4s // t34 + sqsub v20.4s, v31.4s, v28.4s // t60a + sqadd v23.4s, v31.4s, v28.4s // t63a + sqsub v21.4s, v30.4s, v29.4s // t61 + sqadd v22.4s, v30.4s, v29.4s // t62 + + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a + mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 + srshr v21.4s, v2.4s, #12 // t61a + srshr v18.4s, v4.4s, #12 // t34a + mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 + srshr v20.4s, v6.4s, #12 // t60 + srshr v19.4s, v2.4s, #12 // t35 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #4*4*0] // t32a + ldr q17, [x9, #4*4*8] // t39a + ldr q18, [x9, #4*4*0] // t63a + ldr q19, [x6, #4*4*8] // t56a + ldr q20, [x6, 
#4*4*16] // t40a + ldr q21, [x9, #4*4*24] // t47a + ldr q22, [x9, #4*4*16] // t55a + ldr q23, [x6, #4*4*24] // t48a + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t39 + sqadd v26.4s, v18.4s, v19.4s // t63 + sqsub v27.4s, v18.4s, v19.4s // t56 + sqsub v28.4s, v21.4s, v20.4s // t40 + sqadd v29.4s, v21.4s, v20.4s // t47 + sqadd v30.4s, v23.4s, v22.4s // t48 + sqsub v31.4s, v23.4s, v22.4s // t55 + + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a + mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a + srshr v25.4s, v2.4s, #12 // t56a + srshr v27.4s, v4.4s, #12 // t39a + neg v6.4s, v6.4s // t40a + mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a + srshr v31.4s, v6.4s, #12 // t40a + srshr v28.4s, v2.4s, #12 // t55a + + sqadd v16.4s, v24.4s, v29.4s // t32a + sqsub v19.4s, v24.4s, v29.4s // t47a + sqadd v17.4s, v27.4s, v31.4s // t39 + sqsub v18.4s, v27.4s, v31.4s // t40 + sqsub v20.4s, v26.4s, v30.4s // t48a + sqadd v23.4s, v26.4s, v30.4s // t63a + sqsub v21.4s, v25.4s, v28.4s // t55 + sqadd v22.4s, v25.4s, v28.4s // t56 + + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a + mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 + srshr v18.4s, v2.4s, #12 // t40a + srshr v21.4s, v4.4s, #12 // t55a + mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 + srshr v19.4s, v6.4s, #12 // t47 + srshr v20.4s, v2.4s, #12 // t48 + + str q16, [x6, #4*4*0] // t32a + str q17, [x9, #4*4*0] // t39 + str q18, [x6, #4*4*8] // t40a + str q19, [x9, #4*4*8] // t47 + str q20, [x6, #4*4*16] // t48 + str q21, [x9, #4*4*16] // t55a + str q22, [x6, #4*4*24] // t56 + str q23, [x9, #4*4*24] // t63a + + add x6, x6, #4*4 + sub x9, x9, #4*4 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movz16dup_if reg, gpr, val, cond +.if \cond + movz \gpr, \val, lsl #16 + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4s_x64_neon + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_4s_x16_neon + + store16 x6 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, 
v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_4s_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.4s}, [x6], #16 + ld1 {v3.4s}, [x6], #16 + sqadd v6.4s, v2.4s, \r0 + sqsub \r0, v2.4s, \r0 + ld1 {v4.4s}, [x6], #16 + sqadd v7.4s, v3.4s, \r1 + sqsub \r1, v3.4s, \r1 + ld1 {v5.4s}, [x6], #16 + sqadd v2.4s, v4.4s, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.4s, \r2 + st1 {v6.4s}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.4s, v5.4s, \r3 + sqsub \r3, v5.4s, \r3 + st1 {v7.4s}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.4s}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.4s}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.4s, v30.4s, v29.4s, v28.4s + store_addsub v27.4s, v26.4s, v25.4s, v24.4s + store_addsub v23.4s, v22.4s, v21.4s, v20.4s + store_addsub v19.4s, v18.4s, v17.4s, v16.4s +.purgem store_addsub + + add x6, x6, #4*4*16 + + movrel x17, idct64_coeffs + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.4s}, [x7] // in1 (offset 0) + ld1 {v17.4s}, [x9] // in31 (offset 15) + ld1 {v18.4s}, [x10] // in17 (offset 8) + ld1 {v19.4s}, [x11] // in15 (offset 7) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.4s}, [x10] // in7 (offset 3) + ld1 {v17.4s}, [x11] // in25 (offset 12) + ld1 {v18.4s}, [x9] // in23 (offset 11) + ld1 {v19.4s}, [x7] // in9 (offset 4) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #4*4*32 + add x9, x6, #4*4*7 + + bl inv_dct64_step2_neon + + br x14 +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x4_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #4*4*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-4*4*4 + + dup v7.4s, w12 +1: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], 
#64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.4s, \src0, \src1 + sqadd v0.4s, \src0, \src1 + sqsub v3.4s, \src2, \src3 + srshl v1.4s, v1.4s, v7.4s + sqadd v2.4s, \src2, \src3 + srshl v3.4s, v3.4s, v7.4s + srshl v0.4s, v0.4s, v7.4s + srshl v2.4s, v2.4s, v7.4s + sqxtn v3.4h, v3.4s + sqxtn2 v3.8h, v1.4s + sqxtn v0.4h, v0.4s + sqxtn2 v0.8h, v2.4s + rev64 v3.8h, v3.8h + st1 {v0.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.4s, v31.4s, v20.4s, v27.4s + store_addsub v17.4s, v30.4s, v21.4s, v26.4s + store_addsub v18.4s, v29.4s, v22.4s, v25.4s + store_addsub v19.4s, v28.4s, v23.4s, v24.4s +.purgem store_addsub + sub x6, x6, x10, lsl #2 + sub x9, x9, x10, lsl #2 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + br x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8h}, [x6], x1 + ld1 {v1.8h}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8h}, [x6] + sqsub \src0, \src0, \src1 + ld1 {v3.8h}, [x9] + sqadd v5.8h, \src2, \src3 + sqsub \src2, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr \src0, \src0, #4 + sqadd v0.8h, v0.8h, v4.8h + srshr \src2, \src2, #4 + sqadd v1.8h, v1.8h, \src0 + sqadd v2.8h, v2.8h, v5.8h + smax v0.8h, v0.8h, v6.8h + sqadd v3.8h, v3.8h, \src2 + smax v1.8h, v1.8h, v6.8h + smin v0.8h, v0.8h, v7.8h + smax v2.8h, v2.8h, v6.8h + smin v1.8h, v1.8h, v7.8h + st1 {v0.8h}, [x6], x1 + smax v3.8h, v3.8h, v6.8h + smin v2.8h, v2.8h, v7.8h + st1 {v1.8h}, [x9], x10 + smin v3.8h, v3.8h, v7.8h + st1 {v2.8h}, [x6], x1 + st1 {v3.8h}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + br x14 +endfunc + +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
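                // (Editorial note, not from the upstream patch: Windows
                // commits stack memory one guard page at a time, so sp must
                // not be moved past a page that has not been touched yet.
                // That is why the 4K-8K case below probes the intermediate
                // page with "ldr xzr, [x16]" before the final sub; larger
                // allocations would need additional probes, hence the error.)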
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*4*4 + add x4, sp, #64*4*4 + + movrel x13, eob_16x32 + +.irp i, 0, 4, 8, 12 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 
{v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + movrel x5, X(inv_dct_8h_x16_neon) +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_4s_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + br x15 +endfunc diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S index 31cc46f8971..3a6cf900a97 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/msac.S @@ -118,9 +118,9 @@ endconst .endm .macro str_n idx0, idx1, dstreg, dstoff, n - str q\idx0, [\dstreg, \dstoff] + str \idx0, [\dstreg, \dstoff] .if \n == 16 - str q\idx1, [\dstreg, \dstoff + 16] + str \idx1, [\dstreg, \dstoff + 16] .endif .endm @@ -150,7 +150,7 @@ function msac_decode_symbol_adapt4_neon, export=1 ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) movrel x8, bits - str_n 4, 5, sp, #16, \n // store v values to allow indexed access + str_n q4, q5, sp, #16, \n // store v values to allow indexed access ld1_n v16, v17, x8, .8h, \n @@ -185,7 +185,7 @@ function msac_decode_symbol_adapt4_neon, export=1 sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) .endif sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) - dup v6.8h, w4 // -rate + dup v6\sz, w4 // -rate sub w3, w3, w3, lsr #5 // count - (count == 32) sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 
1 : 0) @@ -216,7 +216,7 @@ L(renorm2): lsl x7, x7, x5 // (~dif + (v << 48)) << d str w4, [x0, #RNG] mvn x7, x7 // ~dif - b.ge 9f + b.hs 9f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END @@ -274,6 +274,128 @@ function msac_decode_symbol_adapt16_neon, export=1 b L(renorm) endfunc +function msac_decode_hi_tok_neon, export=1 + ld1 {v0.4h}, [x1] // cdf + add x16, x0, #RNG + movi v31.4h, #0x7f, lsl #8 // 0x7f00 + movrel x17, coeffs, 30-2*3 + mvni v30.4h, #0x3f // 0xffc0 + ldrh w9, [x1, #6] // count = cdf[n_symbols] + ld1r {v3.4h}, [x16] // rng + movrel x16, bits + ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) + add x17, x0, #DIF + 6 + ld1 {v16.8h}, [x16] + mov w13, #-24 + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + ldr w10, [x0, #ALLOW_UPDATE_CDF] + ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr w6, [x0, #CNT] + ldr x7, [x0, #DIF] +1: + and v7.8b, v3.8b, v31.8b // rng & 0x7f00 + sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) + add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + str h3, [sp, #14] // store original u = s->rng + cmhs v2.8h, v1.8h, v4.8h // c >= v + str q4, [sp, #16] // store v values to allow indexed access + and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask + addv h6, v6.8h // Aggregate mask bits + umov w3, v6.h[0] + add w13, w13, #5 + rbit w3, w3 + add x8, sp, #16 + clz w15, w3 // ret + + cbz w10, 2f + // update_cdf + movi v5.8b, #0xff + mov w4, #-5 + urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 + sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) + sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + dup v6.4h, w4 // -rate + + sub w9, w9, w9, lsr #5 // count - (count == 32) + sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 
1 : 0) + sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate + add w9, w9, #1 // count + (count < 32) + add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + st1 {v0.4h}, [x1] + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + strh w9, [x1, #6] + +2: + add x8, x8, w15, uxtw #1 + ldrh w3, [x8] // v + ldurh w4, [x8, #-2] // u + sub w4, w4, w3 // rng = u - v + clz w5, w4 // clz(rng) + eor w5, w5, #16 // d = clz(rng) ^ 16 + mvn x7, x7 // ~dif + add x7, x7, x3, lsl #48 // ~dif + (v << 48) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (~dif + (v << 48)) << d + str w4, [x0, #RNG] + dup v3.4h, w4 + mvn x7, x7 // ~dif + b.hs 9f + + // refill + ldp x3, x4, [x0] // BUF_POS, BUF_END + add x5, x3, #8 + cmp x5, x4 + b.gt 2f + + ldr x3, [x3] // next_bits + add w8, w6, #23 // shift_bits = cnt + 23 + add w6, w6, #16 // cnt += 16 + rev x3, x3 // next_bits = bswap(next_bits) + sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 + and w8, w8, #24 // shift_bits &= 24 + lsr x3, x3, x8 // next_bits >>= shift_bits + sub w8, w8, w6 // shift_bits -= 16 + cnt + str x5, [x0, #BUF_POS] + lsl x3, x3, x8 // next_bits <<= shift_bits + mov w4, #48 + sub w6, w4, w8 // cnt = cnt + 64 - shift_bits + eor x7, x7, x3 // dif ^= next_bits + b 9f + +2: // refill_eob + mov w14, #40 + sub w5, w14, w6 // c = 40 - cnt +3: + cmp x3, x4 + b.ge 4f + ldrb w8, [x3], #1 + lsl x8, x8, x5 + eor x7, x7, x8 + subs w5, w5, #8 + b.ge 3b + +4: // refill_eob_end + str x3, [x0, #BUF_POS] + sub w6, w14, w5 // cnt = 40 - c + +9: + lsl w15, w15, #1 + sub w15, w15, #5 + lsr x12, x7, #48 + adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 + dup v1.8h, w12 + b.cc 1b // loop if !carry + add w13, w13, #30 + str w6, [x0, #CNT] + add sp, sp, #48 + str x7, [x0, #DIF] + lsr w0, w13, #1 + ret +endfunc + function msac_decode_bool_equi_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT sub sp, sp, #48 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S index 3332c85223d..fc0e0d04f1c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/util.S @@ -170,6 +170,18 @@ trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm +.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4s, \r0\().4s, \r1\().4s + trn2 \t5\().4s, \r0\().4s, \r1\().4s + trn1 \t6\().4s, \r2\().4s, \r3\().4s + trn2 \t7\().4s, \r2\().4s, \r3\().4s + + trn1 \r0\().2d, \t4\().2d, \t6\().2d + trn2 \r2\().2d, \t4\().2d, \t6\().2d + trn1 \r1\().2d, \t5\().2d, \t7\().2d + trn2 \r3\().2d, \t5\().2d, \t7\().2d +.endm + .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S index 6b1d46fcd81..1cd0955d4e9 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S @@ -93,6 +93,7 @@ .global EXTERN\name #ifdef __ELF__ .type EXTERN\name, %function + .hidden EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name @@ -109,7 +110,7 @@ EXTERN\name: \name: .endm -.macro const name, align=2 +.macro const name, export=0, align=2 .macro endconst #ifdef __ELF__ .size \name, . 
- \name @@ -124,6 +125,13 @@ EXTERN\name: .const_data #endif .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .hidden EXTERN\name +#endif +EXTERN\name: + .endif \name: .endm @@ -135,4 +143,9 @@ EXTERN\name: #define X(x) CONCAT(EXTERN, x) +#if ARCH_AARCH64 +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 +#endif + #endif /* DAV1D_SRC_ARM_ASM_S */ diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c index f9c68e9eb75..ad418f2db59 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx_init_tmpl.c @@ -29,32 +29,32 @@ #include "src/itx.h" #define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx12_fns(w, h, opt) \ decl_itx2_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) #define decl_itx16_fns(w, h, opt) \ decl_itx12_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) #define decl_itx17_fns(w, h, opt) \ decl_itx16_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) decl_itx17_fns( 4, 4, neon); decl_itx16_fns( 4, 8, neon); @@ -71,16 +71,16 @@ decl_itx2_fns (32, 8, neon); decl_itx2_fns (32, 16, neon); decl_itx2_fns (32, 32, neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_neon); 
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_neon); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); -COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - dav1d_inv_txfm_add_##type##_##w##x##h##_##ext + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) #define assign_itx1_fn(pfx, w, h, ext) \ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) @@ -117,7 +117,9 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 && ARCH_AARCH64 + if (bpc > 10) return; + +#if ARCH_AARCH64 || BITDEPTH == 8 assign_itx17_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 16, neon); diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/msac.h b/chromium/third_party/dav1d/libdav1d/src/arm/msac.h index a243a06295d..9db0bf86aea 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/msac.h +++ b/chromium/third_party/dav1d/libdav1d/src/arm/msac.h @@ -34,14 +34,16 @@ unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, size_t n_symbols); +unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); -#if ARCH_AARCH64 +#if ARCH_AARCH64 || defined(__ARM_NEON) #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon #define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon #define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon #define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon diff --git a/chromium/third_party/dav1d/libdav1d/src/decode.c b/chromium/third_party/dav1d/libdav1d/src/decode.c index a5646c648e6..f6782153c14 100644 --- a/chromium/third_party/dav1d/libdav1d/src/decode.c +++ b/chromium/third_party/dav1d/libdav1d/src/decode.c @@ -3302,7 +3302,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { #define assign_bitdepth_case(bd) \ dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \ dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \ - dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \ + dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \ dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \ dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \ dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \ diff --git a/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm b/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm index a6a8fb7c6b8..c252e5451be 100644 --- a/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm +++ b/chromium/third_party/dav1d/libdav1d/src/ext/x86/x86inc.asm @@ -358,7 +358,7 @@ DECLARE_REG_TMP_SIZE 
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) +%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 @@ -403,7 +403,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro -%macro SETUP_STACK_POINTER 1 +%macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.c b/chromium/third_party/dav1d/libdav1d/src/getbits.c index c185053bd98..7bb20140e41 100644 --- a/chromium/third_party/dav1d/libdav1d/src/getbits.c +++ b/chromium/third_party/dav1d/libdav1d/src/getbits.c @@ -27,6 +27,8 @@ #include "config.h" +#include <limits.h> + #include "common/intops.h" #include "src/getbits.h" @@ -34,6 +36,8 @@ void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data, const size_t sz) { + // If sz were 0, c->eof would need to be initialized to 1. + assert(sz); c->ptr = c->ptr_start = data; c->ptr_end = &c->ptr_start[sz]; c->bits_left = 0; @@ -77,25 +81,23 @@ int dav1d_get_sbits(GetBits *const c, const unsigned n) { return res >> shift; } -unsigned dav1d_get_uleb128(GetBits *c) { - unsigned val = 0, more, i = 0; +unsigned dav1d_get_uleb128(GetBits *const c) { + uint64_t val = 0; + unsigned i = 0, more; do { - more = dav1d_get_bits(c, 1); - unsigned bits = dav1d_get_bits(c, 7); - if (i <= 3 || (i == 4 && bits < (1 << 4))) - val |= bits << (i * 7); - else if (bits) { - c->error = 1; - return 0; - } - if (more && ++i == 8) { - c->error = 1; - return 0; - } - } while (more); + const int v = dav1d_get_bits(c, 8); + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << i; + i += 7; + } while (more && i < 56); + + if (val > UINT_MAX || more) { + c->error = 1; + return 0; + } - return val; + return (unsigned) val; } unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { diff --git a/chromium/third_party/dav1d/libdav1d/src/itx.h b/chromium/third_party/dav1d/libdav1d/src/itx.h index 3befc420994..a299629c5cd 100644 --- a/chromium/third_party/dav1d/libdav1d/src/itx.h +++ b/chromium/third_party/dav1d/libdav1d/src/itx.h @@ -43,8 +43,8 @@ typedef struct Dav1dInvTxfmDSPContext { itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL]; } Dav1dInvTxfmDSPContext; -bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c); -bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c); +bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); +bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c); #endif /* DAV1D_SRC_ITX_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c index 02f34e85c92..a0e807f9550 100644 --- a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c @@ -180,7 +180,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, dst[x] = iclip_pixel(dst[x] + *c++); } -COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_all_fn64(w, h, pfx) \ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ inv_txfm_add_dct_dct_##w##x##h##_c @@ -224,8 
+224,6 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \ inv_txfm_add_identity_adst_##w##x##h##_c; \ - memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */ - c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c; assign_itx_all_fn84( 4, 4, ); assign_itx_all_fn84( 4, 8, R); @@ -249,7 +247,7 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_itx_dsp_init_arm)(c); + bitfn(dav1d_itx_dsp_init_arm)(c, bpc); #endif #if ARCH_X86 bitfn(dav1d_itx_dsp_init_x86)(c); diff --git a/chromium/third_party/dav1d/libdav1d/src/log.c b/chromium/third_party/dav1d/libdav1d/src/log.c index 999e3a2e8a0..de6776a617e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/log.c +++ b/chromium/third_party/dav1d/libdav1d/src/log.c @@ -36,13 +36,13 @@ #include "src/internal.h" #include "src/log.h" +#if CONFIG_LOG COLD void dav1d_log_default_callback(void *const cookie, const char *const format, va_list ap) { vfprintf(stderr, format, ap); } -#if CONFIG_LOG COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) { validate_input(c != NULL); diff --git a/chromium/third_party/dav1d/libdav1d/src/log.h b/chromium/third_party/dav1d/libdav1d/src/log.h index 8f6357cb660..df32de7f253 100644 --- a/chromium/third_party/dav1d/libdav1d/src/log.h +++ b/chromium/third_party/dav1d/libdav1d/src/log.h @@ -35,12 +35,12 @@ #include "common/attributes.h" -void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); - #if CONFIG_LOG #define dav1d_log dav1d_log +void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3); #else +#define dav1d_log_default_callback NULL #define dav1d_log(...) do { } while(0) #endif diff --git a/chromium/third_party/dav1d/libdav1d/src/meson.build b/chromium/third_party/dav1d/libdav1d/src/meson.build index 1a7114a870e..fd8ad0269c1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/meson.build +++ b/chromium/third_party/dav1d/libdav1d/src/meson.build @@ -102,6 +102,8 @@ if is_asm_enabled ) if host_machine.cpu_family() == 'aarch64' libdav1d_sources += files( + # itx.S is used for both 8 and 16 bpc. 
+ 'arm/64/itx.S', 'arm/64/looprestoration_common.S', 'arm/64/msac.S', ) @@ -110,7 +112,6 @@ if is_asm_enabled libdav1d_sources += files( 'arm/64/cdef.S', 'arm/64/ipred.S', - 'arm/64/itx.S', 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', @@ -121,6 +122,7 @@ if is_asm_enabled libdav1d_sources += files( 'arm/64/cdef16.S', 'arm/64/ipred16.S', + 'arm/64/itx16.S', 'arm/64/loopfilter16.S', 'arm/64/looprestoration16.S', 'arm/64/mc16.S', @@ -128,12 +130,14 @@ if is_asm_enabled endif elif host_machine.cpu_family().startswith('arm') libdav1d_sources += files( + 'arm/32/msac.S', ) if dav1d_bitdepths.contains('8') libdav1d_sources += files( 'arm/32/cdef.S', 'arm/32/ipred.S', + 'arm/32/itx.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', 'arm/32/mc.S', @@ -149,14 +153,9 @@ if is_asm_enabled libdav1d_sources += files( 'x86/cpu.c', + 'x86/msac_init.c', ) - if host_machine.cpu_family() == 'x86_64' - libdav1d_sources += files( - 'x86/msac_init.c', - ) - endif - libdav1d_tmpl_sources += files( 'x86/cdef_init_tmpl.c', 'x86/film_grain_init_tmpl.c', @@ -189,7 +188,7 @@ if is_asm_enabled 'x86/itx_ssse3.asm', 'x86/loopfilter_ssse3.asm', 'x86/looprestoration_ssse3.asm', - 'x86/mc_ssse3.asm', + 'x86/mc_sse.asm', ) endif diff --git a/chromium/third_party/dav1d/libdav1d/src/msac.c b/chromium/third_party/dav1d/libdav1d/src/msac.c index afd42543081..8195977d578 100644 --- a/chromium/third_party/dav1d/libdav1d/src/msac.c +++ b/chromium/third_party/dav1d/libdav1d/src/msac.c @@ -198,12 +198,11 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, s->rng = 0x8000; s->cnt = -15; s->allow_update_cdf = !disable_cdf_update_flag; + ctx_refill(s); #if ARCH_X86_64 && HAVE_ASM s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; dav1d_msac_init_x86(s); #endif - - ctx_refill(s); } diff --git a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c index 457d9712497..8e96f8e16ad 100644 --- a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c @@ -1071,15 +1071,15 @@ static int warp_affine(Dav1dTileContext *const t, const int height = (refp->p.p.h + ss_ver) >> ss_ver; for (int y = 0; y < b_dim[1] * v_mul; y += 8) { + const int src_y = t->by * 4 + ((y + 4) << ss_ver); + const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; + const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; for (int x = 0; x < b_dim[0] * h_mul; x += 8) { // calculate transformation relative to center of 8x8 block in // luma pixel units const int src_x = t->bx * 4 + ((x + 4) << ss_hor); - const int src_y = t->by * 4 + ((y + 4) << ss_ver); - const int64_t mvx = ((int64_t) mat[2] * src_x + - (int64_t) mat[3] * src_y + mat[0]) >> ss_hor; - const int64_t mvy = ((int64_t) mat[4] * src_x + - (int64_t) mat[5] * src_y + mat[1]) >> ss_ver; + const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; + const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int dx = (int) (mvx >> 16) - 4; const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 - diff --git a/chromium/third_party/dav1d/libdav1d/src/refmvs.c b/chromium/third_party/dav1d/libdav1d/src/refmvs.c index 2039bed4fe4..1e113b4eacf 100644 --- a/chromium/third_party/dav1d/libdav1d/src/refmvs.c +++ b/chromium/third_party/dav1d/libdav1d/src/refmvs.c @@ -182,10 +182,13 @@ static inline union mv mv_projection(const union mv mv, const int num, const int }; assert(den > 0 && den < 32); assert(num > -32 && num < 32); - const int dm = 
div_mult[den]; - const int y = mv.y * num * dm, x = mv.x * num * dm; - return (union mv) { .y = (y + 8192 + (y >> 31)) >> 14, - .x = (x + 8192 + (x >> 31)) >> 14 }; + const int frac = num * div_mult[den]; + const int y = mv.y * frac, x = mv.x * frac; + // Round and clip according to AV1 spec section 7.9.3 + return (union mv) { // 0x3fff == (1 << 14) - 1 + .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff), + .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff) + }; } static void add_temporal_candidate(const refmvs_frame *const rf, diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.c b/chromium/third_party/dav1d/libdav1d/src/tables.c index c0466193fa8..30d9fa6ae1a 100644 --- a/chromium/third_party/dav1d/libdav1d/src/tables.c +++ b/chromium/third_party/dav1d/libdav1d/src/tables.c @@ -442,7 +442,7 @@ const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = { 0 }; -const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { +const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = { [DAV1D_FILTER_8TAP_REGULAR] = { { 0, 1, -3, 63, 4, -1, 0, 0 }, { 0, 1, -5, 61, 9, -2, 0, 0 }, @@ -524,6 +524,27 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { { 0, 0, 2, 20, 31, 11, 0, 0 }, { 0, 0, 2, 18, 31, 13, 0, 0 }, { 0, 0, 1, 17, 31, 15, 0, 0 } +#if ARCH_X86_64 + /* Bilin scaled being very rarely used, add a new table entry + * and use the put/prep_8tap_scaled code, thus acting as a + * scaled bilinear filter. */ + }, [5] = { + { 0, 0, 0, 60, 4, 0, 0, 0 }, + { 0, 0, 0, 56, 8, 0, 0, 0 }, + { 0, 0, 0, 52, 12, 0, 0, 0 }, + { 0, 0, 0, 48, 16, 0, 0, 0 }, + { 0, 0, 0, 44, 20, 0, 0, 0 }, + { 0, 0, 0, 40, 24, 0, 0, 0 }, + { 0, 0, 0, 36, 28, 0, 0, 0 }, + { 0, 0, 0, 32, 32, 0, 0, 0 }, + { 0, 0, 0, 28, 36, 0, 0, 0 }, + { 0, 0, 0, 24, 40, 0, 0, 0 }, + { 0, 0, 0, 20, 44, 0, 0, 0 }, + { 0, 0, 0, 16, 48, 0, 0, 0 }, + { 0, 0, 0, 12, 52, 0, 0, 0 }, + { 0, 0, 0, 8, 56, 0, 0, 0 }, + { 0, 0, 0, 4, 60, 0, 0, 0 } +#endif } }; diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.h b/chromium/third_party/dav1d/libdav1d/src/tables.h index 8d2d8456cd9..abcf26592f0 100644 --- a/chromium/third_party/dav1d/libdav1d/src/tables.h +++ b/chromium/third_party/dav1d/libdav1d/src/tables.h @@ -110,7 +110,7 @@ extern const int8_t dav1d_cdef_directions[12][2]; extern const int16_t dav1d_sgr_params[16][4]; extern const uint8_t dav1d_sgr_x_by_x[256]; -extern const int8_t dav1d_mc_subpel_filters[5][15][8]; +extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8]; extern const int8_t dav1d_mc_warp_filter[193][8]; extern const int8_t dav1d_resize_filter[64][8]; diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm index 20ac75fff0a..643caa0cf99 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef_avx2.asm @@ -459,14 +459,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, secdmpd jz .pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd @@ -1468,14 +1468,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .border_sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, 
dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, secdmpd jz .border_pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm index e73c09ed829..ad05b3b1fdc 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.asm @@ -1412,7 +1412,6 @@ ALIGN function_align mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases pcmpgtb m1, m2 pmovmskb r5d, m1 - popcnt r5d, r5d ; sets ZF which can be used by caller ret .w4_no_upsample: %assign stack_offset org_stack_offset @@ -1423,7 +1422,9 @@ ALIGN function_align lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 + test r5d, r5d jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m7, [base+pb_8] vbroadcasti128 m2, [tlq-1] pminub m1, m7, [base+z_filter_s] @@ -1596,7 +1597,9 @@ ALIGN function_align test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .w8_main ; filter_strength == 0 + popcnt r5d, r5d movu xm2, [tlq] pminub xm1, xm0, [base+z_filter_s+14] vinserti128 m2, [tlq-1], 1 @@ -1698,7 +1701,9 @@ ALIGN function_align test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .w16_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m1, [base+pb_12] vbroadcasti128 m6, [base+z_filter_s+8] vinserti128 m2, m6, [base+z_filter_s], 0 @@ -2205,7 +2210,6 @@ ALIGN function_align pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d ret ALIGN function_align .upsample_above: ; w4/w8 @@ -2255,7 +2259,9 @@ ALIGN function_align lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength + test r3d, r3d jz .w4_no_filter_above + popcnt r3d, r3d vpbroadcastd xm2, [base+pb_4] pminub xm2, [base+z_filter_s] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] @@ -2290,9 +2296,10 @@ ALIGN function_align pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 - popcnt r3d, r3d .w4_filter_left: + test r3d, r3d jz .w4_main + popcnt r3d, r3d mov r5d, 10 cmp hd, 16 movu xm2, [rsp+49] @@ -2443,7 +2450,9 @@ ALIGN function_align lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength + test r3d, r3d jz .w8_no_filter_above + popcnt r3d, r3d vpbroadcastd xm3, [base+pb_8] pminub xm3, [base+z_filter_s+8] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] @@ -2476,9 +2485,10 @@ ALIGN function_align pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d .w8_filter_left: + test r3d, r3d jz .w8_main + popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] @@ -2650,7 +2660,9 @@ ALIGN function_align lea r3d, [hq+15] sub angled, 90 call .filter_strength + test r3d, r3d jz .w16_no_filter_above + popcnt r3d, r3d vbroadcasti128 m6, [tlq+1] mova xm2, [base+z_filter_s] vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de @@ -2683,8 +2695,9 @@ ALIGN function_align pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d + test r3d, r3d jz .w16_main + popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] @@ -3086,7 +3099,6 @@ ALIGN function_align mova xm2, [r4+angleq*8] pcmpgtb m1, m2 pmovmskb r5d, m1 - popcnt r5d, r5d ret .h4_no_upsample: %assign stack_offset 
org_stack_offset @@ -3097,7 +3109,9 @@ ALIGN function_align lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 + test r5d, r5d jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m7, [base+pb_7] vbroadcasti128 m2, [tlq-14] pmaxub m1, m7, [base+z_filter_s-4] @@ -3288,7 +3302,9 @@ ALIGN function_align test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .h8_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd xm6, [base+pb_15] pcmpeqb xm1, xm1 psubusb xm6, xm0 @@ -3444,7 +3460,9 @@ ALIGN function_align test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m11, [base+pb_27] vpbroadcastd m1, [base+pb_1] vbroadcasti128 m6, [base+z_filter_s+12] diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm index c78c1531dd2..f27b90032f3 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.asm @@ -884,7 +884,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd m5, [o(pw_2896_2896)] ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 vpbroadcastd m0, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4, ; t13a t10a + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a punpckhqdq m0, m8, m3 ; t15a t14 punpcklqdq m8, m3 ; t8a t9 shufps m5, m4, m2, q1032 ; t12 t13a @@ -1170,7 +1170,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 paddsw m3, m8 jmp m(iadst_4x16_internal).end2 -%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3] +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] movhps xm%3, [dstq+%5] movq xm%4, [dstq+%6] diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm index 801c3d721fe..3e3c35c34af 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.asm @@ -51,9 +51,12 @@ cextern sgr_x_by_x SECTION .text INIT_YMM avx2 -cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge +cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge + mov edged, edgem vpbroadcastb m15, [fhq+0] + movifnidn wd, wm vpbroadcastb m14, [fhq+2] + mov hd, hm vpbroadcastb m13, [fhq+4] vpbroadcastw m12, [fhq+6] vpbroadcastd m11, [pw_2048] @@ -64,7 +67,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge ; if (edge & has_right) align_w_to_32 ; else w -= 32, and use that as limit in x loop - test edged, 2 ; has_right + test edgeb, 2 ; has_right jnz .align mov xlimq, -3 jmp .loop @@ -80,7 +83,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge lea xq, [wq+xlimq] ; load left edge pixels - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .emu_left test leftq, leftq ; left == NULL for the edge-extended bottom/top jz .load_left_combined @@ -203,17 +206,19 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge jg .loop RET -cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge - vpbroadcastd m14, [fvq+4] - vpbroadcastd m15, [fvq] - vpbroadcastd m13, [pw_0_128] - paddw m14, m13 +cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge + movifnidn fvq, fvmp + mov edged, edgem + movifnidn hd, hm + vpbroadcastd 
m10, [fvq] + vpbroadcastd m11, [fvq+4] + vpbroadcastd m0, [pw_0_128] vpbroadcastd m12, [pd_1024] DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + paddw m11, m0 + and ylimd, 2 ; have_bottom sub ylimd, 3 ; main x loop for vertical filter, does one column of 16 pixels @@ -221,7 +226,7 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge mova m3, [midq] ; middle line ; load top pixels - test edged, 4 ; have_top + test edgeb, 4 ; have_top jz .emu_top mova m0, [midq-384*4] mova m2, [midq-384*2] @@ -276,27 +281,28 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge ; try to structure the loop so that the common case is evaluated fastest mova m6, [mptrq+384*6] .loop: - paddw m7, m0, m6 - paddw m8, m1, m5 - paddw m9, m2, m4 - punpcklwd m10, m7, m8 - punpckhwd m7, m8 - punpcklwd m11, m9, m3 - punpckhwd m9, m3 - pmaddwd m10, m15 - pmaddwd m7, m15 - pmaddwd m11, m14 - pmaddwd m9, m14 - paddd m10, m11 + paddw m0, m6 + paddw m7, m1, m5 + paddw m8, m2, m4 + punpcklwd m9, m0, m7 + punpckhwd m0, m7 + punpcklwd m7, m8, m3 + punpckhwd m8, m3 + pmaddwd m9, m10 + pmaddwd m0, m10 + pmaddwd m7, m11 + pmaddwd m8, m11 + add mptrq, 384*2 paddd m7, m9 - paddd m10, m12 + paddd m0, m8 paddd m7, m12 - psrad m10, 11 + paddd m0, m12 psrad m7, 11 - packssdw m10, m7 - packuswb m10, m10 - vpermq m10, m10, q3120 - mova [dstptrq], xm10 + psrad m0, 11 + packssdw m7, m0 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + mova [dstptrq], xm7 ; shift pixels one position mova m0, m1 mova m1, m2 @@ -305,51 +311,51 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge mova m4, m5 mova m5, m6 add dstptrq, strideq - add mptrq, 384*2 dec yd jg .loop_load ; for the bottom pixels, continue using m6 (as extended edge) cmp yd, ylimd jg .loop - - add dstq, 16 add midq, 32 + add dstq, 16 sub wd, 16 jg .loop_x RET INIT_YMM avx2 -cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov xlimd, edged +cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov xlimd, edgem + movifnidn wd, wm + mov hd, hm + mov edged, xlimd and xlimd, 2 ; have_right - add wd, xlimd - xor xlimd, 2 ; 2*!have_right - jnz .no_right - add wd, 15 + jz .no_right + add wd, 2+15 and wd, ~15 .no_right: + lea r10, [pb_right_ext_mask+32] + xor xlimd, 2 ; 2*!have_right pxor m1, m1 - lea srcq, [srcq+wq] + add srcq, wq lea sumq, [sumq+wq*2-2] lea sumsqq, [sumsqq+wq*4-4] neg wq - lea r10, [pb_right_ext_mask+32] .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main - pinsrw xm0, [leftq+2], 7 + vpbroadcastw xm0, [leftq+2] add leftq, 4 jmp .expand_x .no_left: vpbroadcastb xm0, [srcq+xq] jmp .expand_x .load_left_from_main: - pinsrw xm0, [srcq+xq-2], 7 + vpbroadcastw xm0, [srcq+xq-2] .expand_x: punpckhbw xm0, xm1 @@ -359,8 +365,8 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim .partial_load_and_extend: vpbroadcastb m3, [srcq-1] pmovzxbw m2, [srcq+xq] - punpcklbw m3, m1 movu m4, [r10+xq*2] + punpcklbw m3, m1 pand m2, m4 pandn m4, m3 por m2, m4 @@ -380,22 +386,21 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim punpcklwd m5, m3, m2 punpckhwd m6, m3, m2 paddw m3, m4 - punpcklwd m7, m4, m1 + punpcklwd m0, m4, m1 punpckhwd m4, m1 pmaddwd m5, m5 pmaddwd m6, m6 - pmaddwd m7, m7 + pmaddwd m0, m0 pmaddwd m4, m4 - paddd m5, m7 - 
paddd m6, m4 paddw m3, m2 + paddd m5, m0 + vextracti128 xm0, m2, 1 + paddd m6, m4 movu [sumq+xq*2], m3 - movu [sumsqq+xq*4+ 0], xm5 - movu [sumsqq+xq*4+16], xm6 + movu [sumsqq+xq*4+ 0], xm5 + movu [sumsqq+xq*4+16], xm6 vextracti128 [sumsqq+xq*4+32], m5, 1 vextracti128 [sumsqq+xq*4+48], m6, 1 - - vextracti128 xm0, m2, 1 add xq, 16 ; if x <= -16 we can reload more pixels @@ -418,25 +423,25 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim RET INIT_YMM avx2 -cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim +cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem mov xq, -2 - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom sub ylimd, 2 ; -2 if have_bottom=0, else 0 .loop_x: lea yd, [hq+ylimq+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m6, [sum_ptrq+(384+16)*2*1] mova m2, m0 mova m3, m1 mova m4, m0 mova m5, m1 - movu m6, [sum_ptrq+(384+16)*2*1] mova m7, m6 mova m8, m6 jmp .loop_y_noload @@ -550,8 +555,10 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s RET INIT_YMM avx2 -cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \ +cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm vpbroadcastd m15, [pw_16] xor xd, xd .loop_x: @@ -654,75 +661,83 @@ cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \ RET INIT_YMM avx2 -cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt - movd xm0, wtd - vpbroadcastw m0, xm0 - psllw m0, 4 +cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt +%ifidn wtd, wtm + shl wtd, 4 + movd xm5, wtd + vpbroadcastw m5, xm5 +%else + vpbroadcastw m5, wtm + mov hd, hm + psllw m5, 4 +%endif DEFINE_ARGS dst, stride, t, w, h, idx .loop_y: xor idxd, idxd .loop_x: - mova m1, [tq+idxq*2+ 0] - mova m4, [tq+idxq*2+32] + mova m0, [tq+idxq*2+ 0] + mova m1, [tq+idxq*2+32] pmovzxbw m2, [dstq+idxq+ 0] - pmovzxbw m5, [dstq+idxq+16] - psllw m3, m2, 4 - psllw m6, m5, 4 - psubw m1, m3 - psubw m4, m6 - pmulhrsw m1, m0 - pmulhrsw m4, m0 - paddw m1, m2 - paddw m4, m5 - packuswb m1, m4 - vpermq m1, m1, q3120 - mova [dstq+idxq], m1 + pmovzxbw m3, [dstq+idxq+16] + psllw m4, m2, 4 + psubw m0, m4 + psllw m4, m3, 4 + psubw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+idxq], m0 add idxd, 32 cmp idxd, wd jl .loop_x + add tq, 384*2 add dstq, strideq - add tq, 384 * 2 dec hd jg .loop_y RET INIT_YMM avx2 -cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim - test edged, 2 ; have_right +cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + test edgeb, 2 ; have_right jz .no_right xor xlimd, xlimd - add wd, 2 - add wd, 15 + add wd, 2+15 and wd, ~15 jmp .right_done .no_right: mov xlimd, 3 sub wd, 1 .right_done: + lea r10, [pb_right_ext_mask+32] pxor m1, m1 lea srcq, [srcq+wq+1] lea sumq, [sumq+wq*2-2] lea sumsqq, [sumsqq+wq*4-4] neg wq - lea r10, [pb_right_ext_mask+32] .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main - movd xm0, [leftq] - 
pinsrd xm0, [srcq+xq-1], 1 - pslldq xm0, 11 + vpbroadcastd xm2, [leftq] + movd xm0, [srcq+xq-1] add leftq, 4 + palignr xm0, xm2, 1 jmp .expand_x .no_left: vpbroadcastb xm0, [srcq+xq-1] jmp .expand_x .load_left_from_main: - pinsrd xm0, [srcq+xq-4], 3 + vpbroadcastd xm0, [srcq+xq-4] .expand_x: punpckhbw xm0, xm1 @@ -734,8 +749,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli .partial_load_and_extend: vpbroadcastb m3, [srcq-1] pmovzxbw m2, [srcq+xq] - punpcklbw m3, m1 movu m4, [r10+xq*2] + punpcklbw m3, m1 pand m2, m4 pandn m4, m3 por m2, m4 @@ -775,8 +790,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli paddd m7, m9 paddd m3, m5 movu [sumq+xq*2], m0 - movu [sumsqq+xq*4+ 0], xm7 - movu [sumsqq+xq*4+16], xm3 + movu [sumsqq+xq*4+ 0], xm7 + movu [sumsqq+xq*4+16], xm3 vextracti128 [sumsqq+xq*4+32], m7, 1 vextracti128 [sumsqq+xq*4+48], m3, 1 @@ -795,35 +810,35 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli cmp xd, xlimd jl .right_extend + add srcq, strideq add sumsqq, (384+16)*4 add sumq, (384+16)*2 - add srcq, strideq dec hd jg .loop_y RET INIT_YMM avx2 -cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim +cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem mov xq, -2 - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom sub ylimd, 3 ; -3 if have_bottom=0, else -1 .loop_x: lea yd, [hq+ylimq+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m10, [sum_ptrq+(384+16)*2*1] mova m2, m0 mova m3, m1 mova m4, m0 mova m5, m1 mova m6, m0 mova m7, m1 - movu m10, [sum_ptrq+(384+16)*2*1] mova m11, m10 mova m12, m10 mova m13, m10 @@ -833,10 +848,10 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right] movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right] - mova m2, m0 - mova m3, m1 movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 movu m12, [sum_ptrq-(384+16)*2*0] ; l2 + mova m2, m0 + mova m3, m1 mova m11, m10 .loop_y: movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] @@ -967,8 +982,10 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s RET INIT_YMM avx2 -cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \ +cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm vpbroadcastd m9, [pw_5_6] vpbroadcastd m12, [pw_256] psrlw m11, m12, 1 ; pw_128 @@ -1084,8 +1101,10 @@ cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \ RET INIT_YMM avx2 -cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt - vpbroadcastd m0, [wtq] +cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movifnidn hd, hm + vpbroadcastd m0, wtm vpbroadcastd m10, [pd_1024] DEFINE_ARGS dst, stride, t1, t2, w, h, idx .loop_y: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c index a1b25a90c8c..b0201ce3db1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c +++ 
b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_init_tmpl.c @@ -169,7 +169,7 @@ void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ const coef *t1, const coef *t2, \ const int w, const int h, \ - const int16_t wt[2]); \ + const uint32_t wt); \ \ static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ const pixel (*const left)[4], \ @@ -194,7 +194,7 @@ static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ - const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \ + const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \ dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ } \ } diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm index 35a4ea53b8d..aaaea7835b5 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_ssse3.asm @@ -188,13 +188,13 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %define srcptrq srcq %define dstptrq dstq %define hd dword [esp+ 0] - %define edged dword [esp+12] + %define edgeb byte [esp+12] %define xlimd dword [esp+16] %endif ; if (edge & has_right) align_w_to_16 ; else w -= 3, and use that as limit in x loop - test edged, 2 ; has_right + test edgeb, 2 ; has_right jnz .align mov xlimd, -3 jmp .loop @@ -221,7 +221,7 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %endif ; load left edge pixels - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .emu_left test leftq, leftq ; left == NULL for the edge-extended bottom/top jz .load_left_combined @@ -477,7 +477,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge DEFINE_ARGS dst, stride, mid, w, h, y, edge %define mptrq midq %define dstptrq dstq - %define edged dword [esp] + %define edgeb byte [esp] %endif ; main x loop for vertical filter, does one column of 16 pixels @@ -485,7 +485,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mova m3, [midq] ; middle line ; load top pixels - test edged, 4 ; have_top + test edgeb, 4 ; have_top jz .emu_top mova m0, [midq-384*4] mova m2, [midq-384*2] @@ -604,8 +604,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mova m3, m4 mova m4, m5 mova m5, m6 - add dstptrq, strideq add mptrq, 384*2 + add dstptrq, strideq dec yd jg .loop_load ; for the bottom pixels, continue using m6 (as extended edge) @@ -616,8 +616,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mov midq, [esp+8] mov dstq, [esp+4] %endif - add dstq, 8 add midq, 16 + add dstq, 8 sub wd, 8 jg .loop_x RET @@ -679,7 +679,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim %define wq r0m %define xlimd r1m %define hd hmp - %define edged edgemp + %define edgeb byte edgem mov r6, edgem and r6, 2 ; have_right @@ -706,7 +706,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main @@ -795,11 +795,13 @@ cglobal 
sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim movifnidn edged, edgem %else -cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y +cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y %define sumsq_baseq dword [esp+0] %define sum_baseq dword [esp+4] %define ylimd dword [esp+8] %define m8 [esp+12] + mov edged, r4m + mov hd, r3m %endif mov xq, -2 %if ARCH_X86_64 @@ -812,7 +814,7 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y .loop_x: mov sumsqq, sumsq_baseq mov sumq, sum_baseq - lea yd, [hd+ylimd+2] + lea yd, [hq+ylimq+2] %else mov yd, edged and yd, 8 ; have_bottom @@ -824,12 +826,12 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y .loop_x: mov sumsqd, sumsq_baseq mov sumd, sum_baseq - lea yd, [hd+2] + lea yd, [hq+2] add yd, ylimd %endif lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] lea sumq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsqq+(384+16)*4*1] movu m1, [sumsqq+(384+16)*4*1+16] @@ -1180,10 +1182,10 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] %endif + add srcq, strideq add aq, (384+16)*4 add bq, (384+16)*2 add tq, 384*2 - add srcq, strideq dec yd jg .loop_y add xd, 8 @@ -1237,7 +1239,7 @@ cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xli mova m11, [pb_0_1] %else cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - %define edged edgemp + %define edgeb byte edgem %define wd xd %define wq wd %define wm r5m @@ -1249,7 +1251,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge %define m11 [PIC_sym(pb_0_1)] %endif - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .no_right xor xlimd, xlimd add wd, 2 @@ -1275,7 +1277,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main @@ -1401,9 +1403,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge cmp xd, xlimd jl .right_extend + add srcq, strideq add sumsqq, (384+16)*4 add sumq, (384+16)*2 - add srcq, strideq dec hd jg .loop_y %if ARCH_X86_32 @@ -1434,7 +1436,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [hd+ylimd+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+16] @@ -1520,7 +1522,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [ylimd+2] add yd, hm lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - test dword edgem, 4 ; have_top + test byte edgem, 4 ; have_top jnz .sumsq_load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+16] @@ -1582,7 +1584,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [ylimd+2] add yd, hm lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test dword edgem, 4 ; have_top + test byte edgem, 4 ; have_top jnz .sum_load_top movu m0, [sum_ptrq+(384+16)*2*1] mova m1, m0 @@ -1882,7 +1884,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt movifnidn wd, wm - mov wtq, wtmp + movd m0, 
wtm %if ARCH_X86_64 movifnidn hd, hm mova m10, [pd_1024] @@ -1892,7 +1894,6 @@ cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt %define m10 [PIC_sym(pd_1024)] %define m11 m7 %endif - movd m0, [wtq] pshufd m0, m0, 0 DEFINE_ARGS dst, stride, t1, t2, w, h, idx %if ARCH_X86_32 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm index 7ff0cac070a..5d769df8db4 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.asm @@ -133,16 +133,23 @@ subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 pb_8x0_8x8: times 8 db 0 times 8 db 8 +bdct_lb_dw: times 4 db 0 + times 4 db 4 + times 4 db 8 + times 4 db 12 ALIGN 32 -resize_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 resize_shuf: times 5 db 0 db 1, 2, 3, 4, 5, 6 times 5+8 db 7 @@ -154,8 +161,11 @@ wm_422_sign: dd 0x80808080, 0x7f7f7f7f wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040 ALIGN 4 +pb_0123: db 0, 1, 2, 3 +pb_4567: db 4, 5, 6, 7 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 +pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 @@ -163,11 +173,14 @@ pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 -pd_2: dd 2 -pd_32: dd 32 -pd_63: dd 63 -pd_512: dd 512 -pd_32768: dd 32768 +pd_2: dd 2 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 %define pb_m64 (wm_sign_avx512+4) %define pb_64 (wm_sign_avx512+8) @@ -230,27 +243,53 @@ cextern mc_warp_filter %endrep %endmacro +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - %2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - %2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - %2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro + %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX -BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 -BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE 
avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 %if HAVE_AVX512ICL BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 @@ -1943,19 +1982,22 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 + mov t1d, FILTER_%4 +%ifnidn %1, sharp_smooth ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif -%macro PUT_8TAP_FN 3 ; type, type_h, type_v -cglobal put_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) -%endif -%endmacro + +%define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN regular, REGULAR, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP @@ -3873,6 +3915,1853 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET %endmacro +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movq xm%1, [srcq+ r4] + movq xm%2, [srcq+ r6] + movhps xm%1, [srcq+ r7] + movhps xm%2, [srcq+ r9] + vinserti128 m%1, [srcq+r10], 1 + vinserti128 m%2, [srcq+r11], 1 + vpbroadcastq m%5, [srcq+r13] + vpbroadcastq m%6, [srcq+ rX] + add srcq, ssq + movq xm%3, [srcq+ r4] + movq xm%4, [srcq+ r6] + movhps xm%3, [srcq+ r7] + movhps xm%4, [srcq+ r9] + vinserti128 m%3, [srcq+r10], 1 + 
vinserti128 m%4, [srcq+r11], 1 + vpbroadcastq m%7, [srcq+r13] + vpbroadcastq m%8, [srcq+ rX] + add srcq, ssq + vpblendd m%1, m%5, 0xc0 + vpblendd m%2, m%6, 0xc0 + vpblendd m%3, m%7, 0xc0 + vpblendd m%4, m%8, 0xc0 + pmaddubsw m%1, m15 + pmaddubsw m%2, m10 + pmaddubsw m%3, m15 + pmaddubsw m%4, m10 + phaddw m%1, m%2 + phaddw m%3, m%4 + phaddw m%1, m%3 + pmulhrsw m%1, m12 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+104] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + lea base_reg, [%1_8tap_scaled_avx2] +%define base base_reg-%1_8tap_scaled_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm14, mxd + vpbroadcastd m14, xm14 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%else + vpbroadcastd m14, mxm +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+96] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+96] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + vpbroadcastd m10, [base+pd_0x3ff] + vpbroadcastd m12, [base+pw_8192] +%ifidn %1, put + vpbroadcastd m13, [base+pd_512] +%else + vpbroadcastd m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpblendd m15, m7, 0xaa + vpblendd m0, m2, 0xc0 ; 0 1 4 5 + vpblendd m1, m3, 0xc0 ; 2 3 6 7 + pblendvb m15, m11, m8 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 + 
vextracti128 xm1, m0, 1 ; 4 5 6 7 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + pmaddwd xm8, xm4, xm11 + paddd xm5, xm6 + paddd xm7, xm8 + paddd xm5, xm13 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq], xm5, 0 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +%endif +.w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd xm15, xm0 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m0, m9 + psrld m14, 10 + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m15, xm15, 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pblendvb m15, m11, m0 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + phaddw m7, m9 + phaddw m8, m10 + pmulhrsw m7, m12 ; 0 1 4 5 + pmulhrsw m8, m12 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm11, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm11 ; 67 + mova [rsp+0x00], xm7 + mova [rsp+0x10], xm8 + mova [rsp+0x20], xm9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm10, r6q + punpcklbw xm10, xm10 + psraw xm10, 8 + pshufd xm7, xm10, q0000 + pshufd 
xm8, xm10, q1111 + pshufd xm9, xm10, q2222 + pshufd xm10, xm10, q3333 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pmaddwd xm6, xm2, xm9 + pmaddwd xm7, xm3, xm10 + paddd xm4, xm5 + paddd xm6, xm7 + paddd xm4, xm13 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq], xm4 + add dstq, dsq +%else + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu xm4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x00] + mova [rsp+0x00], xm1 + mova xm1, [rsp+0x10] + mova [rsp+0x10], xm2 + mova xm2, [rsp+0x20] + mova [rsp+0x20], xm3 + pshufb xm4, xm14 + pmaddubsw xm4, xm15 + phaddw xm4, xm4 + pmulhrsw xm4, xm12 + punpcklwd xm3, xm11, xm4 + mova xm11, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm5, [srcq+ssq*1] + movu m6, [rsp+0x10] + pshufb xm4, xm14 + pshufb xm5, xm14 + pmaddubsw xm4, xm15 + pmaddubsw xm5, xm15 + movu [rsp+0x00], m6 + phaddw xm4, xm5 + pmulhrsw xm4, xm12 + punpcklwd xm9, xm11, xm4 + mova [rsp+0x20], xm9 + psrldq xm11, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm11 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +.w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + mova [rsp], xm14 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.w8_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + add myd, dyd + test myd, 
~0x3ff + jz .w8_loop + test myd, 0x400 + mov [rsp+16], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .w8_skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .w8_loop +.w8_skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .w8_loop +.w16: + mov dword [rsp+48], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+48], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+48], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+48], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+72], t0d + mov [rsp+56], srcq + mov [rsp+64], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+48] + jz .ret + add qword [rsp+64], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+16] + vpbroadcastd m15, [rsp+72] + pxor m9, m9 + mov srcq, [rsp+56] + mov r0q, [rsp+64] ; dstq / tmpq +.hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+16], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + mova [rsp], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + 
vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+52], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + 
movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + pshufd xm8, xm10, q0000 + pshufd xm9, xm10, q1111 + pshufd xm11, xm10, q3333 + pshufd xm10, xm10, q2222 + vpblendd m0, m2, 0xc0 + pshufb m1, m14 + pshufb m0, m14 + pmaddubsw m1, m15 + pmaddubsw m0, m15 + phaddw m0, m1 + pmulhrsw m0, m12 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, rndshift + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vinserti128 m10, xm10, 1 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, 
m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + pshufd xm4, xm4, q3120 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET +.dy1_w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp+32], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp+32] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_w8_loop +.dy1_w16: + mov dword [rsp+72], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+72], 8 + 
movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+72] + jz .ret + add qword [rsp+88], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov r0q, [rsp+88] ; dstq / tmpq +.dy1_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r4d, [rsp+64] + mov r7d, [rsp+68] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word 
[base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + phaddw m0, m2 + phaddw 
m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vpblendd m0, m1, m6, 0x0f + mova m1, m6 + vpermq m0, m0, q1032 ; 45 67 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET +.dy2_w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m14, m11, q2222 + pshufd m11, m11, q3333 +.dy2_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m14 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + 
vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_w8_loop +.dy2_w16: + mov dword [rsp+40], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+40] + jz .ret + add qword [rsp+56], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov r0q, [rsp+56] ; dstq / tmpq +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+32], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movq xm14, r4q + punpcklbw xm14, xm14 + psraw xm14, 8 + vinserti128 m14, xm14, 1 + mov r4d, [rsp+32] + mov r7d, [rsp+36] + pshufd m8, m14, q0000 + pshufd m9, m14, q1111 + pshufd m11, m14, q2222 + pshufd m14, m14, q3333 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m14 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + 
psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) +%endmacro +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +MC_8TAP_SCALED prep + %macro WARP_V 5 ; dst, 02, 46, 13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s lea tmp1d, [myq+deltaq*4] @@ -5010,7 +6899,7 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ vpbroadcastd m3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti128 m15, [base+pb_8x0_8x8] - pmaddwd m2, m5, [base+resize_mul] ; dx*[0,1,2,3,4,5,6,7] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c index a21877c6671..a01ac14ab4a 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_init_tmpl.c @@ -52,33 +52,65 @@ decl_mc_fn(dav1d_put_bilin_ssse3); decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_avx2); decl_mct_fn(dav1d_prep_8tap_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sse2); 
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2); decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl); decl_mct_fn(dav1d_prep_8tap_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2); decl_mct_fn(dav1d_prep_bilin_avx512icl); decl_mct_fn(dav1d_prep_bilin_avx2); decl_mct_fn(dav1d_prep_bilin_ssse3); +decl_mct_fn(dav1d_prep_bilin_sse2); + +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2); + +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2); decl_avg_fn(dav1d_avg_avx512icl); decl_avg_fn(dav1d_avg_avx2); @@ -123,12 +155,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { c->mc[type] = dav1d_put_##name##_##suffix #define init_mct_fn(type, name, suffix) \ c->mct[type] = dav1d_prep_##name##_##suffix +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = dav1d_put_##name##_##suffix +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = dav1d_prep_##name##_##suffix + const unsigned flags = dav1d_get_cpu_flags(); if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; #if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, 
sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + c->warp8x8 = dav1d_warp_affine_8x8_sse2; c->warp8x8t = dav1d_warp_affine_8x8t_sse2; #endif @@ -137,16 +185,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); @@ -187,16 +235,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); - init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 
8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); @@ -209,6 +257,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + c->avg = dav1d_avg_avx2; c->w_avg = dav1d_w_avg_avx2; c->mask = dav1d_mask_avx2; diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_ssse3.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_sse.asm index 8386897d42b..d98ac621eb9 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_ssse3.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_sse.asm @@ -66,6 +66,8 @@ resize_shuf: times 5 db 0 pb_64: times 16 db 64 pw_m256: times 8 dw -256 +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 pw_8: times 8 dw 8 pw_26: times 8 dw 26 pw_34: times 8 dw 34 @@ -117,6 +119,7 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 %endrep %endmacro +%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep) %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep) @@ -155,6 +158,8 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %endif %endmacro +HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 @@ -738,15 +743,79 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak lea t0d, [hq+(7<<16)] jmp .hv_w16gt +%macro PSHUFB_0X1X 1-2 ; dst[, src] + %if cpuflag(ssse3) + pshufb %1, %2 + %else + punpcklbw %1, %1 + psraw %1, 8 + pshufd %1, %1, q0000 + %endif +%endmacro + +%macro PSHUFB_BILIN_H8 2 ; dst, src + %if cpuflag(ssse3) + pshufb %1, %2 + %else + mova %2, 
%1 + psrldq %1, 1 + punpcklbw %1, %2 + %endif +%endmacro + +%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp + %if cpuflag(ssse3) + pshufb %1, %2 + %else + mova %2, %1 + psrldq %1, 1 + punpckhbw %3, %1, %2 + punpcklbw %1, %2 + punpcklqdq %1, %3 + %endif +%endmacro + +%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero + %if cpuflag(ssse3) + pmaddubsw %1, %2 + %else + %if %5 == 1 + pxor %3, %3 + %endif + punpckhbw %4, %1, %3 + punpcklbw %1, %1, %3 + pmaddwd %4, %2 + pmaddwd %1, %2 + packssdw %1, %4 + %endif +%endmacro + +%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2 + %else + punpckhwd %3, %1, %4 + punpcklwd %1, %4 + pmaddwd %3, %2 + pmaddwd %1, %2 + psrad %3, %5 + psrad %1, %5 + packssdw %1, %3 + %endif +%endmacro + +%macro PREP_BILIN 0 + DECLARE_REG_TMP 3, 5, 6 %if ARCH_X86_32 - %define base t2-prep_ssse3 + %define base t2-prep%+SUFFIX %else %define base 0 %endif + cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - LEA t2, prep_ssse3 + LEA t2, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -755,6 +824,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 test mxyd, mxyd jnz .v .prep: +%if notcpuflag(ssse3) + add t2, prep_ssse3 - prep_sse2 + jmp prep_ssse3 +%else movzx wd, word [t2+wq*2+table_offset(prep,)] add wq, t2 lea stride3q, [strideq*3] @@ -824,10 +897,18 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .prep_w16 RET -.prep_w16gt: +.prep_w32: + mov t2d, 1 + jmp .prep_w32_vloop +.prep_w64: + mov t2d, 2 + jmp .prep_w32_vloop +.prep_w128: + mov t2d, 4 +.prep_w32_vloop: mov t1q, srcq - mov r3q, t2q -.prep_w16gt_hloop: + mov r3d, t2d +.prep_w32_hloop: movq m0, [t1q+8*0] movq m1, [t1q+8*1] movq m2, [t1q+8*2] @@ -847,45 +928,49 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+16*3], m3 add tmpq, 16*4 add t1q, 32 - sub r3q, 1 - jg .prep_w16gt_hloop + dec r3d + jg .prep_w32_hloop lea srcq, [srcq+strideq] - sub hd, 1 - jg .prep_w16gt + dec hd + jg .prep_w32_vloop RET -.prep_w32: - mov t2q, 1 - jmp .prep_w16gt -.prep_w64: - mov t2q, 2 - jmp .prep_w16gt -.prep_w128: - mov t2q, 4 - jmp .prep_w16gt +%endif .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 0xff01 +%if cpuflag(ssse3) mova m4, [base+bilin_h_shuf8] +%endif add mxyd, 16 << 8 - movd xm5, mxyd + movd m5, mxyd mov mxyd, r6m ; my +%if cpuflag(ssse3) pshuflw m5, m5, q0000 punpcklqdq m5, m5 +%else + PSHUFB_0X1X m5 +%endif test mxyd, mxyd jnz .hv %if ARCH_X86_32 mov t1, t2 ; save base reg for w4 %endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] +%if notcpuflag(ssse3) + WIN64_SPILL_XMM 8 + pxor m6, m6 +%endif add wq, t2 lea stride3q, [strideq*3] jmp wq .h_w4: -%if ARCH_X86_32 +%if cpuflag(ssse3) + %if ARCH_X86_32 mova m4, [t1-prep_ssse3+bilin_h_shuf4] -%else + %else mova m4, [bilin_h_shuf4] + %endif %endif .h_w4_loop: movq m0, [srcq+strideq*0] @@ -893,10 +978,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pmaddubsw m0, m5 - pshufb m1, m4 - pmaddubsw m1, m5 + PSHUFB_BILIN_H4 m0, m4, m2 + PMADDUBSW m0, m5, m6, m2, 0 + PSHUFB_BILIN_H4 m1, m4, m2 + PMADDUBSW m1, m5, m6, m2, 0 mova [tmpq+0 ], m0 mova [tmpq+16], m1 add tmpq, 32 @@ -909,14 +994,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, 
[srcq+strideq*4] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -931,14 +1016,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*1+8*0] movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -947,52 +1032,60 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .h_w16 RET -.h_w16gt: +.h_w32: + mov t2d, 1 << 0 + jmp .h_w32_vloop +.h_w64: + mov t2d, 1 << 1 + jmp .h_w32_vloop +.h_w128: + mov t2d, 1 << 3 +.h_w32_vloop: mov t1q, srcq - mov r3q, t2q -.h_w16gt_hloop: + mov r3d, t2d +.h_w32_hloop: movu m0, [t1q+8*0] movu m1, [t1q+8*1] movu m2, [t1q+8*2] movu m3, [t1q+8*3] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add t1q, 32 - sub r3q, 1 - jg .h_w16gt_hloop + shr r3d, 1 + jnz .h_w32_hloop lea srcq, [srcq+strideq] sub hd, 1 - jg .h_w16gt + jg .h_w32_vloop RET -.h_w32: - mov t2q, 1 - jmp .h_w16gt -.h_w64: - mov t2q, 2 - jmp .h_w16gt -.h_w128: - mov t2q, 4 - jmp .h_w16gt .v: +%if notcpuflag(ssse3) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 +%endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 0xff01 add mxyd, 16 << 8 add wq, t2 lea stride3q, [strideq*3] movd m5, mxyd +%if cpuflag(ssse3) pshuflw m5, m5, q0000 punpcklqdq m5, m5 +%else + PSHUFB_0X1X m5 + pxor m6, m6 +%endif jmp wq .v_w4: movd m0, [srcq+strideq*0] @@ -1004,14 +1097,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 punpcklwd m0, m1 ; 0 1 _ _ punpcklwd m1, m2 ; 1 2 _ _ punpcklbw m1, m0 - pmaddubsw m1, m5 + PMADDUBSW m1, m5, m6, m7, 0 pshufd m1, m1, q3120 mova [tmpq+16*0], m1 movd m0, [srcq+strideq*0] punpcklwd m2, m3 ; 2 3 _ _ punpcklwd m3, m0 ; 3 4 _ _ punpcklbw m3, m2 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 pshufd m3, m3, q3120 mova [tmpq+16*1], m3 add tmpq, 32 @@ -1025,20 +1118,20 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m2, [srcq+strideq*1] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 + shufpd m4, m0, m1, 0x0c ; 0 2 movq m0, [srcq+strideq*0] - shufpd m2, m3, 0x0c ; 1 3 - shufpd m1, m0, 0x0c ; 2 4 + shufpd m2, m3, 0x0c ; 1 3 + shufpd m1, m0, 0x0c ; 2 4 punpcklbw m3, m2, m4 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m3 punpckhbw m3, m2, m4 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 
mova [tmpq+16*2], m3 punpcklbw m3, m1, m2 punpckhbw m1, m2 - pmaddubsw m3, m5 - pmaddubsw m1, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*1], m3 mova [tmpq+16*3], m1 add tmpq, 16*4 @@ -1052,14 +1145,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*2] punpcklbw m3, m1, m0 punpckhbw m4, m1, m0 - pmaddubsw m3, m5 - pmaddubsw m4, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m3 mova [tmpq+16*1], m4 punpcklbw m3, m2, m1 punpckhbw m4, m2, m1 - pmaddubsw m3, m5 - pmaddubsw m4, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m3 mova [tmpq+16*3], m4 movu m3, [srcq+stride3q ] @@ -1068,14 +1161,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 add tmpq, 16*8 punpcklbw m1, m3, m2 punpckhbw m4, m3, m2 - pmaddubsw m1, m5 - pmaddubsw m4, m5 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq-16*4], m1 mova [tmpq-16*3], m4 punpcklbw m1, m0, m3 punpckhbw m2, m0, m3 - pmaddubsw m1, m5 - pmaddubsw m2, m5 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq-16*2], m1 mova [tmpq-16*1], m2 sub hd, 4 @@ -1084,6 +1177,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w32: lea t2d, [hq+(0<<16)] mov t0d, 64 + jmp .v_w32_start +.v_w64: + lea t2d, [hq+(1<<16)] + mov t0d, 128 + jmp .v_w32_start +.v_w128: + lea t2d, [hq+(3<<16)] + mov t0d, 256 .v_w32_start: %if ARCH_X86_64 %if WIN64 @@ -1092,43 +1193,43 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov r7, tmpq %endif mov t1, srcq -.v_w32_loop_h: - movu m0, [srcq+strideq*0+16*0] ; 0L - movu m1, [srcq+strideq*0+16*1] ; 0U -.v_w32_loop_v: - movu m2, [srcq+strideq*1+16*0] ; 1L - movu m3, [srcq+strideq*1+16*1] ; 1U +.v_w32_hloop: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] +.v_w32_vloop: + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m2, m0 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpckhbw m4, m2, m0 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m4 punpcklbw m4, m3, m1 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpckhbw m4, m3, m1 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m4 add tmpq, t0q - movu m0, [srcq+strideq*0+16*0] ; 2L - movu m1, [srcq+strideq*0+16*1] ; 2U + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] punpcklbw m4, m0, m2 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpckhbw m4, m0, m2 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m4 punpcklbw m4, m1, m3 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpckhbw m4, m1, m3 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m4 add tmpq, t0q sub hd, 2 - jg .v_w32_loop_v + jg .v_w32_vloop movzx hd, t2w add t1, 32 mov srcq, t1 @@ -1141,62 +1242,78 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov tmpmp, tmpq %endif sub t2d, 1<<16 - jg .v_w32_loop_h + jg .v_w32_hloop %if WIN64 POP r7 %endif RET -.v_w64: - lea t2d, [hq+(1<<16)] - mov t0d, 128 - jmp .v_w32_start -.v_w128: - lea t2d, [hq+(3<<16)] - mov t0d, 256 - jmp .v_w32_start .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 8 +%assign stack_offset 
stack_offset - stack_size_padded +%if cpuflag(ssse3) + WIN64_SPILL_XMM 8 +%else + WIN64_SPILL_XMM 10 +%endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] +%if cpuflag(ssse3) shl mxyd, 11 - movd xm6, mxyd +%else + %if ARCH_X86_64 + mova m8, [pw_8] + %else + %define m8 [pw_8] + %endif + pxor m7, m7 +%endif + movd m6, mxyd add wq, t2 pshuflw m6, m6, q0000 +%if cpuflag(ssse3) punpcklqdq m6, m6 +%else + %if ARCH_X86_64 + psrlw m0, m8, 3 + punpcklwd m6, m0 + %else + punpcklwd m6, [base+pw_1] + %endif +%endif %if ARCH_X86_32 mov t1, t2 ; save base reg for w4 %endif lea stride3q, [strideq*3] jmp wq .hv_w4: -%if ARCH_X86_32 +%if cpuflag(ssse3) + %if ARCH_X86_32 mova m4, [t1-prep_ssse3+bilin_h_shuf4] -%else + %else mova m4, [bilin_h_shuf4] + %endif %endif - movq m0, [srcq+strideq*0] ; 0 _ - punpcklqdq m0, m0 - pshufb m0, m4 - pmaddubsw m0, m5 + movhps m0, [srcq+strideq*0] + PSHUFB_BILIN_H4 m0, m4, m3 + PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] - movhps m1, [srcq+strideq*2] ; 1 _ 2 _ + movhps m1, [srcq+strideq*2] movq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - movhps m2, [srcq+strideq*0] ; 3 _ 4 _ - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 ; 1 + 2 + - shufpd m3, m0, m1, 0x01 ; 0 + 1 + - pmaddubsw m0, m2, m5 ; 3 + 4 + - shufpd m2, m1, m0, 0x01 ; 2 + 3 + + movhps m2, [srcq+strideq*0] + PSHUFB_BILIN_H4 m1, m4, m3 + PSHUFB_BILIN_H4 m2, m4, m3 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 + shufpd m3, m0, m1, 0x01 ; 0 1 + mova m0, m2 + PMADDUBSW m0, m5, m7, m4, 0 ; 3 4 + shufpd m2, m1, m0, 0x01 ; 2 3 psubw m1, m3 - pmulhrsw m1, m6 + PMULHRSW m1, m6, m4, m8, 4 paddw m1, m3 psubw m3, m0, m2 - pmulhrsw m3, m6 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m2 mova [tmpq+16*0], m1 mova [tmpq+16*1], m3 @@ -1205,46 +1322,74 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .hv_w4_loop RET .hv_w8: - movu m0, [srcq+strideq*0] - pshufb m0, m4 - pmaddubsw m0, m5 ; 0 + + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: - movu m1, [srcq+strideq*1] ; 1 - movu m2, [srcq+strideq*2] ; 2 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 ; 1 + - pmaddubsw m2, m5 ; 2 + - psubw m3, m1, m0 ; 1-0 - pmulhrsw m3, m6 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 + PMADDUBSW m2, m5, m7, m4, 0 ; 2 + psubw m3, m1, m0 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 - psubw m7, m2, m1 ; 2-1 - pmulhrsw m7, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m9, m7 +%endif + psubw m7, m2, m1 + PMULHRSW m7, m6, m4, m8, 4 paddw m7, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m7 - movu m1, [srcq+stride3q ] ; 3 - lea srcq, [srcq+strideq*4] - movu m0, [srcq+strideq*0] ; 4 - pshufb m1, m4 - pshufb m0, m4 - pmaddubsw m1, m5 ; 3 + - pmaddubsw m0, m5 ; 4 + - psubw m3, m1, m2 ; 3-2 - pmulhrsw m3, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m7, m9 +%endif + movu m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3 + PMADDUBSW m0, m5, m7, m4, 0 ; 4 + psubw m3, m1, m2 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m2 - psubw m7, m0, m1 ; 4-3 - pmulhrsw m7, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m9, m7 +%endif + psubw m7, m0, m1 + PMULHRSW m7, m6, m4, m8, 4 paddw m7, m1 mova [tmpq+16*2], m3 mova [tmpq+16*3], m7 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m7, m9 + %else + pxor m7, m7 + %endif +%endif add tmpq, 16*4 sub hd, 4 jg 
.hv_w8_loop RET .hv_w16: - lea t2d, [hq+(0<<16)] + mov t2d, hd mov t0d, 32 + jmp .hv_w16_start +.hv_w32: + lea t2d, [hq+(1<<16)] + mov t0d, 64 + jmp .hv_w16_start +.hv_w64: + lea t2d, [hq+(3<<16)] + mov t0d, 128 + jmp .hv_w16_start +.hv_w128: + lea t2d, [hq+(7<<16)] + mov t0d, 256 .hv_w16_start: %if ARCH_X86_64 %if WIN64 @@ -1253,47 +1398,47 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov r7, tmpq %endif mov t1, srcq -.hv_w16_loop_h: - movu m0, [srcq+strideq*0+8*0] ; 0L - movu m1, [srcq+strideq*0+8*1] ; 0U - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 ; 0L + - pmaddubsw m1, m5 ; 0U + -.hv_w16_loop_v: - movu m2, [srcq+strideq*1+8*0] ; 1L - pshufb m2, m4 - pmaddubsw m2, m5 ; 1L + - psubw m3, m2, m0 ; 1L-0L - pmulhrsw m3, m6 +.hv_w16_hloop: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0a + PMADDUBSW m1, m5, m7, m4, 0 ; 0b +.hv_w16_vloop: + movu m2, [srcq+strideq*1+8*0] + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m2, m5, m7, m4, 0 ; 1a + psubw m3, m2, m0 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova [tmpq+16*0], m3 - movu m3, [srcq+strideq*1+8*1] ; 1U - lea srcq, [srcq+strideq*2] - pshufb m3, m4 - pmaddubsw m3, m5 ; 1U + - psubw m0, m3, m1 ; 1U-0U - pmulhrsw m0, m6 + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m3, m5, m7, m4, 0 ; 1b + psubw m0, m3, m1 + PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, t0q - movu m0, [srcq+strideq*0+8*0] ; 2L - pshufb m0, m4 - pmaddubsw m0, m5 ; 2L + - psubw m1, m0, m2 ; 2L-1L - pmulhrsw m1, m6 + movu m0, [srcq+strideq*0+8*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 2a + psubw m1, m0, m2 + PMULHRSW m1, m6, m4, m8, 4 paddw m1, m2 mova [tmpq+16*0], m1 - movu m1, [srcq+strideq*0+8*1] ; 2U - pshufb m1, m4 - pmaddubsw m1, m5 ; 2U + - psubw m2, m1, m3 ; 2U-1U - pmulhrsw m2, m6 + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 2b + psubw m2, m1, m3 + PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, t0q sub hd, 2 - jg .hv_w16_loop_v + jg .hv_w16_vloop movzx hd, t2w add t1, 16 mov srcq, t1 @@ -1306,23 +1451,12 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov tmpmp, tmpq %endif sub t2d, 1<<16 - jg .hv_w16_loop_h + jg .hv_w16_hloop %if WIN64 POP r7 %endif RET -.hv_w32: - lea t2d, [hq+(1<<16)] - mov t0d, 64 - jmp .hv_w16_start -.hv_w64: - lea t2d, [hq+(3<<16)] - mov t0d, 128 - jmp .hv_w16_start -.hv_w128: - lea t2d, [hq+(7<<16)] - mov t0d, 256 - jmp .hv_w16_start +%endmacro ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 @@ -2439,13 +2573,198 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w8_loop0 RET -%if ARCH_X86_32 -DECLARE_REG_TMP 1, 2 -%elif WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif +%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask + %if cpuflag(ssse3) + pshufb %1, %2 + %else + %if %5 == 1 + pcmpeqd %2, %2 + psrlq %2, 32 + %endif + psrldq %3, %1, 1 + pshufd %3, %3, q2301 + pand %1, %2 + pandn %4, %2, %3 + por %1, %4 + %endif +%endmacro + +%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask + %ifnidn %1, %2 + mova %1, %2 + %endif + PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro + +%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask + %if notcpuflag(ssse3) + psrlq %1, %2, 16 + %elifnidn %1, %2 + mova %1, %2 + %endif + 
PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro + +%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] + %if cpuflag(ssse3) + palignr %1, %2, %3, %4 + %else + %if %0 == 4 + %assign %%i regnumof%+%1 + 1 + %define %%tmp m %+ %%i + %else + %define %%tmp %5 + %endif + psrldq %1, %3, %4 + pslldq %%tmp, %2, 16-%4 + por %1, %%tmp + %endif +%endmacro + +%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 + %if cpuflag(ssse3) + phaddw %1, %2 + %else + %ifnidn %1, %2 + %if %4 == 1 + mova %3, [pw_1] + %endif + pmaddwd %1, %3 + pmaddwd %2, %3 + packssdw %1, %2 + %else + %if %4 == 1 + pmaddwd %1, [pw_1] + %else + pmaddwd %1, %3 + %endif + packssdw %1, %1 + %endif + %endif +%endmacro + +%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2, %3 + %else + paddw %1, %2, %3 + psraw %1, %4 + %endif +%endmacro + +%macro PMULHRSW_8192 3 ; dst, src1, src2 + PMULHRSW_POW2 %1, %2, %3, 2 +%endmacro + +%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] + movd %1, [%2+0] + movd %3, [%2+1] + movd %4, [%2+2] + movd %5, [%2+3] + punpckldq %1, %3 + punpckldq %4, %5 + punpcklqdq %1, %4 +%endmacro + +%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc + %if cpuflag(ssse3) + movu m%1, [%2] + pshufb m2, m%1, m11 ; subpel_h_shufB + pshufb m3, m%1, m9 ; subpel_h_shufC + pshufb m%1, m10 ; subpel_h_shufA + %else + %if ARCH_X86_64 + SWAP m12, m5 + SWAP m13, m6 + SWAP m14, m7 + %define %%mx0 m%+%%i + %define %%mx1 m%+%%j + %assign %%i 0 + %rep 12 + movd %%mx0, [%2+%%i] + %assign %%i %%i+1 + %endrep + %assign %%i 0 + %rep 6 + %assign %%j %%i+1 + punpckldq %%mx0, %%mx1 + %assign %%i %%i+2 + %endrep + %assign %%i 0 + %rep 3 + %assign %%j %%i+2 + punpcklqdq %%mx0, %%mx1 + %assign %%i %%i+4 + %endrep + SWAP m%1, m0 + SWAP m2, m4 + SWAP m3, m8 + SWAP m5, m12 + SWAP m6, m13 + SWAP m7, m14 + %else + PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 + SWAP m%1, m0 + %endif + %endif +%endmacro + +%macro PREP_8TAP_H 2 ; dst, src_memloc + PREP_8TAP_H_LOAD %1, %2 + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m8, m1 + SWAP m9, m7 + %endif + %xdefine mX m%+%1 + %assign %%i regnumof%+mX + %define mX m%+%%i + mova m4, m2 + PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 + PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 + PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 + PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0 + %undef mX + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m1, m8 + SWAP m7, m9 + %endif + paddw m3, m4 + paddw m%1, m2 + PHADDW m%1, m3, m15, ARCH_X86_32 + %if ARCH_X86_64 || cpuflag(ssse3) + PMULHRSW_8192 m%1, m%1, m7 + %else + PMULHRSW_8192 m%1, m%1, [base+pw_2] + %endif +%endmacro + +%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2] + %if cpuflag(ssse3) + movu %1, [%2] + pshufb m2, %1, shufB + pshufb m3, %1, shufC + pshufb %1, shufA + %else + PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 + %endif +%endmacro + +%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] + PREP_8TAP_HV_LOAD %{1:4} + mova m1, m2 + PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 + PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 + PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 + PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 + paddw m1, m3 ; C0+B4 + paddw %1, m2 ; A0+C4 + PHADDW %1, m1, %3, 1 +%endmacro + %macro PREP_8TAP_FN 3 ; type, type_h, type_v cglobal prep_8tap_%1 mov t0d, FILTER_%2 @@ -2455,6 +2774,14 @@ cglobal prep_8tap_%1 %endif %endmacro +%macro PREP_8TAP 0 +%if ARCH_X86_32 + DECLARE_REG_TMP 1, 2 +%elif WIN64 + 
DECLARE_REG_TMP 6, 4 +%else + DECLARE_REG_TMP 6, 7 +%endif PREP_8TAP_FN regular, REGULAR, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH @@ -2467,14 +2794,13 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH %if ARCH_X86_32 %define base_reg r2 - %define base base_reg-prep_ssse3 + %define base base_reg-prep%+SUFFIX %define W32_RESTORE_SSQ mov strideq, stridem %else %define base_reg r7 %define base 0 %define W32_RESTORE_SSQ %endif - cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 @@ -2484,13 +2810,13 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movsxd wq, wm movifnidn srcd, srcm movifnidn hd, hm - LEA base_reg, prep_ssse3 test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v + LEA base_reg, prep_ssse3 tzcnt wd, wd - movzx wd, word [base_reg+wq*2+table_offset(prep,)] + movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] @@ -2501,25 +2827,49 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif jmp wq .h: + LEA base_reg, prep%+SUFFIX test myd, 0xf00 jnz .hv +%if cpuflag(ssse3) WIN64_SPILL_XMM 12 +%else + WIN64_SPILL_XMM 16 +%endif cmp wd, 4 je .h_w4 tzcnt wd, wd -%if ARCH_X86_64 +%if cpuflag(ssse3) + %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] + %else + %define m10 [base+subpel_h_shufA] + %define m11 [base+subpel_h_shufB] + %define m9 [base+subpel_h_shufC] + %endif %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0] + movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0] pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4] + movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4] pshufd m6, m6, q0000 +%if cpuflag(ssse3) mova m7, [base+pw_8192] +%else + punpcklbw m5, m5 + punpcklbw m6, m6 + psraw m5, 8 + psraw m6, 8 + %if ARCH_X86_64 + mova m7, [pw_2] + mova m15, [pw_1] + %else + %define m15 m4 + %endif +%endif add wq, base_reg jmp wq .h_w4: @@ -2529,39 +2879,115 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movzx mxd, mxb %endif dec srcq - movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] + movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] pshufd m4, m4, q0000 +%if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] +%else + mova m6, [base+pw_2] + %if ARCH_X86_64 + mova m14, [pw_1] + %else + %define m14 m7 + %endif + punpcklbw m4, m4 + psraw m4, 8 +%endif W32_RESTORE_SSQ %if ARCH_X86_64 lea stride3q, [strideq*3] %endif .h_w4_loop: +%if cpuflag(ssse3) movq m0, [srcq+strideq*0] ; 0 movq m1, [srcq+strideq*1] ; 1 -%if ARCH_X86_32 + %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m2, [srcq+strideq*0] ; 2 movq m3, [srcq+strideq*1] ; 3 lea srcq, [srcq+strideq*2] -%else + %else movq m2, [srcq+strideq*2] ; 2 movq m3, [srcq+stride3q ] ; 3 lea srcq, [srcq+strideq*4] -%endif - pshufb m0, m5 ; subpel_h_shufA + %endif + pshufb m0, m5 pshufb m1, m5 pshufb m2, m5 pshufb m3, m5 - pmaddubsw m0, m4 ; subpel_filters + 2 - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmaddubsw m3, m4 - phaddw m0, m1 - phaddw m2, m3 - pmulhrsw m0, m6 ; pw_8192 - pmulhrsw m2, m6 ; pw_8192 +%else + %if ARCH_X86_64 + movd m0, [srcq+strideq*0+0] + movd m12, [srcq+strideq*0+1] + movd m1, [srcq+strideq*1+0] + movd m5, [srcq+strideq*1+1] + movd m2, [srcq+strideq*2+0] + movd 
m13, [srcq+strideq*2+1] + movd m3, [srcq+stride3q +0] + movd m7, [srcq+stride3q +1] + punpckldq m0, m12 + punpckldq m1, m5 + punpckldq m2, m13 + punpckldq m3, m7 + movd m12, [srcq+strideq*0+2] + movd m8, [srcq+strideq*0+3] + movd m5, [srcq+strideq*1+2] + movd m9, [srcq+strideq*1+3] + movd m13, [srcq+strideq*2+2] + movd m10, [srcq+strideq*2+3] + movd m7, [srcq+stride3q +2] + movd m11, [srcq+stride3q +3] + lea srcq, [srcq+strideq*4] + punpckldq m12, m8 + punpckldq m5, m9 + punpckldq m13, m10 + punpckldq m7, m11 + punpcklqdq m0, m12 ; 0 + punpcklqdq m1, m5 ; 1 + punpcklqdq m2, m13 ; 2 + punpcklqdq m3, m7 ; 3 + %else + movd m0, [srcq+strideq*0+0] + movd m1, [srcq+strideq*0+1] + movd m2, [srcq+strideq*0+2] + movd m3, [srcq+strideq*0+3] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklqdq m0, m2 ; 0 + movd m1, [srcq+strideq*1+0] + movd m2, [srcq+strideq*1+1] + movd m3, [srcq+strideq*1+2] + movd m7, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m1, m2 + punpckldq m3, m7 + punpcklqdq m1, m3 ; 1 + movd m2, [srcq+strideq*0+0] + movd m3, [srcq+strideq*0+1] + movd m7, [srcq+strideq*0+2] + movd m5, [srcq+strideq*0+3] + punpckldq m2, m3 + punpckldq m7, m5 + punpcklqdq m2, m7 ; 2 + movd m3, [srcq+strideq*1+0] + movd m7, [srcq+strideq*1+1] + punpckldq m3, m7 + movd m7, [srcq+strideq*1+2] + movd m5, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m7, m5 + punpcklqdq m3, m7 ; 3 + %endif +%endif + PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 + PMADDUBSW m1, m4, m5, m7, 0 + PMADDUBSW m2, m4, m5, m7, 0 + PMADDUBSW m3, m4, m5, m7, 0 + PHADDW m0, m1, m14, ARCH_X86_32 + PHADDW m2, m3, m14, 0 + PMULHRSW_8192 m0, m0, m6 + PMULHRSW_8192 m2, m2, m6 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 32 @@ -2569,55 +2995,41 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w4_loop RET ; -%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3] -%if ARCH_X86_32 - pshufb %2, %1, [base+subpel_h_shufB] - pshufb %3, %1, [base+subpel_h_shufC] - pshufb %1, [base+subpel_h_shufA] -%else - pshufb %2, %1, m11; subpel_h_shufB - pshufb %3, %1, m9 ; subpel_h_shufC - pshufb %1, m10 ; subpel_h_shufA -%endif - pmaddubsw %4, %2, m5 ; subpel +0 B0 - pmaddubsw %2, m6 ; subpel +4 B4 - pmaddubsw %3, m6 ; subpel +4 C4 - pmaddubsw %1, m5 ; subpel +0 A0 - paddw %3, %4 - paddw %1, %2 - phaddw %1, %3 - pmulhrsw %1, m7 ; 8192 -%endmacro - ; .h_w8: %if ARCH_X86_32 mov r3, r2 - %define base_reg r3 + %define base_reg r3 W32_RESTORE_SSQ %endif .h_w8_loop: - movu m0, [srcq+strideq*0] - movu m1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - PREP_8TAP_H m0, m2, m3, m4 - PREP_8TAP_H m1, m2, m3, m4 +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+strideq*0 + PREP_8TAP_H 1, srcq+strideq*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 + lea srcq, [srcq+strideq*2] add tmpq, 32 sub hd, 2 +%else + PREP_8TAP_H 0, srcq + mova [tmpq], m0 + add srcq, strideq + add tmpq, 16 + dec hd +%endif jg .h_w8_loop RET .h_w16: - xor r6d, r6d + mov r6, -16*1 jmp .h_start .h_w32: - mov r6, -16*1 + mov r6, -16*2 jmp .h_start .h_w64: - mov r6, -16*3 + mov r6, -16*4 jmp .h_start .h_w128: - mov r6, -16*7 + mov r6, -16*8 .h_start: %if ARCH_X86_32 mov r3, r2 @@ -2627,15 +3039,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov r5, r6 W32_RESTORE_SSQ .h_loop: - movu m0, [srcq+r6+8*0] - movu m1, [srcq+r6+8*1] - PREP_8TAP_H m0, m2, m3, m4 - PREP_8TAP_H m1, m2, m3, m4 +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+r6+8*0 + PREP_8TAP_H 1, srcq+r6+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r6, 16 - jle .h_loop +%else + 
PREP_8TAP_H 0, srcq+r6 + mova [tmpq], m0 + add tmpq, 16 + add r6, 8 +%endif + jl .h_loop add srcq, strideq mov r6, r5 dec hd @@ -2644,8 +3061,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %if ARCH_X86_32 %define base_reg r2 %endif - + ; .v: + LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f @@ -2657,30 +3075,40 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3] + lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) mova m2, [base+pw_512] psrlw m2, m2, 1 ; 0x0100 mova m7, [base+pw_8192] +%endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed + %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 + %else + ALLOC_STACK -mmsize*5 + %endif %assign regs_used 7 movd m0, [myq+0] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel0, m0 movd m0, [myq+2] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel1, m0 movd m0, [myq+4] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel2, m0 movd m0, [myq+6] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel3, m0 + %if notcpuflag(ssse3) + mov r6, base_reg + %define base_reg r6 + %endif mov strideq, [rstk+stack_offset+gprsize*3] lea strideq, [strideq*3] sub [rstk+stack_offset+gprsize*2], strideq @@ -2692,25 +3120,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpel2 m10 %define subpel3 m11 movd subpel0, [myq+0] - pshufb subpel0, m2 + PSHUFB_0X1X subpel0, m2 movd subpel1, [myq+2] - pshufb subpel1, m2 + PSHUFB_0X1X subpel1, m2 movd subpel2, [myq+4] - pshufb subpel2, m2 + PSHUFB_0X1X subpel2, m2 movd subpel3, [myq+6] - pshufb subpel3, m2 + PSHUFB_0X1X subpel3, m2 lea stride3q, [strideq*3] sub srcq, stride3q cmp wd, 8 - jg .v_w16 - je .v_w8 + jns .v_w8 %endif .v_w4: -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize - %define srcm [rsp+mmsize*4+gprsize*1] - %define tmpm [rsp+mmsize*4+gprsize*2] +%if notcpuflag(ssse3) + pxor m6, m6 + %if ARCH_X86_64 + mova m7, [base+pw_2] + %endif %endif +%if ARCH_X86_32 + %if STACK_ALIGNMENT < mmsize + %define srcm [esp+stack_size+gprsize*1] + %define tmpm [esp+stack_size+gprsize*2] + %endif mov tmpm, tmpq mov srcm, srcq lea r5d, [wq - 4] ; horizontal loop @@ -2743,17 +3176,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif punpckldq m3, m1 ; 4 5 _ _ punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 + PALIGNR m4, m3, m2, 4 ; 1 2 3 4 punpcklbw m3, m1 ; 45 56 punpcklbw m1, m2, m4 ; 01 12 punpckhbw m2, m4 ; 23 34 .v_w4_loop: - pmaddubsw m5, m1, subpel0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel0 + %define subpel0 m7 +%endif + mova m5, m1 + PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel1 + %define subpel1 m7 +%endif mova m1, m2 - pmaddubsw m2, subpel1 ; a1 b1 + PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 paddw m5, m2 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel2 + %define subpel2 m7 +%endif mova m2, m3 - pmaddubsw m3, subpel2 ; a2 b2 + PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 paddw m5, m3 movd m4, [srcq+strideq*0] punpckldq m3, m0, m4 ; 6 7 _ _ @@ -2761,9 +3207,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 lea srcq, [srcq+strideq*2] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 - pmaddubsw m4, m3, subpel3 ; a3 b3 +%if notcpuflag(ssse3) + 
%if ARCH_X86_64 + SWAP m12, m0 + %else + mova [esp+mmsize*4], m0 + mova m7, subpel3 + %define subpel3 m7 + %endif +%endif + mova m4, m3 + PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 paddw m5, m4 - pmulhrsw m5, m7 +%if ARCH_X86_64 || cpuflag(ssse3) + %if notcpuflag(ssse3) + SWAP m0, m12 + %endif + PMULHRSW_8192 m5, m5, m7 +%else + mova m0, [esp+mmsize*4] + PMULHRSW_8192 m5, m5, [base+pw_2] +%endif movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] @@ -2781,26 +3245,28 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w4_loop0 %endif RET - +%if ARCH_X86_32 && notcpuflag(ssse3) + %define base_reg r2 +%endif + ; %if ARCH_X86_64 .v_w8: -.v_w16: lea r5d, [wq - 8] ; horizontal loop mov r8, tmpq mov r6, srcq shl r5d, 8 - 3; (wq / 8) << 8 mov r5b, hb .v_w8_loop0: - movq m4, [srcq+strideq*0] ; 0 - movq m5, [srcq+strideq*1] ; 1 + movq m4, [srcq+strideq*0] + movq m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m6, [srcq+strideq*0] ; 2 - movq m0, [srcq+strideq*1] ; 3 + movq m6, [srcq+strideq*0] + movq m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movq m1, [srcq+strideq*0] + movq m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m1, [srcq+strideq*0] ; 4 - movq m2, [srcq+strideq*1] ; 5 - lea srcq, [srcq+strideq*2] ; - movq m3, [srcq+strideq*0] ; 6 + movq m3, [srcq+strideq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 @@ -2812,9 +3278,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w8_loop: - movq m12, [srcq+strideq*1] ; 8 +%if cpuflag(ssse3) + movq m12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] ; 9 + movq m13, [srcq+strideq*0] pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 @@ -2839,8 +3306,43 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 - movu [tmpq+wq*0], xm14 - movu [tmpq+wq*2], xm15 + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 +%else + mova m14, m1 + PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m1, m3 + PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + paddw m14, m3 + mova m3, m5 + PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 + paddw m14, m5 + movq m12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movq m13, [srcq+strideq*0] + shufpd m15, m0, m12, 0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m15, m0 ; 67 + punpckhbw m15, m0 ; 78 + mova m13, m5 + PMADDUBSW m13, subpel3, m7, m12, 0 ; a3 + paddw m14, m13 + PMULHRSW_8192 m14, m14, [base+pw_2] + movu [tmpq+wq*0], m14 + mova m14, m2 + PMADDUBSW m14, subpel0, m7, m12, 0 ; b0 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 + paddw m14, m4 + mova m4, m6 + PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 + paddw m14, m6 + mova m6, m15 + PMADDUBSW m15, subpel3, m7, m12, 0 ; b3 + paddw m14, m15 + PMULHRSW_8192 m14, m14, [base+pw_2] + movu [tmpq+wq*2], m14 +%endif lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop @@ -2857,20 +3359,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel1 %undef subpel2 %undef subpel3 - + ; .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 and mxd, 0x7f - movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] + movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov r5, r2; use as new base %define base_reg r5 %assign 
regs_used 2 @@ -2886,7 +3388,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 - psraw m0, 8 ; sign-extend + psraw m0, 8 pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 @@ -2900,8 +3402,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) ALLOC_STACK mmsize*14, 14 + %else + ALLOC_STACK mmsize*14, 16 + %endif lea stride3q, [strideq*3] sub srcq, stride3q dec srcq @@ -2910,8 +3416,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 - psraw m0, 8 ; sign-extend + psraw m0, 8 + %if cpuflag(ssse3) mova m8, [base+pw_8192] + %else + mova m8, [base+pw_2] + %endif mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 @@ -2919,7 +3429,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 -.hv_w4: +%if notcpuflag(ssse3) + punpcklbw m7, m7 + psraw m7, 8 +%endif %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 @@ -2930,17 +3443,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 - ; - ; %if ARCH_X86_32 - %define w8192reg [base+pw_8192] + %if cpuflag(ssse3) + %define w8192reg [base+pw_8192] + %else + %define w8192reg [base+pw_2] + %endif %define d32reg [base+pd_32] %else %define w8192reg m8 %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%else + %if ARCH_X86_64 + mova m15, [pw_1] + %else + %define m15 m1 + %endif +%endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ movq m4, [srcq+strideq*2] ; 2 _ _ _ @@ -2953,43 +3476,61 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif - pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ - pmaddubsw m2, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m2, m0 ;H 0 1 2 3 - pmulhrsw m2, w8192reg ;H pw_8192 + PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] - pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ - pmaddubsw m2, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m2, m0 ;H 0 1 2 3 - pmulhrsw m2, w8192reg ;H pw_8192 - ; +%endif + PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m2 + %else + mova [esp+mmsize*4], m2 + %endif +%endif ; lower shuffle +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, 
[srcq+strideq*1] ; 4 _ 5 _ movq m4, [srcq+strideq*2] ; 6 _ _ _ - pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ - pmaddubsw m3, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m3, m0 ;H 4 5 6 7 - pmulhrsw m3, w8192reg ;H pw_8192 + PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] - pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ - pmaddubsw m3, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m3, m0 ;H 4 5 6 7 - pmulhrsw m3, w8192reg ;H pw_8192 - ; +%endif + PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m2, m14 + %else + mova m2, [esp+mmsize*4] + %endif +%endif %if ARCH_X86_32 lea srcq, [srcq+strideq*2] add srcq, strideq @@ -2997,7 +3538,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 add srcq, stride3q %endif ;process high - palignr m4, m3, m2, 4;V 1 2 3 4 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3009,7 +3550,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 - palignr m4, m3, m2, 4;V 1 2 3 4 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3023,18 +3564,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+mmsize*4], m5 + %define m15 m3 + %endif +%endif ; +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ - pmaddubsw m4, m7 ;H subpel_filters - phaddw m4, m4 ;H 7 8 7 8 - pmulhrsw m4, w8192reg ;H pw_8192 - palignr m3, m4, m0, 12 ; 6 7 8 7 + PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+mmsize*4] + %endif +%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 @@ -3055,18 +3613,34 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+0xA0], m5 + %endif +%endif ; +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] +%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ - pmaddubsw m4, m7 ;H subpel_filters - phaddw m4, m4 ;H 7 8 7 8 - pmulhrsw m4, w8192reg ;H pw_8192 - 
palignr m3, m4, m0, 12 ; 6 7 8 7 + PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+0xA0] + %endif +%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 @@ -3093,8 +3667,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpelv2 %undef subpelv3 ; - - .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -3113,27 +3685,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] - movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] + movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] ALLOC_STACK -mmsize*13 -%if STACK_ALIGNMENT < mmsize + %if STACK_ALIGNMENT < mmsize mov rstk, r2m - %define tmpm [rsp+mmsize*13+gprsize*1] - %define srcm [rsp+mmsize*13+gprsize*2] - %define stridem [rsp+mmsize*13+gprsize*3] + %define tmpm [rsp+mmsize*13+gprsize*1] + %define srcm [rsp+mmsize*13+gprsize*2] + %define stridem [rsp+mmsize*13+gprsize*3] mov stridem, rstk -%endif + %endif mov r6, r2 -%define base_reg r6 + %define base_reg r6 pshufd m0, m1, q0000 pshufd m1, m1, q1111 punpcklbw m5, m5 - psraw m5, 8 ; sign-extend + %if notcpuflag(ssse3) + punpcklbw m0, m0 + punpcklbw m1, m1 + %endif + psraw m5, 8 + %if notcpuflag(ssse3) + psraw m0, 8 + psraw m1, 8 + %endif pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 @@ -3160,20 +3740,31 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 - movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 punpcklbw m1, m1 - psraw m1, 8 ; sign-extend + %if notcpuflag(ssse3) + punpcklbw subpelh0, subpelh0 + punpcklbw subpelh1, subpelh1 + %endif + psraw m1, 8 + %if notcpuflag(ssse3) + psraw subpelh0, 8 + psraw subpelh1, 8 + %endif pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 + %if notcpuflag(ssse3) + mova m7, [base+pw_2] + %endif lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q @@ -3188,57 +3779,89 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shl r5d, (16 - 2) mov r5w, hw .hv_w8_loop0: - movu m4, [srcq+strideq*0] ; 0 = _ _ - movu m5, [srcq+strideq*1] ; 1 = _ _ - lea srcq, [srcq+strideq*2] -%if ARCH_X86_64 +%if cpuflag(ssse3) + %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] + %define shufA m7 + %define shufB m8 + %define shufC m9 + %else + %define shufA [base+subpel_h_shufA] + %define shufB [base+subpel_h_shufB] + %define shufC [base+subpel_h_shufC] + %endif %endif - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ - movu m6, [srcq+strideq*0] ; 2 = _ _ - movu m0, [srcq+strideq*1] 
; 3 = _ _ + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 lea srcq, [srcq+strideq*2] - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ - ; +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m9, m4 + %else + mova [esp], m4 + %endif +%endif + PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 + PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 + lea srcq, [srcq+strideq*2] +%if cpuflag(ssse3) mova m7, [base+pw_8192] - pmulhrsw m4, m7 ; H pw_8192 - pmulhrsw m5, m7 ; H pw_8192 - pmulhrsw m6, m7 ; H pw_8192 - pmulhrsw m0, m7 ; H pw_8192 - punpcklwd m1, m4, m5 ; 0 1 ~ - punpcklwd m2, m5, m6 ; 1 2 ~ - punpcklwd m3, m6, m0 ; 2 3 ~ +%else + mova m7, [base+pw_2] + %if ARCH_X86_64 + SWAP m4, m9 + %else + mova m4, [esp] + %endif +%endif + PMULHRSW_8192 m4, m4, m7 + PMULHRSW_8192 m5, m5, m7 + PMULHRSW_8192 m6, m6, m7 + PMULHRSW_8192 m0, m0, m7 + punpcklwd m1, m4, m5 ; 01 + punpcklwd m2, m5, m6 ; 12 + punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 - ; +%if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] - movu m4, [srcq+strideq*0] ; 4 = _ _ - movu m5, [srcq+strideq*1] ; 5 = _ _ +%else + %if ARCH_X86_64 + SWAP m8, m7 + SWAP m9, m0 + %else + mova [esp+0x30], m0 + %endif +%endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 lea srcq, [srcq+strideq*2] - movu m6, [srcq+strideq*0] ; 6 = _ _ - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ +%if cpuflag(ssse3) mova m7, [base+pw_8192] - pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ - pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ - pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ - punpcklwd m4, m0, m1 ; 3 4 ~ - punpcklwd m5, m1, m2 ; 4 5 ~ - punpcklwd m6, m2, m3 ; 5 6 ~ - ; +%else + %if ARCH_X86_64 + SWAP m0, m9 + SWAP m7, m8 + %else + mova m0, [esp+0x30] + mova m7, [base+pw_2] + %endif +%endif + PMULHRSW_8192 m1, m4, m7 + PMULHRSW_8192 m2, m5, m7 + PMULHRSW_8192 m3, m6, m7 + punpcklwd m4, m0, m1 ; 34 + punpcklwd m5, m1, m2 ; 45 + punpcklwd m6, m2, m3 ; 56 SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: - ; m8 accu for V a - ; m9 accu for V b SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 @@ -3255,46 +3878,53 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_32] - paddd m0, m5 ; pd_512 - paddd m7, m5 ; pd_512 + paddd m0, m5 + paddd m7, m5 mova accuv0, m0 mova accuv1, m7 %else - pmaddwd m8, m1, subpelv0 ; a0 - pmaddwd m9, m2, subpelv0 ; b0 + pmaddwd accuv0, m1, subpelv0 ; a0 + pmaddwd accuv1, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 - paddd m8, m3 - paddd m9, m4 + paddd accuv0, m3 + paddd accuv1, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 - paddd m8, m5 - paddd m9, m6 + paddd accuv0, m5 + paddd accuv1, m6 mova m7, [base+pd_32] - paddd m8, m7 ; pd_512 - paddd m9, m7 ; pd_512 + paddd accuv0, m7 + paddd accuv1, m7 + %if cpuflag(ssse3) mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] + %define shufA m5 + %define shufB m7 + %define shufC m6 + %endif %endif - movu m0, [srcq+strideq*1] ; 7 - movu m4, [srcq+strideq*2] ; 8 + PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 + PREP_8TAP_HV m4, srcq+strideq*2, m5, m6 lea srcq, [srcq+strideq*2] - HV_H_W8 m0, m1, m2, m3, m5, m7, m6 - HV_H_W8 m4, m1, m2, m3, m5, m7, m6 +%if cpuflag(ssse3) mova m5, [base+pw_8192] - 
pmulhrsw m0, m5 ; H pw_8192 - pmulhrsw m4, m5 ; H pw_8192 +%else + mova m5, [base+pw_2] +%endif + PMULHRSW_8192 m0, m0, m5 + PMULHRSW_8192 m4, m4, m5 RESTORELINE_W8 6, m6 - punpcklwd m5, m6, m0 ; 6 7 ~ - punpcklwd m6, m0, m4 ; 7 8 ~ + punpcklwd m5, m6, m0 ; 67 + punpcklwd m6, m0, m4 ; 78 pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 - paddd m1, m1, accuv1 ; H + V + paddd m1, m1, accuv1 psrad m2, 6 psrad m1, 6 - packssdw m2, m1 ; d -> w + packssdw m2, m1 movq [tmpq+wq*0], m2 movhps [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] @@ -3323,6 +3953,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 sub r5d, 1<<16 jg .hv_w8_loop0 RET +%endmacro %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 @@ -3393,7 +4024,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endmacro %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 - ; Can be done using gathers, but that's terribly slow on many CPU:s %if ARCH_X86_32 %define m8 m4 %define m9 m5 @@ -4031,20 +4661,6 @@ ALIGN function_align ret %endmacro -INIT_XMM sse4 -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM ssse3 -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM sse2 -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM ssse3 - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -5091,7 +5707,6 @@ cextern resize_filter %endif %endmacro -INIT_XMM ssse3 %if ARCH_X86_64 cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 @@ -5302,3 +5917,19 @@ cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ %endif jg .loop_y RET + +INIT_XMM ssse3 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse4 +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse2 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm b/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm index f6787148392..756e19b4bb9 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.asm @@ -157,7 +157,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov [t7+msac.rng], t2d not t4 sub t1d, ecx - jge .end ; no refill required + jae .end ; no refill required ; refill: mov t2, [t7+msac.buf] @@ -504,7 +504,7 @@ cglobal msac_decode_bool, 0, 6, 0 mov [t7+msac.rng], t2d not t4 sub t5d, ecx - jge %%end + jae %%end mov t2, [t7+msac.buf] mov rcx, [t7+msac.end] %if UNIX64 == 0 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c index a9dafc757ce..a634da27c4e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c @@ -28,6 +28,7 @@ #include "src/msac.h" #include "src/x86/msac.h" +#if ARCH_X86_64 void dav1d_msac_init_x86(MsacContext *const s) { const unsigned flags = dav1d_get_cpu_flags(); @@ -39,4 +40,4 @@ void dav1d_msac_init_x86(MsacContext *const s) { s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; } } - +#endif diff --git a/chromium/third_party/dav1d/libdav1d/tools/dav1d.c b/chromium/third_party/dav1d/libdav1d/tools/dav1d.c index 97c78014695..4b97a9f20f3 100644 --- a/chromium/third_party/dav1d/libdav1d/tools/dav1d.c +++ b/chromium/third_party/dav1d/libdav1d/tools/dav1d.c @@ -63,7 +63,9 @@ static uint64_t get_time_nanos(void) { QueryPerformanceFrequency(&frequency); LARGE_INTEGER t; QueryPerformanceCounter(&t); - return 1000000000 * t.QuadPart / frequency.QuadPart; + uint64_t seconds = t.QuadPart / 
frequency.QuadPart; + uint64_t fractions = t.QuadPart % frequency.QuadPart; + return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart; #elif defined(HAVE_CLOCK_GETTIME) struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -245,7 +247,7 @@ int main(const int argc, char *const *const argv) { if ((res = output_write(out, &p)) < 0) break; n_out++; - if (nspf) { + if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } @@ -282,7 +284,7 @@ int main(const int argc, char *const *const argv) { if ((res = output_write(out, &p)) < 0) break; n_out++; - if (nspf) { + if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } diff --git a/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c b/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c index 4221feee077..f363033edae 100644 --- a/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c +++ b/chromium/third_party/dav1d/libdav1d/tools/dav1d_cli_parse.c @@ -118,7 +118,7 @@ static void usage(const char *const app, const char *const reason, ...) { " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" - " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n" + " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" " --verify $md5: verify decoded md5. implies --muxer md5, no output\n" diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/input.c b/chromium/third_party/dav1d/libdav1d/tools/input/input.c index d8a56c1822f..3ed6983acee 100644 --- a/chromium/third_party/dav1d/libdav1d/tools/input/input.c +++ b/chromium/third_party/dav1d/libdav1d/tools/input/input.c @@ -82,6 +82,10 @@ int input_open(DemuxerContext **const c_out, return DAV1D_ERR(ENOMEM); } FILE *f = fopen(filename, "rb"); + if (!f) { + fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno)); + return errno ? 
DAV1D_ERR(errno) : DAV1D_ERR(EIO); + } res = !!fread(probe_data, 1, probe_sz, f); fclose(f); if (!res) { diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c b/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c index 746391d4c12..7b572ee73c5 100644 --- a/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c +++ b/chromium/third_party/dav1d/libdav1d/tools/input/ivf.c @@ -28,6 +28,7 @@ #include "config.h" #include <errno.h> +#include <limits.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> @@ -92,8 +93,27 @@ static int ivf_open(IvfInputContext *const c, const char *const file, break; // EOF fseeko(c->f, rl32(data) + 8, SEEK_CUR); } - fps[0] = timebase[0] * *num_frames; - fps[1] = timebase[1] * duration; + + uint64_t fps_num = (uint64_t) timebase[0] * *num_frames; + uint64_t fps_den = (uint64_t) timebase[1] * duration; + if (fps_num && fps_den) { /* Reduce fraction */ + uint64_t gcd = fps_num; + for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b); + fps_num /= gcd; + fps_den /= gcd; + + while ((fps_num | fps_den) > UINT_MAX) { + fps_num >>= 1; + fps_den >>= 1; + } + } + if (fps_num && fps_den) { + fps[0] = (unsigned) fps_num; + fps[1] = (unsigned) fps_den; + } else { + fps[0] = fps[1] = 0; + } + fseeko(c->f, 32, SEEK_SET); return 0; diff --git a/chromium/third_party/dav1d/libdav1d/tools/input/parse.h b/chromium/third_party/dav1d/libdav1d/tools/input/parse.h index bebea21daf7..f5805e8ca45 100644 --- a/chromium/third_party/dav1d/libdav1d/tools/input/parse.h +++ b/chromium/third_party/dav1d/libdav1d/tools/input/parse.h @@ -29,22 +29,24 @@ #ifndef DAV1D_INPUT_PARSE_H #define DAV1D_INPUT_PARSE_H +#include <limits.h> + #include "dav1d/headers.h" static int leb128(FILE *const f, size_t *const len) { + uint64_t val = 0; unsigned i = 0, more; - *len = 0; do { - uint8_t byte; - if (fread(&byte, 1, 1, f) < 1) + uint8_t v; + if (fread(&v, 1, 1, f) < 1) return -1; - more = byte & 0x80; - const unsigned bits = byte & 0x7f; - if (i <= 3 || (i == 4 && bits < (1 << 4))) - *len |= bits << (i * 7); - else if (bits) return -1; - if (++i == 8 && more) return -1; - } while (more); + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << (i * 7); + i++; + } while (more && i < 8); + if (val > UINT_MAX || more) + return -1; + *len = (size_t) val; return i; } @@ -52,18 +54,18 @@ static int leb128(FILE *const f, size_t *const len) { // with author's permission static int leb(const uint8_t *ptr, int sz, size_t *const len) { + uint64_t val = 0; unsigned i = 0, more; - *len = 0; do { if (!sz--) return -1; - const int byte = *ptr++; - more = byte & 0x80; - const unsigned bits = byte & 0x7f; - if (i <= 3 || (i == 4 && bits < (1 << 4))) - *len |= bits << (i * 7); - else if (bits) return -1; - if (++i == 8 && more) return -1; - } while (more); + const int v = *ptr++; + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << (i * 7); + i++; + } while (more && i < 8); + if (val > UINT_MAX || more) + return -1; + *len = (size_t) val; return i; } |
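
The final hunk above rewrites the tools' LEB128 helpers to accumulate the value in a 64-bit integer, cap the loop at 8 bytes, and reject any result that does not fit in an unsigned int; the buffer-based leb() and the FILE-based leb128() apply the same checks. As a rough, self-contained illustration of that bounds logic only (not code from the patch; the function name leb128_decode and the test inputs are invented for the example), a buffer variant might look like this:

/*
 * Illustrative sketch, not part of the patch: standalone LEB128 decoding
 * with the same limits as the revised helpers -- 64-bit accumulator,
 * at most 8 bytes, result must fit in an unsigned int.
 */
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Returns the number of bytes consumed, or -1 on truncation/overflow. */
static int leb128_decode(const uint8_t *ptr, size_t sz, size_t *const len) {
    uint64_t val = 0;
    unsigned i = 0, more;
    do {
        if (i >= sz) return -1;               /* ran out of input */
        const uint8_t v = ptr[i];
        more = v & 0x80;
        val |= (uint64_t)(v & 0x7F) << (i * 7);
        i++;
    } while (more && i < 8);
    if (val > UINT_MAX || more) return -1;    /* too large or unterminated */
    *len = (size_t)val;
    return (int)i;
}

int main(void) {
    const uint8_t ok[] = { 0xE5, 0x8E, 0x26 };            /* encodes 624485 */
    const uint8_t bad[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                            0xFF, 0xFF, 0xFF, 0x7F };      /* exceeds UINT_MAX */
    size_t len;
    printf("ok:  ret=%d len=%zu\n", leb128_decode(ok, sizeof(ok), &len), len);
    printf("bad: ret=%d\n", leb128_decode(bad, sizeof(bad), &len));
    return 0;
}

The invalid case returns -1 both because more than 8 continuation bytes would be needed and because the decoded value exceeds UINT_MAX, which is exactly the condition the patched helpers guard against before assigning to the size_t output.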