diff options
| | | |
|---|---|---|
| author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-12-02 14:31:49 +0100 |
| committer | Michael Niedermayer <michaelni@gmx.at> | 2014-12-03 11:56:22 +0100 |
| commit | 9fa056ba75c089b5120366ab7c5ce8cc4c5bd67a (patch) | |
| tree | f05f5ff297ed51c916d5e7fdb16c6805c4ba4ca5 | |
| parent | 242f1152bf906a4a3164a9a8e40bd52723bd5afe (diff) | |
| download | ffmpeg-9fa056ba75c089b5120366ab7c5ce8cc4c5bd67a.tar.gz | |
pngdsp x86: use unaligned access
Timing results for test images manually generated to contain only up
prediction (one column per image size):

| | 8380x3032 | 255x185 |
|---|---|---|
| before: | 138635 | 1992 |
| after: | 139232 | 1996 |
When instead jumping to the aligned or unaligned version depending on the
actual alignment, the timing is:
8380x3032: 138767
A 0.5% speed improvement for gigantic images is not worth the code
duplication, so only the unaligned version is kept.
Fixes ticket #4148
Signed-off-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Tested-by: Benoit Fouet <benoit.fouet@free.fr>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/pngdsp.h | 4 | ||||
-rw-r--r-- | libavcodec/x86/pngdsp.asm | 12 |
2 files changed, 8 insertions, 8 deletions
```diff
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 1475b0cbe9..fbc1a508e7 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -25,9 +25,9 @@
 #include <stdint.h>
 
 typedef struct PNGDSPContext {
-    void (*add_bytes_l2)(uint8_t *dst /* align 16 */,
+    void (*add_bytes_l2)(uint8_t *dst,
                          uint8_t *src1 /* align 16 */,
-                         uint8_t *src2 /* align 16 */, int w);
+                         uint8_t *src2, int w);
 
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 8e23ccfbc6..678a032521 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
     and waq, ~(mmsize*2-1)
     jmp .end_v
 .loop_v:
-    mova m0, [src1q+iq]
-    mova m1, [src1q+iq+mmsize]
-    paddb m0, [src2q+iq]
-    paddb m1, [src2q+iq+mmsize]
-    mova [dstq+iq ], m0
-    mova [dstq+iq+mmsize], m1
+    movu m0, [src2q+iq]
+    movu m1, [src2q+iq+mmsize]
+    paddb m0, [src1q+iq]
+    paddb m1, [src1q+iq+mmsize]
+    movu [dstq+iq ], m0
+    movu [dstq+iq+mmsize], m1
     add iq, mmsize*2
 .end_v:
     cmp iq, waq
```