author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder
download | libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
Initial WebM release (tag: v0.9.0)
Diffstat (limited to 'vp8/encoder')
103 files changed, 42694 insertions, 0 deletions
diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm new file mode 100644 index 000000000..608c9ae65 --- /dev/null +++ b/vp8/encoder/arm/armv6/walsh_v6.asm @@ -0,0 +1,144 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_short_walsh4x4_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) +|vp8_short_walsh4x4_armv6| PROC + + stmdb sp!, {r4 - r11, lr} + + mov r12, r2 ; ugh. not clean + ldr r2, [r0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, r12]! ; [5 | 4] + ldr r5, [r0, #4] ; [7 | 6] + ldr r6, [r0, r12]! ; [9 | 8] + ldr r7, [r0, #4] ; [11 | 10] + ldr r8, [r0, r12]! ; [13 | 12] + ldr r9, [r0, #4] ; [15 | 14] + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [1 | 2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [0 | 3] [b1+a1 | d1-c1] + qaddsubx r4, r12, lr ; [5 | 6] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [4 | 7] [b1+a1 | d1-c1] + + qsubaddx r10, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r11, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r12, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx lr, r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r10, r11 ; [9 |10] [c1+d1 | a1-b1] + qaddsubx r7, r11, r10 ; [8 |11] [b1+a1 | d1-c1] + qaddsubx r8, r12, lr ; [13|14] [c1+d1 | a1-b1] + qaddsubx r9, lr, r12 ; [12|15] [b1+a1 | d1-c1] + + ; first transform complete + + qadd16 r10, r3, r9 ; a1 [0+12 | 3+15] + qadd16 r11, r5, r7 ; b1 [4+8 | 7+11] + qsub16 r12, r5, r7 ; c1 [4-8 | 7-11] + qsub16 lr, r3, r9 ; d1 [0-12 | 3-15] + + qadd16 r3, r10, r11 ; a2 [a1+b1] [0 | 3] + qadd16 r5, r12, lr ; b2 [c1+d1] [4 | 7] + qsub16 r7, r10, r11 ; c2 [a1-b1] [8 |11] + qsub16 r9, lr, r12 ; d2 [d1-c1] [12|15] + + qadd16 r10, r2, r8 ; a1 [1+13 | 2+14] + qadd16 r11, r4, r6 ; b1 [5+9 | 6+10] + qsub16 r12, r4, r6 ; c1 [5-9 | 6-10] + qsub16 lr, r2, r8 ; d1 [1-13 | 2-14] + + qadd16 r2, r10, r11 ; a2 [a1+b1] [1 | 2] + qadd16 r4, r12, lr ; b2 [c1+d1] [5 | 6] + qsub16 r6, r10, r11 ; c2 [a1-b1] [9 |10] + qsub16 r8, lr, r12 ; d2 [d1-c1] [13|14] + + ; [a-d]2 += ([a-d]2 > 0) + + asrs r10, r3, #16 + addpl r10, r10, #1 ; [~0] + asrs r11, r2, #16 + addpl r11, r11, #1 ; [~1] + lsl r11, r11, #15 ; [1 | x] + pkhtb r10, r11, r10, asr #1; [1 | 0] + str r10, [r1], #4 + + lsls r11, r2, #16 + addpl r11, r11, #0x10000 ; [~2] + lsls r12, r3, #16 + addpl r12, r12, #0x10000 ; [~3] + asr r12, r12, #1 ; [3 | x] + pkhtb r11, r12, r11, asr #17; [3 | 2] + str r11, [r1], #4 + + asrs r2, r5, #16 + addpl r2, r2, #1 ; [~4] + asrs r3, r4, #16 + addpl r3, r3, #1 ; [~5] + lsl r3, r3, #15 ; [5 | x] + pkhtb r2, r3, r2, asr #1 ; [5 | 4] + str r2, [r1], #4 + + lsls r2, r4, #16 + addpl r2, r2, #0x10000 ; [~6] + lsls r3, r5, #16 + addpl r3, r3, #0x10000 ; [~7] + asr r3, r3, #1 ; [7 | x] + pkhtb r2, r3, r2, asr #17 ; [7 | 6] + str r2, [r1], #4 + + asrs r2, r7, #16 + addpl r2, r2, #1 ; [~8] + asrs r3, r6, #16 + addpl r3, r3, #1 ; [~9] + lsl r3, r3, #15 ; [9 | x] + pkhtb r2, r3, r2, asr #1 ; [9 | 8] + str r2, [r1], #4 + + lsls r2, r6, #16 + addpl r2, r2, #0x10000 ; [~10] 
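The register comments in walsh_v6.asm (the byte-store sequence continues below) describe a 4x4 Walsh-Hadamard transform with a "+= (x > 0)" rounding step before the final shift right by one. For orientation, here is a plain-C sketch of the same transform, reconstructed from those comments rather than copied from the generic vp8_short_walsh4x4_c path that the RTCD table falls back to; the function name and the pitch-in-bytes stepping are illustrative:

    /* 4x4 Walsh-Hadamard transform, plain C: "input" is a 4x4 block of
       shorts with a row stride of "pitch" bytes, "output" is 16 shorts. */
    void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i;
        int a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)          /* first pass: rows */
        {
            a1 = ip[0] + ip[3];
            b1 = ip[1] + ip[2];
            c1 = ip[1] - ip[2];
            d1 = ip[0] - ip[3];

            op[0] = a1 + b1;
            op[1] = c1 + d1;
            op[2] = a1 - b1;
            op[3] = d1 - c1;

            ip += pitch / 2;             /* pitch is in bytes, ip is short* */
            op += 4;
        }

        ip = output;
        op = output;

        for (i = 0; i < 4; i++)          /* second pass: columns */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            a2 = a1 + b1;
            b2 = c1 + d1;
            c2 = a1 - b1;
            d2 = d1 - c1;

            a2 += (a2 > 0);              /* the "+= ([a-d]2 > 0)" rounding step */
            b2 += (b2 > 0);
            c2 += (c2 > 0);
            d2 += (d2 > 0);

            op[0]  = a2 >> 1;
            op[4]  = b2 >> 1;
            op[8]  = c2 >> 1;
            op[12] = d2 >> 1;

            ip++;
            op++;
        }
    }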
+ lsls r3, r7, #16 + addpl r3, r3, #0x10000 ; [~11] + asr r3, r3, #1 ; [11 | x] + pkhtb r2, r3, r2, asr #17 ; [11 | 10] + str r2, [r1], #4 + + asrs r2, r9, #16 + addpl r2, r2, #1 ; [~12] + asrs r3, r8, #16 + addpl r3, r3, #1 ; [~13] + lsl r3, r3, #15 ; [13 | x] + pkhtb r2, r3, r2, asr #1 ; [13 | 12] + str r2, [r1], #4 + + lsls r2, r8, #16 + addpl r2, r2, #0x10000 ; [~14] + lsls r3, r9, #16 + addpl r3, r3, #0x10000 ; [~15] + asr r3, r3, #1 ; [15 | x] + pkhtb r2, r3, r2, asr #17 ; [15 | 14] + str r2, [r1] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_walsh4x4_armv6| + + END diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c new file mode 100644 index 000000000..e70b3ad47 --- /dev/null +++ b/vp8/encoder/arm/boolhuff_arm.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "boolhuff.h" +#include "blockd.h" + +const unsigned int vp8_prob_cost[256] = +{ + 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, + 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, + 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, + 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, + 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, + 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, + 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, + 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, + 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, + 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, + 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, + 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, + 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, + 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, + 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 +}; + diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c new file mode 100644 index 000000000..003979680 --- /dev/null +++ b/vp8/encoder/arm/csystemdependent.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
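The vp8_prob_cost[] table added in boolhuff_arm.c above is the bool coder's bit-cost lookup: entry p holds roughly -log2(p/256) scaled by 256 (entry 128 is 255, about one bit; entry 1 is 2047, about eight bits), i.e. the cost in 1/256-bit units of coding a symbol whose probability is p/256. The sketch below regenerates the values for comparison and agrees with the shipped table to within one unit; the function name and the use of libm are illustrative, not part of the source:

    #include <math.h>
    #include <stdio.h>

    /* approximate cost, in 1/256-bit units, of a symbol of probability p/256 */
    static unsigned int prob_cost_sketch(int p)
    {
        unsigned int c = (unsigned int)(-256.0 * log2(p / 256.0));
        if (c > 2047) c = 2047;   /* the table saturates at 2047 ... */
        if (c < 1)    c = 1;      /* ... and never drops below 1     */
        return c;
    }

    int main(void)
    {
        int p;
        for (p = 1; p < 256; p++)   /* compare against vp8_prob_cost[1..255] */
            printf("%4u%c", prob_cost_sketch(p), (p % 16) ? ' ' : '\n');
        printf("\n");
        return 0;
    }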
+ */ + + +#include "vpx_ports/config.h" +#include "variance.h" +#include "onyx_int.h" + +void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); + +void vp8_cmachine_specific_config(VP8_COMP *cpi) +{ +#if CONFIG_RUNTIME_CPU_DETECT + cpi->rtcd.common = &cpi->common.rtcd; + +#if HAVE_ARMV7 + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_neon; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_neon; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_neon; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_neon; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_neon; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_neon; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_neon; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_neon; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_neon; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_neon; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_neon; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_c; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon; + + cpi->rtcd.encodemb.berr = vp8_block_error_c; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c; + cpi->rtcd.encodemb.subb = vp8_subtract_b_neon; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon; + + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; +#elif HAVE_ARMV6 + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_c; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_c; + cpi->rtcd.variance.get16x16var = 
vp8_get16x16var_c;; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6; + + cpi->rtcd.encodemb.berr = vp8_block_error_c; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c; + cpi->rtcd.encodemb.subb = vp8_subtract_b_c; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_c; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c; + + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c; +#else + //pure c + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_c; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_c; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; + + cpi->rtcd.encodemb.berr = vp8_block_error_c; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c; + cpi->rtcd.encodemb.subb = vp8_subtract_b_c; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_c; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c; + + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c; +#endif +#endif + +#if HAVE_ARMV7 + vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon; +#else + vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; +#endif +} diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h new file mode 100644 index 000000000..a671862fb --- /dev/null +++ b/vp8/encoder/arm/dct_arm.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
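vp8_cmachine_specific_config() above is the ARM side of the encoder's run-time CPU detection (RTCD): every speed-critical primitive is reached through a table of function pointers in cpi->rtcd, and this routine fills that table once per encoder instance with NEON, ARMv6 or plain-C implementations. When CONFIG_RUNTIME_CPU_DETECT is off, the #undef/#define blocks in the *_arm.h headers that follow make the same choice at compile time instead. Stripped of the VP8 types, the pattern looks like the sketch below; every name in it is an illustrative stand-in, not the encoder's:

    #include <stdio.h>

    /* cut-down illustration of the RTCD function-pointer pattern */
    typedef int (*sad_fn)(const unsigned char *src, const unsigned char *ref, int n);

    static int sad_c(const unsigned char *src, const unsigned char *ref, int n)
    {
        int i, sum = 0;                          /* portable fallback */
        for (i = 0; i < n; i++)
            sum += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
        return sum;
    }

    static int sad_fast(const unsigned char *src, const unsigned char *ref, int n)
    {
        return sad_c(src, ref, n);               /* stand-in for a NEON/ARMv6 build */
    }

    struct rtcd { sad_fn sad; /* ... one pointer per primitive ... */ };

    static void machine_specific_config(struct rtcd *r, int have_neon)
    {
        r->sad = have_neon ? sad_fast : sad_c;   /* chosen once, at init */
    }

    int main(void)
    {
        unsigned char a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
        struct rtcd r;
        machine_specific_config(&r, 1);
        printf("%d\n", r.sad(a, b, 4));          /* callers go through the table */
        return 0;
    }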
+ */ + + +#ifndef DCT_ARM_H +#define DCT_ARM_H + +#if HAVE_ARMV6 +extern prototype_fdct(vp8_short_walsh4x4_armv6); + +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6 +#endif + +#if HAVE_ARMV7 +extern prototype_fdct(vp8_short_fdct4x4_neon); +extern prototype_fdct(vp8_short_fdct8x4_neon); +extern prototype_fdct(vp8_fast_fdct4x4_neon); +extern prototype_fdct(vp8_fast_fdct8x4_neon); +extern prototype_fdct(vp8_short_walsh4x4_neon); + +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon + +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon + +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon + +#endif + +#endif diff --git a/vp8/encoder/arm/encodemb_arm.c b/vp8/encoder/arm/encodemb_arm.c new file mode 100644 index 000000000..3f1d05391 --- /dev/null +++ b/vp8/encoder/arm/encodemb_arm.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "encodemb.h" +#include "reconinter.h" +#include "quantize.h" +#include "invtrans.h" +#include "recon.h" +#include "reconintra.h" +#include "dct.h" +#include "vpx_mem/vpx_mem.h" + +extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch); + +void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *src_ptr = (*(be->base_src) + be->src); + short *diff_ptr = be->src_diff; + unsigned char *pred_ptr = bd->predictor; + int src_stride = be->src_stride; + + vp8_subtract_b_neon_func(diff_ptr, src_ptr, pred_ptr, src_stride, pitch); +} diff --git a/vp8/encoder/arm/encodemb_arm.h b/vp8/encoder/arm/encodemb_arm.h new file mode 100644 index 000000000..28f9e5c5f --- /dev/null +++ b/vp8/encoder/arm/encodemb_arm.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
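vp8_subtract_b_neon() in encodemb_arm.c above is only a thin wrapper: it pulls the source, predictor and residual pointers out of the BLOCK/BLOCKD structures and hands them to the assembly routine. The operation itself is the usual 4x4 residual computation; in plain C it amounts to roughly the following sketch, where the function name and the layout assumption that predictor and residual share the block pitch are illustrative:

    /* 4x4 residual: diff = src - pred. The source walks its own stride,
       while the residual and predictor blocks share the given pitch. */
    void subtract_b_sketch(short *diff, const unsigned char *src,
                           const unsigned char *pred,
                           int src_stride, int pitch)
    {
        int r, c;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += pitch;
            pred += pitch;
            src  += src_stride;
        }
    }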
+ */ + + +#ifndef ENCODEMB_ARM_H +#define ENCODEMB_ARM_H + +#if HAVE_ARMV7 +//extern prototype_berr(vp8_block_error_c); +//extern prototype_mberr(vp8_mbblock_error_c); +//extern prototype_mbuverr(vp8_mbuverror_c); + +extern prototype_subb(vp8_subtract_b_neon); +extern prototype_submby(vp8_subtract_mby_neon); +extern prototype_submbuv(vp8_subtract_mbuv_neon); + +//#undef vp8_encodemb_berr +//#define vp8_encodemb_berr vp8_block_error_c + +//#undef vp8_encodemb_mberr +//#define vp8_encodemb_mberr vp8_mbblock_error_c + +//#undef vp8_encodemb_mbuverr +//#define vp8_encodemb_mbuverr vp8_mbuverror_c + +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_neon + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_neon + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon + +#endif + +#endif diff --git a/vp8/encoder/arm/mcomp_arm.c b/vp8/encoder/arm/mcomp_arm.c new file mode 100644 index 000000000..07f218605 --- /dev/null +++ b/vp8/encoder/arm/mcomp_arm.c @@ -0,0 +1,1662 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "mcomp.h" +#include "vpx_mem/vpx_mem.h" + +#include <stdio.h> +#include <limits.h> +#include <math.h> + +#ifdef ENTROPY_STATS +static int mv_ref_ct [31] [4] [2]; +static int mv_mode_cts [4] [2]; +#endif + +static int mv_bits_sadcost[256]; + +extern unsigned int vp8_sub_pixel_variance16x16s_neon +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); +extern unsigned int vp8_sub_pixel_variance16x16s_4_0_neon +( + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); +extern unsigned int vp8_sub_pixel_variance16x16s_0_4_neon +( + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); +extern unsigned int vp8_sub_pixel_variance16x16s_4_4_neon +( + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); + +void vp8cx_init_mv_bits_sadcost() +{ + int i; + + for (i = 0; i < 256; i++) + { + mv_bits_sadcost[i] = (int)sqrt(i * 16); + } +} + + +int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight) +{ + // MV costing is based on the distribution of vectors in the previous frame and as such will tend to + // over state the cost of vectors. In addition coding a new vector can have a knock on effect on the + // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks. + // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors. 
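    // Note: the mvcost tables appear to hold costs in 1/256-bit units
    // (mv_bits() below divides the same sum by 256 to get whole bits), and
    // mv components are stored in 1/8-pel units (full-pel values are shifted
    // left by 3 elsewhere in this file), so the >>1 indexes the tables at
    // 1/4-pel resolution. With a Weight of 128 the >>7 leaves the 1/256-bit
    // sum unchanged ((sum * 128) >> 7 == sum); smaller weights discount the
    // vector cost accordingly.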
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7; +} + +int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit) +{ + //int i; + //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8; + //return ( (vp8_mv_bit_cost(mv, ref, mvcost, 100) + 128) * error_per_bit) >> 8; + + //i = (vp8_mv_bit_cost(mv, ref, mvcost, 100) * error_per_bit + 128) >> 8; + return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8; + //return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8; +} + + +static int mv_bits(MV *mv, MV *ref, int *mvcost[2]) +{ + // get the estimated number of bits for a motion vector, to be used for costing in SAD based + // motion estimation + return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8; +} + +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride) +{ + int Len; + int search_site_count = 0; + + + // Generate offsets for 4 search sites per step. + Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) + { + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + // Contract. + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 4; +} + +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) +{ + int Len; + int search_site_count = 0; + + // Generate offsets for 8 search sites per step. + Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) + { + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride - Len; + search_site_count++; + + // Compute offsets for search sites. 
+ x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride + Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride - Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride + Len; + search_site_count++; + + + // Contract. + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 8; +} + + +#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) +#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector +#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc +#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. +#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; +#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost +#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best +#define MIN(x,y) (((x)<(y))?(x):(y)) +#define MAX(x,y) (((x)>(y))?(x):(y)) + +//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; } + +int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + + int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1; + int br = bestmv->row << 2, bc = bestmv->col << 2; + int tr = br, tc = bc; + unsigned int besterr = INT_MAX; + unsigned int left, right, up, down, diag; + unsigned int sse; + unsigned int whichdir; + unsigned int halfiters = 4; + unsigned int quarteriters = 4; + + int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1)); + int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1)); + int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1)); + int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1)); + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + + // calculate central point error + besterr = vf(y, d->pre_stride, z, b->src_stride, &sse); + besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected) + while (--halfiters) + { + // 1/2 pel + CHECK_BETTER(left, tr, tc - 2); + CHECK_BETTER(right, tr, tc + 2); + CHECK_BETTER(up, tr - 2, tc); + CHECK_BETTER(down, tr + 2, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) + { + case 0: + CHECK_BETTER(diag, tr - 2, tc - 2); + break; + case 1: + CHECK_BETTER(diag, tr - 2, tc + 2); + break; + case 2: + CHECK_BETTER(diag, tr + 2, tc - 2); + break; + case 3: + CHECK_BETTER(diag, tr + 2, tc + 2); + break; + } + + // no reason to check the same one again. 
+ if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected) + // 1/4 pel + while (--quarteriters) + { + CHECK_BETTER(left, tr, tc - 1); + CHECK_BETTER(right, tr, tc + 1); + CHECK_BETTER(up, tr - 1, tc); + CHECK_BETTER(down, tr + 1, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) + { + case 0: + CHECK_BETTER(diag, tr - 1, tc - 1); + break; + case 1: + CHECK_BETTER(diag, tr - 1, tc + 1); + break; + case 2: + CHECK_BETTER(diag, tr + 1, tc - 1); + break; + case 3: + CHECK_BETTER(diag, tr + 1, tc + 1); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + bestmv->row = br << 1; + bestmv->col = bc << 1; + + if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL)) + return INT_MAX; + + return besterr; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef ERR +#undef CHECK_BETTER +#undef MIN +#undef MAX +int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + int bestmse = INT_MAX; + MV startmv; + //MV this_mv; + MV this_mv; + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + int whichdir ; + + + // Trap uncodable vectors + if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL)) + { + bestmv->row <<= 3; + bestmv->col <<= 3; + return INT_MAX; + } + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + startmv = *bestmv; + + // calculate central point error + bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // go left then right and check error + this_mv.row = startmv.row; + this_mv.col = ((startmv.col - 8) | 4); + left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse); + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 8; + right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + this_mv.row = ((startmv.row - 8) | 4); + up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + this_mv.row += 8; + down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + + // now check 1 more diagonal + whichdir = (left < right ? 0 : 1) + (up < down ? 
0 : 2); + //for(whichdir =0;whichdir<4;whichdir++) + //{ + this_mv = startmv; + + switch (whichdir) + { + case 0: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + break; + case 1: + this_mv.col += 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + break; + case 2: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row += 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse); + break; + case 3: + this_mv.col += 4; + this_mv.row += 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +// } + + + // time to check quarter pels. + if (bestmv->row < startmv.row) + y -= d->pre_stride; + + if (bestmv->col < startmv.col) + y--; + + startmv = *bestmv; + + + + // go left then right and check error + this_mv.row = startmv.row; + + if (startmv.col & 7) + { + this_mv.col = startmv.col - 2; + left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse); + } + + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 4; + right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + + if (startmv.row & 7) + { + this_mv.row = startmv.row - 2; + up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.row = (startmv.row - 8) | 6; + up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + this_mv.row += 4; + down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + + // now check 1 more diagonal + whichdir = (left < right ? 0 : 1) + (up < down ? 
0 : 2); + +// for(whichdir=0;whichdir<4;whichdir++) +// { + this_mv = startmv; + + switch (whichdir) + { + case 0: + + if (startmv.row & 7) + { + this_mv.row -= 2; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + } + } + else + { + this_mv.row = (startmv.row - 8) | 6; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse); + } + } + + break; + case 1: + this_mv.col += 2; + + if (startmv.row & 7) + { + this_mv.row -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.row = (startmv.row - 8) | 6; + diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + + break; + case 2: + this_mv.row += 2; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + } + + break; + case 3: + this_mv.col += 2; + this_mv.row += 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +// } + + return bestmse; +} + +int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + int bestmse = INT_MAX; + MV startmv; + //MV this_mv; + MV this_mv; + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + + // Trap uncodable vectors + if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL)) + { + bestmv->row <<= 3; + bestmv->col <<= 3; + return INT_MAX; + } + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + startmv = *bestmv; + + // calculate central point error + bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // go left then right and check error + this_mv.row = startmv.row; + this_mv.col = ((startmv.col - 8) | 4); + left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse); + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 8; + right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + this_mv.row = ((startmv.row - 8) | 4); + up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, 
error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + this_mv.row += 8; + down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + // somewhat strangely not doing all the diagonals for half pel is slower than doing them. +#if 0 + // now check 1 more diagonal - + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + this_mv = startmv; + + switch (whichdir) + { + case 0: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 1: + this_mv.col += 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 2: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row += 4; + diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 3: + this_mv.col += 4; + this_mv.row += 4; + diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +#else + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col += 8; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = startmv.row + 4; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col += 8; + diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +#endif + return bestmse; +} + +#if 1 + +#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) +#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector +#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score. 
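    // The hex search below starts from the best full-pel candidate and, at
    // each step, probes six points on a hexagon around it: (row, col)
    // offsets of (+/-2, 0) and (+/-1, +/-2). Once the best of those six is
    // known (index k), only the three points of the next hexagon not already
    // covered need testing, which is what next_chkpts[k][] enumerates; the
    // step count is capped at 127 because each step can move by up to 2.
    // A final pass then checks the eight immediate neighbours of the winner.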
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost +#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best +const MV next_chkpts[6][3] = +{ + {{ -2, 0}, { -1, -2}, {1, -2}}, + {{ -1, -2}, {1, -2}, {2, 0}}, + {{1, -2}, {2, 0}, {1, 2}}, + {{2, 0}, {1, 2}, { -1, 2}}, + {{1, 2}, { -1, 2}, { -2, 0}}, + {{ -1, 2}, { -2, 0}, { -1, -2}} +}; +int vp8_hex_search +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_t vf, + vp8_sad_fn_t sf, + int *mvsadcost[2], + int *mvcost[2] +) +{ + MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ; + MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ; + int i, j; + unsigned char *src = (*(b->base_src) + b->src); + int src_stride = b->src_stride; + int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc; + unsigned int besterr, thiserr = 0x7fffffff; + int k = -1, tk; + + if (bc < x->mv_col_min) bc = x->mv_col_min; + + if (bc > x->mv_col_max) bc = x->mv_col_max; + + if (br < x->mv_row_min) br = x->mv_row_min; + + if (br > x->mv_row_max) br = x->mv_row_max; + + rr >>= 1; + rc >>= 1; + + besterr = ERR(br, bc, thiserr); + + // hex search + //j=0 + tr = br; + tc = bc; + + for (i = 0; i < 6; i++) + { + int nr = tr + hex[i].row, nc = tc + hex[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + //CHECK_BETTER(thiserr,nr,nc); + if ((thiserr = ERR(nr, nc, besterr)) < besterr) + { + besterr = thiserr; + br = nr; + bc = nc; + k = i; + } + } + + if (tr == br && tc == bc) + goto cal_neighbors; + + for (j = 1; j < 127; j++) + { + tr = br; + tc = bc; + tk = k; + + for (i = 0; i < 3; i++) + { + int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + //CHECK_BETTER(thiserr,nr,nc); + if ((thiserr = ERR(nr, nc, besterr)) < besterr) + { + besterr = thiserr; + br = nr; + bc = nc; //k=(tk+5+i)%6;} + k = tk + 5 + i; + + if (k >= 12) k -= 12; + else if (k >= 6) k -= 6; + } + } + + if (tr == br && tc == bc) + break; + } + + // check 8 1 away neighbors +cal_neighbors: + tr = br; + tc = bc; + + for (i = 0; i < 8; i++) + { + int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + best_mv->row = br; + best_mv->col = bc; + + return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef ERR +#undef CHECK_BETTER + +#else + +#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) +#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector +#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score. 
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost +#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best + +int vp8_hex_search +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_t vf, + vp8_sad_fn_t sf, + int *mvsadcost[2], + int *mvcost[2] +) +{ + MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ; + MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ; + int i, j; + unsigned char *src = (*(b->base_src) + b->src); + int src_stride = b->src_stride; + //int rr= ref_mv->row,rc= ref_mv->col,br=rr,bc=rc,tr,tc; + int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc; + unsigned int besterr, thiserr = 0x7fffffff; + + /* + if ( rc < x->mv_col_min) bc = x->mv_col_min; + if ( rc > x->mv_col_max) bc = x->mv_col_max; + if ( rr < x->mv_row_min) br = x->mv_row_min; + if ( rr > x->mv_row_max) br = x->mv_row_max; + rr>>=1; + rc>>=1; + br>>=3; + bc>>=3; + */ + if (bc < x->mv_col_min) bc = x->mv_col_min; + + if (bc > x->mv_col_max) bc = x->mv_col_max; + + if (br < x->mv_row_min) br = x->mv_row_min; + + if (br > x->mv_row_max) br = x->mv_row_max; + + rr >>= 1; + rc >>= 1; + + besterr = ERR(br, bc, thiserr); + + // hex search jbb changed to 127 to avoid max 256 problem steping by 2. + for (j = 0; j < 127; j++) + { + tr = br; + tc = bc; + + for (i = 0; i < 6; i++) + { + int nr = tr + hex[i].row, nc = tc + hex[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + if (tr == br && tc == bc) + break; + } + + // check 8 1 away neighbors + tr = br; + tc = bc; + + for (i = 0; i < 8; i++) + { + int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + best_mv->row = br; + best_mv->col = bc; + + return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef ERR +#undef CHECK_BETTER + +#endif + +int vp8_diamond_search_sad +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvsadcost[2], + int *mvcost[2] +) +{ + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + unsigned char *best_address; + + int tot_steps; + MV this_mv; + + int bestsad = INT_MAX; + int best_site = 0; + int last_site = 0; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + int thissad; + + // Work out the start point for the search + in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col); + best_address = in_what; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) 
+ { + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // search_param determines the length of the initial step and hence the number of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + best_mv->row = ref_row; + best_mv->col = ref_col; + + *num00 = 0; + + for (step = 0; step < tot_steps ; step++) + { + for (j = 0 ; j < x->searches_per_step ; j++) + { + // Trap illegal vectors + this_row_offset = best_mv->row + ss[i].mv.row; + this_col_offset = best_mv->col + ss[i].mv.col; + + if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) + + { + check_here = ss[i].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_mv.row = this_row_offset << 3; + this_mv.col = this_col_offset << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + + i++; + } + + if (best_site != last_site) + { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } + else if (best_address == in_what) + (*num00)++; + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad == INT_MAX) + return INT_MAX; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); +} + +int vp8_diamond_search_sadx4 +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvsadcost[2], + int *mvcost[2] +) +{ + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + unsigned char *best_address; + + int tot_steps; + MV this_mv; + + int bestsad = INT_MAX; + int best_site = 0; + int last_site = 0; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + int thissad; + + // Work out the start point for the search + in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col); + best_address = in_what; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // search_param determines the length of the initial step and hence the number of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
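    // Note: x->ss[] was built by vp8_init_dsmotion_compensation() /
    // vp8_init3smotion_compensation() as one null (0,0) site followed by
    // searches_per_step sites per ring, at radii MAX_FIRST_STEP,
    // MAX_FIRST_STEP/2, ... down to 1. Advancing ss by
    // search_param * searches_per_step and starting i at 1 below therefore
    // skips the search_param largest rings; tot_steps is the number of
    // rings that remain.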
+ ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + best_mv->row = ref_row; + best_mv->col = ref_col; + + *num00 = 0; + + for (step = 0; step < tot_steps ; step++) + { + int check_row_min, check_col_min, check_row_max, check_col_max; + + check_row_min = x->mv_row_min - best_mv->row; + check_row_max = x->mv_row_max - best_mv->row; + check_col_min = x->mv_col_min - best_mv->col; + check_col_max = x->mv_col_max - best_mv->col; + + for (j = 0 ; j < x->searches_per_step ; j += 4) + { + char *block_offset[4]; + unsigned int valid_block[4]; + int all_in = 1, t; + + for (t = 0; t < 4; t++) + { + valid_block [t] = (ss[t+i].mv.col > check_col_min); + valid_block [t] &= (ss[t+i].mv.col < check_col_max); + valid_block [t] &= (ss[t+i].mv.row > check_row_min); + valid_block [t] &= (ss[t+i].mv.row < check_row_max); + + all_in &= valid_block[t]; + block_offset[t] = ss[i+t].offset + best_address; + } + + if (all_in) + { + int sad_array[4]; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array); + + for (t = 0; t < 4; t++, i++) + { + thissad = sad_array[t]; + + if (thissad < bestsad) + { + this_mv.row = (best_mv->row + ss[i].mv.row) << 3; + this_mv.col = (best_mv->col + ss[i].mv.col) << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + } + else + { + int t; + + for (t = 0; t < 4; i++, t++) + { + // Trap illegal vectors + if (valid_block[t]) + + { + check_here = block_offset[t]; + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_row_offset = best_mv->row + ss[i].mv.row; + this_col_offset = best_mv->col + ss[i].mv.col; + + this_mv.row = this_row_offset << 3; + this_mv.col = this_col_offset << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + } + } + } + + if (best_site != last_site) + { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } + else if (best_address == in_what) + (*num00)++; + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad == INT_MAX) + return INT_MAX; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); +} + + +#if !(CONFIG_REALTIME_ONLY) +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2]) +{ + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + int mv_stride = d->pre_stride; + unsigned char *bestaddress; + MV *best_mv = &d->bmi.mv.as_mv; + MV this_mv; + int bestsad = INT_MAX; + int r, c; + + unsigned char *check_here; + int thissad; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + // Work out the mid point for the search + in_what = *(d->base_pre) + d->pre; + bestaddress = in_what + (ref_row * d->pre_stride) + ref_col; + + best_mv->row = ref_row; + best_mv->col = ref_col; + + // We 
need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Baseline value at the centre + + //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14)); + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border + if (col_min < x->mv_col_min) + col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) + col_max = x->mv_col_max; + + if (row_min < x->mv_row_min) + row_min = x->mv_row_min; + + if (row_max > x->mv_row_max) + row_max = x->mv_row_max; + + for (r = row_min; r < row_max ; r++) + { + this_mv.row = r << 3; + check_here = r * mv_stride + in_what + col_min; + + for (c = col_min; c < col_max; c++) + { + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + this_mv.col = c << 3; + //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14)); + //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)]; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + + check_here++; + } + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad < INT_MAX) + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + else + return INT_MAX; +} + +int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2]) +{ + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + int mv_stride = d->pre_stride; + unsigned char *bestaddress; + MV *best_mv = &d->bmi.mv.as_mv; + MV this_mv; + int bestsad = INT_MAX; + int r, c; + + unsigned char *check_here; + int thissad; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + int sad_array[3]; + + // Work out the mid point for the search + in_what = *(d->base_pre) + d->pre; + bestaddress = in_what + (ref_row * d->pre_stride) + ref_col; + + best_mv->row = ref_row; + best_mv->col = ref_col; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Baseline value at the centre + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border + if (col_min < x->mv_col_min) + col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) + col_max = x->mv_col_max; + + if (row_min < x->mv_row_min) + row_min = 
x->mv_row_min; + + if (row_max > x->mv_row_max) + row_max = x->mv_row_max; + + for (r = row_min; r < row_max ; r++) + { + this_mv.row = r << 3; + check_here = r * mv_stride + in_what + col_min; + c = col_min; + + while ((c + 3) < col_max) + { + int i; + + fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array); + + for (i = 0; i < 3; i++) + { + thissad = sad_array[i]; + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here++; + c++; + } + } + + while (c < col_max) + { + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here ++; + c ++; + } + + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad < INT_MAX) + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + else + return INT_MAX; +} +#endif + +#ifdef ENTROPY_STATS +void print_mode_context(void) +{ + FILE *f = fopen("modecont.c", "w"); + int i, j; + + fprintf(f, "#include \"entropy.h\"\n"); + fprintf(f, "const int vp8_mode_contexts[6][4] =\n"); + fprintf(f, "{\n"); + + for (j = 0; j < 6; j++) + { + fprintf(f, " { // %d \n", j); + fprintf(f, " "); + + for (i = 0; i < 4; i++) + { + int overal_prob; + int this_prob; + int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1]; + + // Overall probs + count = mv_mode_cts[i][0] + mv_mode_cts[i][1]; + + if (count) + overal_prob = 256 * mv_mode_cts[i][0] / count; + else + overal_prob = 128; + + if (overal_prob == 0) + overal_prob = 1; + + // context probs + count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; + + if (count) + this_prob = 256 * mv_ref_ct[j][i][0] / count; + else + this_prob = 128; + + if (this_prob == 0) + this_prob = 1; + + fprintf(f, "%5d, ", this_prob); + //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob); + //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob); + } + + fprintf(f, " },\n"); + } + + fprintf(f, "};\n"); + fclose(f); +} + +/* MV ref count ENTROPY_STATS stats code */ +#ifdef ENTROPY_STATS +void init_mv_ref_counts() +{ + vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); + vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); +} + +void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) +{ + if (m == ZEROMV) + { + ++mv_ref_ct [ct[0]] [0] [0]; + ++mv_mode_cts[0][0]; + } + else + { + ++mv_ref_ct [ct[0]] [0] [1]; + ++mv_mode_cts[0][1]; + + if (m == NEARESTMV) + { + ++mv_ref_ct [ct[1]] [1] [0]; + ++mv_mode_cts[1][0]; + } + else + { + ++mv_ref_ct [ct[1]] [1] [1]; + ++mv_mode_cts[1][1]; + + if (m == NEARMV) + { + ++mv_ref_ct [ct[2]] [2] [0]; + ++mv_mode_cts[2][0]; + } + else + { + ++mv_ref_ct [ct[2]] [2] [1]; + ++mv_mode_cts[2][1]; + + if (m == NEWMV) + { + ++mv_ref_ct [ct[3]] [3] [0]; + ++mv_mode_cts[3][0]; + } + else + { + ++mv_ref_ct [ct[3]] [3] [1]; + ++mv_mode_cts[3][1]; + } + } + } + } +} + +#endif/* END MV ref count ENTROPY_STATS stats code */ + +#endif diff --git a/vp8/encoder/arm/neon/boolhuff_armv7.asm b/vp8/encoder/arm/neon/boolhuff_armv7.asm new file mode 100644 index 
000000000..9a5f36661 --- /dev/null +++ b/vp8/encoder/arm/neon/boolhuff_armv7.asm @@ -0,0 +1,292 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_start_encode| + EXPORT |vp8_encode_bool| + EXPORT |vp8_stop_encode| + EXPORT |vp8_encode_value| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 BOOL_CODER *br +; r1 unsigned char *source + +|vp8_start_encode| PROC + mov r12, #0 + mov r3, #255 + mvn r2, #23 + str r12, [r0, #vp8_writer_lowvalue] + str r3, [r0, #vp8_writer_range] + str r12, [r0, #vp8_writer_value] + str r2, [r0, #vp8_writer_count] + str r12, [r0, #vp8_writer_pos] + str r1, [r0, #vp8_writer_buffer] + bx lr + ENDP + +; r0 BOOL_CODER *br +; r1 int bit +; r2 int probability +|vp8_encode_bool| PROC + push {r4-r9, lr} + + mov r4, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + sub r7, r5, #1 ; range-1 + + cmp r1, #0 + mul r4, r4, r7 ; ((range-1) * probability) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * probability) >> 8) + + addne r2, r2, r4 ; if (bit) lowvalue += split + subne r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r9, pc} + ENDP + +; r0 BOOL_CODER *br +|vp8_stop_encode| PROC + push {r4-r10, lr} + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + mov r10, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r1, [r7, r4] + cmpge r1, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r1, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r1, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne stop_encode_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r10, pc} + + ENDP + +; r0 BOOL_CODER *br +; r1 int data +; r2 int bits +|vp8_encode_value| PROC + push {r4-r11, lr} + + mov r10, r2 + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r11, r1 + rsb r4, r10, #32 ; 32-n + + ; v is kept in r1 during the token pack loop + lsr r1, r11, r4 ; v >>= 32 - n + +encode_value_loop + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r1, r1, #1 ; bit = v >> n + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + addcs r2, r2, r4 ; if (bit) lowvalue += split + subcs r4, r5, r4 ; if (bit) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_ev ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_ev + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_ev +token_zero_while_loop_ev + mov r9, #0 + strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_ev + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_ev + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r9, [r7, r4] ; w->buffer[x] + add r9, r9, #1 + strb r9, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_ev + rsb r4, r6, #24 ; 24-offset + ldr r9, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r9, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_ev + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r10, r10, #1 + bne encode_value_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + pop {r4-r11, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm new file mode 100644 index 000000000..d5dec440d --- /dev/null +++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm @@ -0,0 +1,126 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_fast_fdct4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +;NOTE: +;The input *src_diff. src_diff is calculated as: +;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function) +;In which *src_ptr and *pred_ptr both are unsigned char. +;Therefore, *src_diff should be in the range of [-255, 255]. +;CAUTION: +;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255]. +;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes +;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c. + +|vp8_fast_fdct4x4_neon| PROC + vld1.16 {d2}, [r0], r2 ;load input + ldr r12, _ffdct_coeff_ + vld1.16 {d3}, [r0], r2 + vld1.16 {d4}, [r0], r2 + vld1.16 {d0}, [r12] + vld1.16 {d5}, [r0], r2 + + ;First for-loop + ;transpose d2, d3, d4, d5. 
Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;ip[0]+ip[3] + vadd.s16 d7, d3, d4 ;ip[1]+ip[2] + vsub.s16 d8, d3, d4 ;ip[1]-ip[2] + vsub.s16 d9, d2, d5 ;ip[0]-ip[3] + vshl.i16 q3, q3, #1 ; a1, b1 + vshl.i16 q4, q4, #1 ; c1, d1 + + vadd.s16 d10, d6, d7 ;temp1 = a1 + b1 + vsub.s16 d11, d6, d7 ;temp2 = a1 - b1 + + vqdmulh.s16 q6, q5, d0[1] + vqdmulh.s16 q8, q4, d0[0] + vqdmulh.s16 q7, q4, d0[2] + + vshr.s16 q6, q6, #1 + vshr.s16 q8, q8, #1 + vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16 + vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d10, d12 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d11, d13 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d14, d17 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d15, d16 ;op[3] = temp1 - temp2 + + ;Second for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12] + vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8] + vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12] + + vadd.s16 d10, d6, d7 ;temp1 = a1 + b1 + vsub.s16 d11, d6, d7 ;temp2 = a1 - b1 + + + vqdmulh.s16 q6, q5, d0[1] + vqdmulh.s16 q8, q4, d0[0] + vqdmulh.s16 q7, q4, d0[2] + + vshr.s16 q6, q6, #1 + vshr.s16 q8, q8, #1 + vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16 + vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d10, d12 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d11, d13 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d14, d17 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d15, d16 ;d2 = temp1 - temp2 + + vclt.s16 q3, q1, #0 + vclt.s16 q4, q2, #0 + + vsub.s16 q1, q1, q3 + vsub.s16 q2, q2, q4 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + + vst1.16 {q1, q2}, [r1] + + bx lr + + ENDP + +;----------------- + AREA fastfdct_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_ffdct_coeff_ + DCD ffdct_coeff +ffdct_coeff +; 60547 = 0xEC83 +; 46341 = 0xB505 +; 25080 = 0x61F8 + DCD 0xB505EC83, 0x000061F8 + + END diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm new file mode 100644 index 000000000..de1c25469 --- /dev/null +++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm @@ -0,0 +1,179 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_fast_fdct8x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +;NOTE: +;The input *src_diff. src_diff is calculated as: +;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function) +;In which *src_ptr and *pred_ptr both are unsigned char. +;Therefore, *src_diff should be in the range of [-255, 255]. 
+;CAUTION: +;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255]. +;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes +;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c. + +|vp8_fast_fdct8x4_neon| PROC + vld1.16 {q1}, [r0], r2 ;load input + ldr r12, _ffdct8_coeff_ + vld1.16 {q2}, [r0], r2 + vld1.16 {q3}, [r0], r2 + vld1.16 {d0}, [r12] + vld1.16 {q4}, [r0], r2 + + ;First for-loop + ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3] + ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3] + vtrn.32 d2, d6 + vtrn.32 d3, d7 + vtrn.32 d4, d8 + vtrn.32 d5, d9 + vtrn.16 d2, d4 + vtrn.16 d3, d5 + vtrn.16 d6, d8 + vtrn.16 d7, d9 + + vadd.s16 d10, d2, d8 ;ip[0]+ip[3] + vadd.s16 d11, d4, d6 ;ip[1]+ip[2] + vsub.s16 d12, d4, d6 ;ip[1]-ip[2] + vsub.s16 d13, d2, d8 ;ip[0]-ip[3] + vadd.s16 d22, d3, d9 + vadd.s16 d23, d5, d7 + vsub.s16 d24, d5, d7 + vsub.s16 d25, d3, d9 + + vshl.i16 q5, q5, #1 ; a1, b1 + vshl.i16 q6, q6, #1 ; c1, d1 + vshl.i16 q1, q11, #1 + vshl.i16 q2, q12, #1 + + vadd.s16 d14, d10, d11 ;temp1 = a1 + b1 + vsub.s16 d15, d10, d11 ;temp2 = a1 - b1 + vadd.s16 d24, d2, d3 + vsub.s16 d25, d2, d3 + + vqdmulh.s16 q8, q7, d0[1] + vqdmulh.s16 q13, q12, d0[1] + vqdmulh.s16 q10, q6, d0[0] + vqdmulh.s16 q15, q2, d0[0] + vqdmulh.s16 q9, q6, d0[2] + vqdmulh.s16 q14, q2, d0[2] + + vshr.s16 q8, q8, #1 + vshr.s16 q13, q13, #1 + vshr.s16 q10, q10, #1 + vshr.s16 q15, q15, #1 + vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16 + vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16 + vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1 + vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2 + vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2 + + ;Second for-loop + ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12] + ;transpose d3, d5, d7, d9. 
Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12] + vtrn.32 d2, d6 + vtrn.32 d3, d7 + vtrn.32 d4, d8 + vtrn.32 d5, d9 + vtrn.16 d2, d4 + vtrn.16 d3, d5 + vtrn.16 d6, d8 + vtrn.16 d7, d9 + + vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12] + vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8] + vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8] + vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12] + vadd.s16 d2, d3, d9 + vadd.s16 d4, d5, d7 + vsub.s16 d24, d5, d7 + vsub.s16 d25, d3, d9 + + vadd.s16 d14, d10, d11 ;temp1 = a1 + b1 + vsub.s16 d15, d10, d11 ;temp2 = a1 - b1 + vadd.s16 d22, d2, d4 + vsub.s16 d23, d2, d4 + + vqdmulh.s16 q8, q7, d0[1] + vqdmulh.s16 q13, q11, d0[1] + vqdmulh.s16 q10, q6, d0[0] + vqdmulh.s16 q15, q12, d0[0] + vqdmulh.s16 q9, q6, d0[2] + vqdmulh.s16 q14, q12, d0[2] + + vshr.s16 q8, q8, #1 + vshr.s16 q13, q13, #1 + vshr.s16 q10, q10, #1 + vshr.s16 q15, q15, #1 + vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16 + vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16 + vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1 + vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1 + + vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1 + vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2 + vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection + vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2 + vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2 + + vclt.s16 q5, q1, #0 + vclt.s16 q6, q2, #0 + vclt.s16 q7, q3, #0 + vclt.s16 q8, q4, #0 + + vsub.s16 q1, q1, q5 + vsub.s16 q2, q2, q6 + vsub.s16 q3, q3, q7 + vsub.s16 q4, q4, q8 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vst1.16 {q1, q2}, [r1]! + vst1.16 {q3, q4}, [r1] + + bx lr + + ENDP + +;----------------- + AREA fastfdct8x4_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_ffdct8_coeff_ + DCD ffdct8_coeff +ffdct8_coeff +; 60547 = 0xEC83 +; 46341 = 0xB505 +; 25080 = 0x61F8 + DCD 0xB505EC83, 0x000061F8 + + END diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm new file mode 100644 index 000000000..11070377b --- /dev/null +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_fast_quantize_b_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 short *coeff_ptr +; r1 short *zbin_ptr +; r2 short *qcoeff_ptr +; r3 short *dqcoeff_ptr +; stack short *dequant_ptr +; stack short *scan_mask +; stack short *round_ptr +; stack short *quant_ptr + +; return int * eob +|vp8_fast_quantize_b_neon_func| PROC + vld1.16 {q0, q1}, [r0] ;load z + vld1.16 {q10, q11}, [r1] ;load zbin + + vabs.s16 q4, q0 ;calculate x = abs(z) + vabs.s16 q5, q1 + + vcge.s16 q10, q4, q10 ;x>=zbin + vcge.s16 q11, q5, q11 + + ;if x<zbin (q10 & q11 are all 0), go to zero_output + vorr.s16 q6, q10, q11 + vorr.s16 d12, d12, d13 + vmov r0, r1, d12 + orr r0, r0, r1 + cmp r0, #0 + beq zero_output + + ldr r0, [sp, #8] ;load round_ptr + ldr r12, [sp, #12] ;load quant_ptr + + ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative + vshr.s16 q2, q0, #15 ; sz + vshr.s16 q3, q1, #15 + + vld1.s16 {q6, q7}, [r0] ;load round_ptr [0-15] + vld1.s16 {q8, q9}, [r12] ;load quant_ptr [0-15] + + vadd.s16 q4, q6 ;x + Round + vadd.s16 q5, q7 + + ldr r0, [sp, #4] ;load rvsplus1_scan_order ptr + + vqdmulh.s16 q4, q8 ;y = ((Round + abs(z)) * Quant) >> 16 + vqdmulh.s16 q5, q9 + + vld1.16 {q0, q1}, [r0] ;load rvsplus1_scan_order + vceq.s16 q8, q8 ;set q8 to all 1 + + vshr.s16 q4, #1 ;right shift 1 after vqdmulh + vshr.s16 q5, #1 + + ;modify data to have its original sign + veor.s16 q4, q2 ; y^sz + veor.s16 q5, q3 + + ldr r12, [sp] ;load dequant_ptr + + vsub.s16 q4, q2 ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement) + vsub.s16 q5, q3 + + vand.s16 q4, q10 ;mask off x1 elements + vand.s16 q5, q11 + + vld1.s16 {q6, q7}, [r12] ;load dequant_ptr[i] + + vtst.16 q14, q4, q8 ;now find eob + vtst.16 q15, q5, q8 ;non-zero element is set to all 1 in q4, q5 + + vst1.s16 {q4, q5}, [r2] ;store: qcoeff = x1 + + vand q0, q0, q14 ;get all valid number from rvsplus1_scan_order array + vand q1, q1, q15 + + vmax.u16 q0, q0, q1 ;find maximum value in q0, q1 + vmax.u16 d0, d0, d1 + vmovl.u16 q0, d0 + + vmul.s16 q6, q4 ;x * Dequant + vmul.s16 q7, q5 + + vmax.u32 d0, d0, d1 + vpmax.u32 d0, d0, d0 + + vst1.s16 {q6, q7}, [r3] ;store dqcoeff = x * Dequant + + vmov.32 r0, d0[0] + bx lr + +zero_output + vst1.s16 {q10, q11}, [r2] ; qcoeff = 0 + vst1.s16 {q10, q11}, [r3] ; dqcoeff = 0 + mov r0, #0 + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/sad16_neon.asm b/vp8/encoder/arm/neon/sad16_neon.asm new file mode 100644 index 000000000..6169f10da --- /dev/null +++ b/vp8/encoder/arm/neon/sad16_neon.asm @@ -0,0 +1,206 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_sad16x16_neon| + EXPORT |vp8_sad16x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_sad16x16_neon| PROC +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + +;; + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0] + vld1.8 {q7}, [r2] + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================== +;unsigned int vp8_sad16x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +|vp8_sad16x8_neon| PROC + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabdl.u8 q12, d0, d8 + vabdl.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vld1.8 {q0}, [r0], r1 + vld1.8 {q4}, [r2], r3 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vld1.8 {q1}, [r0], r1 + vld1.8 {q5}, [r2], r3 + + vabal.u8 q12, d0, d8 + vabal.u8 q13, d1, d9 + + vld1.8 {q2}, [r0], r1 + vld1.8 {q6}, [r2], r3 + + vabal.u8 q12, d2, d10 + vabal.u8 q13, d3, d11 + + vld1.8 {q3}, [r0], r1 + vld1.8 {q7}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q13, d5, d13 + + vabal.u8 q12, d6, d14 + vabal.u8 q13, d7, d15 + + vadd.u16 q0, q12, q13 + + vpaddl.u16 q1, q0 + vpaddl.u32 q0, q1 + + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/sad8_neon.asm b/vp8/encoder/arm/neon/sad8_neon.asm new file mode 100644 index 000000000..28604ddeb --- /dev/null +++ b/vp8/encoder/arm/neon/sad8_neon.asm @@ -0,0 +1,208 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sad8x8_neon| + EXPORT |vp8_sad8x16_neon| + EXPORT |vp8_sad4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; unsigned int vp8_sad8x8_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x8_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +;unsigned int vp8_sad8x16_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad8x16_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vabal.u8 q12, d6, d14 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabal.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 q1, q12 + vpaddl.u32 q0, q1 + vadd.u32 d0, d0, d1 + + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;=========================== +;unsigned int vp8_sad4x4_c( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) + +|vp8_sad4x4_neon| PROC + vld1.8 {d0}, [r0], r1 + vld1.8 {d8}, [r2], r3 + + vld1.8 {d2}, [r0], r1 + vld1.8 {d10}, [r2], r3 + + vabdl.u8 q12, d0, d8 + + vld1.8 {d4}, [r0], r1 + vld1.8 {d12}, [r2], r3 + + vabal.u8 q12, d2, d10 + + vld1.8 {d6}, [r0], r1 + vld1.8 {d14}, [r2], r3 + + vabal.u8 q12, d4, d12 + vabal.u8 q12, d6, d14 + + vpaddl.u16 d1, d24 + vpaddl.u32 d0, d1 + vmov.32 r0, d0[0] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm new file mode 100644 index 000000000..26bc0d06c --- /dev/null +++ b/vp8/encoder/arm/neon/shortfdct_neon.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_short_fdct4x4_neon| + EXPORT |vp8_short_fdct8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 short *input +; r1 short *output +; r2 int pitch +; Input has a pitch, output is contiguous +|vp8_short_fdct4x4_neon| PROC + ldr r12, _dct_matrix_ + vld1.16 d0, [r0], r2 + vld1.16 d1, [r0], r2 + vld1.16 d2, [r0], r2 + vld1.16 d3, [r0] + vld1.16 {q2, q3}, [r12] + +;first stage + vmull.s16 q11, d4, d0[0] ;i=0 + vmull.s16 q12, d4, d1[0] ;i=1 + vmull.s16 q13, d4, d2[0] ;i=2 + vmull.s16 q14, d4, d3[0] ;i=3 + + vmlal.s16 q11, d5, d0[1] + vmlal.s16 q12, d5, d1[1] + vmlal.s16 q13, d5, d2[1] + vmlal.s16 q14, d5, d3[1] + + vmlal.s16 q11, d6, d0[2] + vmlal.s16 q12, d6, d1[2] + vmlal.s16 q13, d6, d2[2] + vmlal.s16 q14, d6, d3[2] + + vmlal.s16 q11, d7, d0[3] ;sumtemp for i=0 + vmlal.s16 q12, d7, d1[3] ;sumtemp for i=1 + vmlal.s16 q13, d7, d2[3] ;sumtemp for i=2 + vmlal.s16 q14, d7, d3[3] ;sumtemp for i=3 + + ; rounding + vrshrn.i32 d22, q11, #14 + vrshrn.i32 d24, q12, #14 + vrshrn.i32 d26, q13, #14 + vrshrn.i32 d28, q14, #14 + +;second stage + vmull.s16 q4, d22, d4[0] ;i=0 + vmull.s16 q5, d22, d4[1] ;i=1 + vmull.s16 q6, d22, d4[2] ;i=2 + vmull.s16 q7, d22, d4[3] ;i=3 + + vmlal.s16 q4, d24, d5[0] + vmlal.s16 q5, d24, d5[1] + vmlal.s16 q6, d24, d5[2] + vmlal.s16 q7, d24, d5[3] + + vmlal.s16 q4, d26, d6[0] + vmlal.s16 q5, d26, d6[1] + vmlal.s16 q6, d26, d6[2] + vmlal.s16 q7, d26, d6[3] + + vmlal.s16 q4, d28, d7[0] ;sumtemp for i=0 + vmlal.s16 q5, d28, d7[1] ;sumtemp for i=1 + vmlal.s16 q6, d28, d7[2] ;sumtemp for i=2 + vmlal.s16 q7, d28, d7[3] ;sumtemp for i=3 + + vrshr.s32 q0, q4, #16 + vrshr.s32 q1, q5, #16 + vrshr.s32 q2, q6, #16 + vrshr.s32 q3, q7, #16 + + vmovn.i32 d0, q0 + vmovn.i32 d1, q1 + vmovn.i32 d2, q2 + vmovn.i32 d3, q3 + + vst1.16 {q0, q1}, [r1] + + bx lr + + ENDP + +; r0 short *input +; r1 short *output +; r2 int pitch +|vp8_short_fdct8x4_neon| PROC + ; Store link register and input before calling + ; first 4x4 fdct. Do not need to worry about + ; output or pitch because those pointers are not + ; touched in the 4x4 fdct function + stmdb sp!, {r0, lr} + + bl vp8_short_fdct4x4_neon + + ldmia sp!, {r0, lr} + + ; Move to the next block of data. + add r0, r0, #8 + add r1, r1, #32 + + ; Second time through do not store off the + ; link register, just return from the 4x4 fdtc + b vp8_short_fdct4x4_neon + + ; Should never get to this. + bx lr + + ENDP + +;----------------- + AREA dct4x4_dat, DATA, READONLY +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... 
+_dct_matrix_ + DCD dct_matrix +dct_matrix +; DCW 23170, 30274, 23170, 12540 +; DCW 23170, 12540, -23170,-30274 +; DCW 23170, -12540, -23170, 30274 +; DCW 23170, -30274, 23170,-12540 +; 23170 = 0x5a82 +; -23170 = 0xa57e +; 30274 = 0x7642 +; -30274 = 0x89be +; 12540 = 0x30fc +; -12540 = 0xcf04 + DCD 0x76425a82, 0x30fc5a82 + DCD 0x30fc5a82, 0x89bea57e + DCD 0xcf045a82, 0x7642a57e + DCD 0x89be5a82, 0xcf045a82 + + END diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm new file mode 100644 index 000000000..8781ca0cc --- /dev/null +++ b/vp8/encoder/arm/neon/subtract_neon.asm @@ -0,0 +1,171 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_subtract_b_neon_func| + EXPORT |vp8_subtract_mby_neon| + EXPORT |vp8_subtract_mbuv_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;========================================= +;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch); +|vp8_subtract_b_neon_func| PROC + ldr r12, [sp] ;load pitch + + vld1.8 {d0}, [r1], r3 ;load src + vld1.8 {d1}, [r2], r12 ;load pred + vld1.8 {d2}, [r1], r3 + vld1.8 {d3}, [r2], r12 + vld1.8 {d4}, [r1], r3 + vld1.8 {d5}, [r2], r12 + vld1.8 {d6}, [r1], r3 + vld1.8 {d7}, [r2], r12 + + vsubl.u8 q10, d0, d1 + vsubl.u8 q11, d2, d3 + vsubl.u8 q12, d4, d5 + vsubl.u8 q13, d6, d7 + + mov r12, r12, lsl #1 + + vst1.16 {d20}, [r0], r12 ;store diff + vst1.16 {d22}, [r0], r12 + vst1.16 {d24}, [r0], r12 + vst1.16 {d26}, [r0], r12 + + bx lr + ENDP + +;========================================== +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride) +|vp8_subtract_mby_neon| PROC + mov r12, #4 + +subtract_mby_loop + vld1.8 {q0}, [r1], r3 ;load src + vld1.8 {q1}, [r2]! ;load pred + vld1.8 {q2}, [r1], r3 + vld1.8 {q3}, [r2]! + vld1.8 {q4}, [r1], r3 + vld1.8 {q5}, [r2]! + vld1.8 {q6}, [r1], r3 + vld1.8 {q7}, [r2]! + + vsubl.u8 q8, d0, d2 + vsubl.u8 q9, d1, d3 + vsubl.u8 q10, d4, d6 + vsubl.u8 q11, d5, d7 + vsubl.u8 q12, d8, d10 + vsubl.u8 q13, d9, d11 + vsubl.u8 q14, d12, d14 + vsubl.u8 q15, d13, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + subs r12, r12, #1 + bne subtract_mby_loop + + bx lr + ENDP + +;================================= +;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +|vp8_subtract_mbuv_neon| PROC + ldr r12, [sp] + +;u + add r0, r0, #512 ; short *udiff = diff + 256; + add r3, r3, #256 ; unsigned char *upred = pred + 256; + + vld1.8 {d0}, [r1], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r1], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r1], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r1], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r1], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r1], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r1], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r1], r12 + vld1.8 {d15}, [r3]! 
+ + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + +;v + vld1.8 {d0}, [r2], r12 ;load src + vld1.8 {d1}, [r3]! ;load pred + vld1.8 {d2}, [r2], r12 + vld1.8 {d3}, [r3]! + vld1.8 {d4}, [r2], r12 + vld1.8 {d5}, [r3]! + vld1.8 {d6}, [r2], r12 + vld1.8 {d7}, [r3]! + vld1.8 {d8}, [r2], r12 + vld1.8 {d9}, [r3]! + vld1.8 {d10}, [r2], r12 + vld1.8 {d11}, [r3]! + vld1.8 {d12}, [r2], r12 + vld1.8 {d13}, [r3]! + vld1.8 {d14}, [r2], r12 + vld1.8 {d15}, [r3]! + + vsubl.u8 q8, d0, d1 + vsubl.u8 q9, d2, d3 + vsubl.u8 q10, d4, d5 + vsubl.u8 q11, d6, d7 + vsubl.u8 q12, d8, d9 + vsubl.u8 q13, d10, d11 + vsubl.u8 q14, d12, d13 + vsubl.u8 q15, d14, d15 + + vst1.16 {q8}, [r0]! ;store diff + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + + bx lr + ENDP + + END diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/encoder/arm/neon/variance_neon.asm new file mode 100644 index 000000000..64b83ca43 --- /dev/null +++ b/vp8/encoder/arm/neon/variance_neon.asm @@ -0,0 +1,275 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_variance16x16_neon| + EXPORT |vp8_variance16x8_neon| + EXPORT |vp8_variance8x16_neon| + EXPORT |vp8_variance8x8_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance16x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + ;VPADAL adds adjacent pairs of elements of a vector, and accumulates + ;the results into the elements of the destination vector. The explanation + ;in ARM guide is wrong. + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + ;vmov.32 r0, d0[0] ;this instruction costs a lot + ;vmov.32 r1, d1[0] + ;mul r0, r0, r0 + ;str r1, [r12] + ;sub r0, r1, r0, asr #8 + + ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should + ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right. 
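    ; A hedged sketch of the equivalent scalar tail for this 16x16 block
    ; (illustrative names, not the C source):
    ;     *sse = sse_acc;
    ;     return sse_acc - (unsigned int)((sum_acc * sum_acc) >> 8);
    ; vmull.s32 below forms sum*sum as a 64-bit product, vshr.s32 keeps
    ; the shift arithmetic (sign-extending) as required above, and
    ; vsub.s32 subtracts it from the accumulated sse. The divisor is the
    ; pixel count, so the 16x8/8x16 and 8x8 variants later in this file
    ; shift by #7 and #6 instead of #8.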
+ vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================ +;unsigned int vp8_variance16x8_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) +|vp8_variance16x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #4 + +variance16x8_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne variance16x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================= +;unsigned int vp8_variance8x16_c( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *sse) + +|vp8_variance8x16_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #8 + +variance8x16_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d2, d6 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + + bne variance8x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #7 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + +;================================== +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance8x8_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +variance8x8_neon_loop + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 ;calculate diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;calculate sum + vmlal.s16 q9, d22, d22 ;calculate sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne 
variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + ldr r12, [sp] ;load *sse from stack + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r12] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm new file mode 100644 index 000000000..f26b4d7ae --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_memcpy_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;========================================= +;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); +|vp8_memcpy_neon| PROC + ;pld [r1] ;preload pred data + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + mov r12, r2, lsr #8 ;copy 256 bytes data at one time + +memcpy_neon_loop + vld1.8 {q0, q1}, [r1]! ;load src data + subs r12, r12, #1 + vld1.8 {q2, q3}, [r1]! + vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr + vld1.8 {q4, q5}, [r1]! + vst1.8 {q2, q3}, [r0]! + vld1.8 {q6, q7}, [r1]! + vst1.8 {q4, q5}, [r0]! + vld1.8 {q8, q9}, [r1]! + vst1.8 {q6, q7}, [r0]! + vld1.8 {q10, q11}, [r1]! + vst1.8 {q8, q9}, [r0]! + vld1.8 {q12, q13}, [r1]! + vst1.8 {q10, q11}, [r0]! + vld1.8 {q14, q15}, [r1]! + vst1.8 {q12, q13}, [r0]! + vst1.8 {q14, q15}, [r0]! + + ;pld [r1] ;preload pred data -- need to adjust for real device + ;pld [r1, #128] + ;pld [r1, #256] + ;pld [r1, #384] + + bne memcpy_neon_loop + + ands r3, r2, #0xff ;extra copy + beq done_copy_neon_loop + +extra_copy_neon_loop + vld1.8 {q0}, [r1]! ;load src data + subs r3, r3, #16 + vst1.8 {q0}, [r0]! + bne extra_copy_neon_loop + +done_copy_neon_loop + bx lr + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm new file mode 100644 index 000000000..f53596727 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm @@ -0,0 +1,172 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_mse16x16_neon| + EXPORT |vp8_get16x16pred_error_neon| + EXPORT |vp8_get4x4sse_cs_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;============================ +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +;note: in this function, sum is never used. So, we can remove this part of calculation +;from vp8_variance(). 
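A minimal scalar sketch of what the routine below computes may help orient the reader; the function and local names here are illustrative and are not the library's C API. As the note above says, no signed sum is accumulated, only the sum of squared differences, which is both stored through *sse and returned.

unsigned int mse16x16_sketch(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            /* squared difference of co-located pixels */
            int diff = src_ptr[c] - ref_ptr[c];
            total += (unsigned int)(diff * diff);
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}

The NEON routine below processes two rows of 16 pixels per iteration (eight iterations in total), widening the differences to 16 bits with vsubl.u8 and accumulating the squares across four q registers before the final horizontal add.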
+ +|vp8_mse16x16_neon| PROC + vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse + vmov.i8 q8, #0 + vmov.i8 q9, #0 + vmov.i8 q10, #0 + + mov r12, #8 + +mse16x16_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmlal.s16 q7, d22, d22 + vmlal.s16 q8, d23, d23 + + subs r12, r12, #1 + + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vmlal.s16 q7, d26, d26 + vmlal.s16 q8, d27, d27 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne mse16x16_neon_loop + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + + ldr r12, [sp] ;load *sse from stack + + vadd.u32 q10, q7, q9 + vpaddl.u32 q1, q10 + vadd.u64 d0, d2, d3 + + vst1.32 {d0[0]}, [r12] + vmov.32 r0, d0[0] + + bx lr + + ENDP + +;============================ +; r0 unsigned char *src_ptr +; r1 int src_stride +; r2 unsigned char *ref_ptr +; r3 int ref_stride +|vp8_get16x16pred_error_neon| PROC + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - pred_error + vmov.i8 q10, #0 + + mov r12, #8 + +get16x16pred_error_neon_loop + vld1.8 {q0}, [r0], r1 ;Load up source and reference + vld1.8 {q2}, [r2], r3 + vld1.8 {q1}, [r0], r1 + vld1.8 {q3}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 + vmlal.s16 q9, d22, d22 + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne get16x16pred_error_neon_loop + + vadd.u32 q10, q9, q10 + vpaddl.s32 q0, q8 + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] + bx lr + + ENDP + +;============================= +; r0 unsigned char *src_ptr, +; r1 int source_stride, +; r2 unsigned char *ref_ptr, +; r3 int recon_stride +|vp8_get4x4sse_cs_neon| PROC + vld1.8 {d0}, [r0], r1 ;Load up source and reference + vld1.8 {d4}, [r2], r3 + vld1.8 {d1}, [r0], r1 + vld1.8 {d5}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d6}, [r2], r3 + vld1.8 {d3}, [r0], r1 + vld1.8 {d7}, [r2], r3 + + vsubl.u8 q11, d0, d4 + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vmull.s16 q7, d22, d22 + vmull.s16 q8, d24, d24 + vmull.s16 q9, d26, d26 + vmull.s16 q10, d28, d28 + + vadd.u32 q7, q7, q8 + vadd.u32 q9, q9, q10 + vadd.u32 q9, q7, q9 + + vpaddl.u32 q1, q9 + vadd.u64 d0, d2, d3 + + vmov.32 r0, d0[0] + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm new file mode 100644 index 000000000..9c52c52f6 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8cx_pack_tokens_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 vp8_writer *w +; r1 const TOKENEXTRA *p +; r2 int xcount +; r3 vp8_coef_encodings +; s0 vp8_extra_bits +; s1 vp8_coef_tree +|vp8cx_pack_tokens_armv7| PROC + push {r4-r11, lr} + + ; Add size of xcount * sizeof (TOKENEXTRA) to get stop + ; sizeof (TOKENEXTRA) is 20 + add r2, r2, r2, lsl #2 ; xcount + sub sp, sp, #12 + add r2, r1, r2, lsl #2 ; stop = p + xcount + str r2, [sp, #0] + str r3, [sp, #8] ; save vp8_coef_encodings + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + b check_p_lt_stop + +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #8] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #52] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. 
This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #52] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #48] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr 
r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc while_p_lt_stop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #12 + pop {r4-r11, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm new file mode 100644 index 000000000..92b098909 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm @@ -0,0 +1,335 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_mb_row_tokens_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 vp8_writer *w +; r2 vp8_coef_encodings +; r3 vp8_extra_bits +; s0 vp8_coef_tree + +|vp8cx_pack_mb_row_tokens_armv7| PROC + push {r4-r11, lr} + sub sp, sp, #24 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r2, [sp, #20] ; save vp8_coef_encodings + str r5, [sp, #12] ; save mb_rows + str r3, [sp, #8] ; save vp8_extra_bits + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + + mov r0, r1 ; keep same as other loops + + ldr r2, [r0, #vp8_writer_lowvalue] + ldr r5, [r0, #vp8_writer_range] + ldr r3, [r0, #vp8_writer_count] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actuall work gets done here! 
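    ; For orientation: the loop below is the same boolean-encoder bit
    ; step already annotated in vp8_encode_bool and vp8cx_pack_tokens
    ; above. A hedged C-style sketch of one iteration (names follow the
    ; comments, not the C source):
    ;     split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
    ;     if (bb) { lowvalue += split; range -= split; }
    ;     else    { range = split; }
    ;     shift  = clz(range) - 24;           /* renormalize */
    ;     count += shift; range <<= shift;    /* flush bytes once count >= 0 */
    ;     lowvalue <<= shift;
    ; where bb is the bit peeled off the reversed value v and
    ; i = vp8_coef_tree[i + bb] walks the token tree.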
+ +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #20] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #60] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #60] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #8] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. 
Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc 
while_p_lt_stop + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, #1 + add r7, r7, #TOKENLIST_SZ ; next element in the array + str r6, [sp, #12] + bne mb_row_loop + + str r2, [r0, #vp8_writer_lowvalue] + str r5, [r0, #vp8_writer_range] + str r3, [r0, #vp8_writer_count] + add sp, sp, #24 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist + + END diff --git a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm new file mode 100644 index 000000000..6d5f882ed --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm @@ -0,0 +1,471 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8cx_pack_tokens_into_partitions_armv7| + + INCLUDE vpx_vp8_enc_asm_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY + +; r0 VP8_COMP *cpi +; r1 unsigned char *cx_data +; r2 int num_part +; r3 *size +; s0 vp8_coef_encodings +; s1 vp8_extra_bits, +; s2 const vp8_tree_index *, + +|vp8cx_pack_tokens_into_partitions_armv7| PROC + push {r4-r11, lr} + sub sp, sp, #44 + + ; Compute address of cpi->common.mb_rows + ldr r4, _VP8_COMP_common_ + ldr r6, _VP8_COMMON_MBrows_ + add r4, r0, r4 + + ldr r5, [r4, r6] ; load up mb_rows + + str r5, [sp, #36] ; save mb_rows + str r1, [sp, #24] ; save cx_data + str r2, [sp, #20] ; save num_part + str r3, [sp, #8] ; save *size + + ; *size = 3*(num_part -1 ); + sub r2, r2, #1 ; num_part - 1 + add r2, r2, r2, lsl #1 ; 3*(num_part - 1) + str r2, [r3] + + add r2, r2, r1 ; cx_data + *size + str r2, [sp, #40] ; ptr + + ldr r4, _VP8_COMP_tplist_ + add r4, r0, r4 + ldr r7, [r4, #0] ; dereference cpi->tp_list + str r7, [sp, #32] ; store start of cpi->tp_list + + ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi + add r0, r0, r11 + + mov r11, #0 + str r11, [sp, #28] ; i + +numparts_loop + ldr r10, [sp, #40] ; ptr + ldr r5, [sp, #36] ; move mb_rows to the counting section + str r5, [sp, #12] + + ; Reset all of the VP8 Writer data for each partition that + ; is processed. + ; start_encode + mov r2, #0 ; vp8_writer_lowvalue + mov r5, #255 ; vp8_writer_range + mvn r3, #23 ; vp8_writer_count + + str r2, [r0, #vp8_writer_value] + str r2, [r0, #vp8_writer_pos] + str r10, [r0, #vp8_writer_buffer] + +mb_row_loop + + ldr r1, [r7, #tokenlist_start] + ldr r9, [r7, #tokenlist_stop] + str r9, [sp, #0] ; save stop for later comparison + str r7, [sp, #16] ; tokenlist address for next time + + b check_p_lt_stop + + ; actual work gets done here! + +while_p_lt_stop + ldr r6, [r1, #tokenextra_token] ; t + ldr r4, [sp, #80] ; vp8_coef_encodings + mov lr, #0 + add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t + ldr r9, [r1, #tokenextra_context_tree] ; pp + + ldr r7, [r1, #tokenextra_skip_eob_node] + + ldr r6, [r4, #vp8_token_value] ; v + ldr r8, [r4, #vp8_token_len] ; n + + ; vp8 specific skip_eob_node + cmp r7, #0 + movne lr, #2 ; i = 2 + subne r8, r8, #1 ; --n + + ; reverse the stream of bits to be packed. Normally + ; the most significant bit is peeled off and compared + ; in the form of (v >> --n) & 1. 
ARM architecture has + ; the ability to set a flag based on the value of the + ; bit shifted off the bottom of the register. To make + ; that happen the bitstream is reversed. + rbit r12, r6 + rsb r4, r8, #32 ; 32-n + ldr r10, [sp, #88] ; vp8_coef_tree + + ; v is kept in r12 during the token pack loop + lsr r12, r12, r4 ; v >>= 32 - n + +; loop start +token_loop + ldrb r4, [r9, lr, asr #1] ; pp [i>>1] + sub r7, r5, #1 ; range-1 + + ; Decisions are made based on the bit value shifted + ; off of v, so set a flag here based on this. + ; This value is refered to as "bb" + lsrs r12, r12, #1 ; bb = v >> n + mul r4, r4, r7 ; ((range-1) * pp[i>>1])) + + ; bb can only be 0 or 1. So only execute this statement + ; if bb == 1, otherwise it will act like i + 0 + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start +token_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + + ; r10 is used earlier in the loop, but r10 is used as + ; temp variable here. So after r10 is used, reload + ; vp8_coef_tree_dcd into r10 + ldr r10, [sp, #88] ; vp8_coef_tree + +token_count_lt_zero + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r8, r8, #1 ; --n + bne token_loop + + ldr r6, [r1, #tokenextra_token] ; t + ldr r7, [sp, #84] ; vp8_extra_bits + ; Add t * sizeof (vp8_extra_bit_struct) to get the desired + ; element. 
Here vp8_extra_bit_struct == 20 + add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t + add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t + + ldr r4, [r12, #vp8_extra_bit_struct_base_val] + cmp r4, #0 + beq skip_extra_bits + +; if( b->base_val) + ldr r8, [r12, #vp8_extra_bit_struct_len] ; L + ldr lr, [r1, #tokenextra_extra] ; e = p->Extra + cmp r8, #0 ; if( L) + beq no_extra_bits + + ldr r9, [r12, #vp8_extra_bit_struct_prob] + asr r7, lr, #1 ; v=e>>1 + + ldr r10, [r12, #vp8_extra_bit_struct_tree] + str r10, [sp, #4] ; b->tree + + rbit r12, r7 ; reverse v + rsb r4, r8, #32 + lsr r12, r12, r4 + + mov lr, #0 ; i = 0 + +extra_bits_loop + ldrb r4, [r9, lr, asr #1] ; pp[i>>1] + sub r7, r5, #1 ; range-1 + lsrs r12, r12, #1 ; v >> n + mul r4, r4, r7 ; (range-1) * pp[i>>1] + addcs lr, lr, #1 ; i + bb + + mov r7, #1 + ldrsb lr, [r10, lr] ; i = b->tree[i+bb] + add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) + + addcs r2, r2, r4 ; if (bb) lowvalue += split + subcs r4, r5, r4 ; if (bb) range = range-split + + clz r6, r4 + sub r6, r6, #24 + + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi extra_count_lt_zero ; if(count >= 0) + + sub r6, r6, r3 ; offset= shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl extra_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos - 1 + b extra_zero_while_start +extra_zero_while_loop + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +extra_zero_while_start + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq extra_zero_while_loop + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] + add r10, r10, #1 + strb r10, [r7, r4] +extra_high_bit_not_set + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) + ldr r10, [sp, #4] ; b->tree +extra_count_lt_zero + lsl r2, r2, r6 + + subs r8, r8, #1 ; --n + bne extra_bits_loop ; while (n) + +no_extra_bits + ldr lr, [r1, #4] ; e = p->Extra + add r4, r5, #1 ; range + 1 + tst lr, #1 + lsr r4, r4, #1 ; split = (range + 1) >> 1 + addne r2, r2, r4 ; lowvalue += split + subne r4, r5, r4 ; range = range-split + tst r2, #0x80000000 ; lowvalue & 0x80000000 + lsl r5, r4, #1 ; range <<= 1 + beq end_high_bit_not_set + + ldr r4, [r0, #vp8_writer_pos] + mov r7, #0 + sub r4, r4, #1 + b end_zero_while_start +end_zero_while_loop + strb r7, [r6, r4] + sub r4, r4, #1 ; x-- +end_zero_while_start + cmp r4, #0 + ldrge r6, [r0, #vp8_writer_buffer] + ldrb r12, [r6, r4] + cmpge r12, #0xff + beq end_zero_while_loop + + ldr r6, [r0, #vp8_writer_buffer] + ldrb r7, [r6, r4] + add r7, r7, #1 + strb r7, [r6, r4] +end_high_bit_not_set + adds r3, r3, #1 ; ++count + lsl r2, r2, #1 ; lowvalue <<= 1 + bne end_count_zero + + ldr r4, [r0, #vp8_writer_pos] + mvn r3, #7 + ldr r7, [r0, #vp8_writer_buffer] + lsr r6, r2, #24 ; lowvalue >> 24 + add r12, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r12, [r0, #0x10] + strb r6, [r7, r4] +end_count_zero +skip_extra_bits + add r1, r1, #TOKENEXTRA_SZ ; ++p +check_p_lt_stop + ldr r4, [sp, #0] ; stop + cmp r1, r4 ; while( p < stop) + bcc 
while_p_lt_stop + + ldr r10, [sp, #20] ; num_parts + mov r1, #TOKENLIST_SZ + mul r1, r10, r1 + + ldr r6, [sp, #12] ; mb_rows + ldr r7, [sp, #16] ; tokenlist address + subs r6, r6, r10 + add r7, r7, r1 ; next element in the array + str r6, [sp, #12] + bgt mb_row_loop + + mov r12, #32 + +stop_encode_loop + sub r7, r5, #1 ; range-1 + + mov r4, r7, lsl #7 ; ((range-1) * 128) + + mov r7, #1 + add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) + + ; Counting the leading zeros is used to normalize range. + clz r6, r4 + sub r6, r6, #24 ; shift + + ; Flag is set on the sum of count. This flag is used later + ; to determine if count >= 0 + adds r3, r3, r6 ; count += shift + lsl r5, r4, r6 ; range <<= shift + bmi token_count_lt_zero_se ; if(count >= 0) + + sub r6, r6, r3 ; offset = shift - count + sub r4, r6, #1 ; offset-1 + lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) + bpl token_high_bit_not_set_se + + ldr r4, [r0, #vp8_writer_pos] ; x + sub r4, r4, #1 ; x = w->pos-1 + b token_zero_while_start_se +token_zero_while_loop_se + mov r10, #0 + strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 + sub r4, r4, #1 ; x-- +token_zero_while_start_se + cmp r4, #0 + ldrge r7, [r0, #vp8_writer_buffer] + ldrb r11, [r7, r4] + cmpge r11, #0xff + beq token_zero_while_loop_se + + ldr r7, [r0, #vp8_writer_buffer] + ldrb r10, [r7, r4] ; w->buffer[x] + add r10, r10, #1 + strb r10, [r7, r4] ; w->buffer[x] + 1 +token_high_bit_not_set_se + rsb r4, r6, #24 ; 24-offset + ldr r10, [r0, #vp8_writer_buffer] + lsr r7, r2, r4 ; lowvalue >> (24-offset) + ldr r4, [r0, #vp8_writer_pos] ; w->pos + lsl r2, r2, r6 ; lowvalue <<= offset + mov r6, r3 ; shift = count + add r11, r4, #1 ; w->pos++ + bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff + str r11, [r0, #vp8_writer_pos] + sub r3, r3, #8 ; count -= 8 + strb r7, [r10, r4] ; w->buffer[w->pos++] + +token_count_lt_zero_se + lsl r2, r2, r6 ; lowvalue <<= shift + + subs r12, r12, #1 + bne stop_encode_loop + + ldr r10, [sp, #8] ; *size + ldr r11, [r10] + ldr r4, [r0, #vp8_writer_pos] ; w->pos + add r11, r11, r4 ; *size += w->pos + str r11, [r10] + + ldr r9, [sp, #20] ; num_parts + sub r9, r9, #1 + ldr r10, [sp, #28] ; i + cmp r10, r9 ; if(i<(num_part - 1)) + bge skip_write_partition + + ldr r12, [sp, #40] ; ptr + add r12, r12, r4 ; ptr += w->pos + str r12, [sp, #40] + + ldr r9, [sp, #24] ; cx_data + mov r8, r4, asr #8 + strb r4, [r9, #0] + strb r8, [r9, #1] + mov r4, r4, asr #16 + strb r4, [r9, #2] + + add r9, r9, #3 ; cx_data += 3 + str r9, [sp, #24] + +skip_write_partition + + ldr r11, [sp, #28] ; i + ldr r10, [sp, #20] ; num_parts + + add r11, r11, #1 ; i++ + str r11, [sp, #28] + + ldr r7, [sp, #32] ; cpi->tp_list[i] + mov r1, #TOKENLIST_SZ + add r7, r7, r1 ; next element in cpi->tp_list + str r7, [sp, #32] ; cpi->tp_list[i+1] + + cmp r10, r11 + bgt numparts_loop + + + add sp, sp, #44 + pop {r4-r11, pc} + ENDP + +_VP8_COMP_common_ + DCD vp8_comp_common +_VP8_COMMON_MBrows_ + DCD vp8_common_mb_rows +_VP8_COMP_tplist_ + DCD vp8_comp_tplist +_VP8_COMP_bc2_ + DCD vp8_comp_bc2 + + END diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm new file mode 100644 index 000000000..5269c0af8 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm @@ -0,0 +1,75 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_short_walsh4x4_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_short_walsh4x4_c(short *input, short *output, int pitch) + +|vp8_short_walsh4x4_neon| PROC + vld1.16 {d2}, [r0], r2 ;load input + vld1.16 {d3}, [r0], r2 + vld1.16 {d4}, [r0], r2 + vld1.16 {d5}, [r0], r2 + + ;First for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[3] + vadd.s16 d7, d3, d4 ;b1 = ip[1]+ip[2] + vsub.s16 d8, d3, d4 ;c1 = ip[1]-ip[2] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[3] + + vadd.s16 d2, d6, d7 ;op[0] = a1 + b1 + vsub.s16 d4, d6, d7 ;op[2] = a1 - b1 + vadd.s16 d3, d8, d9 ;op[1] = c1 + d1 + vsub.s16 d5, d9, d8 ;op[3] = d1 - c1 + + ;Second for-loop + ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12] + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12] + vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8] + vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8] + vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12] + + vadd.s16 d2, d6, d7 ;a2 = a1 + b1; + vsub.s16 d4, d6, d7 ;c2 = a1 - b1; + vadd.s16 d3, d8, d9 ;b2 = c1 + d1; + vsub.s16 d5, d9, d8 ;d2 = d1 - c1; + + vcgt.s16 q3, q1, #0 + vcgt.s16 q4, q2, #0 + + vsub.s16 q1, q1, q3 + vsub.s16 q2, q2, q4 + + vshr.s16 q1, q1, #1 + vshr.s16 q2, q2, #1 + + vst1.16 {q1, q2}, [r1] + + bx lr + + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm new file mode 100644 index 000000000..aec716e3b --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -0,0 +1,427 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. 
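The PROC that follows fuses that bilinear prediction with the variance calculation so the 17x16 intermediate never leaves the stack buffer. As a reading aid, this is roughly the scalar computation being vectorized; the helper below is a hedged sketch (its name and temp layout are illustrative), and unlike the assembly it does not special-case an xoffset or yoffset of zero, which the code below handles by skipping the corresponding pass.

    #include <stdint.h>

    /* Scalar sketch of the 16x16 sub-pixel variance: a two-pass bilinear
     * filter (taps from the bilinear_taps_coeff table: {128 - 16*offset,
     * 16*offset}) followed by a variance over the 16x16 block. */
    static unsigned int subpel_variance16x16_sketch(const unsigned char *src, int src_stride,
                                                    int xoffset, int yoffset,
                                                    const unsigned char *dst, int dst_stride,
                                                    unsigned int *sse)
    {
        unsigned char temp[17 * 16];     /* 17 filtered rows feed the vertical pass */
        const int h0 = 128 - (xoffset << 4), h1 = xoffset << 4;
        const int v0 = 128 - (yoffset << 4), v1 = yoffset << 4;
        int r, c, sum = 0;
        unsigned int sq = 0;

        for (r = 0; r < 17; r++)         /* first pass: horizontal filter, round, >> 7 */
            for (c = 0; c < 16; c++)
                temp[r * 16 + c] = (unsigned char)((src[r * src_stride + c]     * h0 +
                                                    src[r * src_stride + c + 1] * h1 + 64) >> 7);

        for (r = 0; r < 16; r++)         /* second pass: vertical filter, then diff stats */
            for (c = 0; c < 16; c++) {
                int pred = (temp[r * 16 + c] * v0 + temp[(r + 1) * 16 + c] * v1 + 64) >> 7;
                int diff = pred - dst[r * dst_stride + c];
                sum += diff;
                sq  += (unsigned int)(diff * diff);
            }

        *sse = sq;
        return sq - (unsigned int)(((int64_t)sum * sum) >> 8);  /* variance = sse - sum^2/256 */
    }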
+ +|vp8_sub_pixel_variance16x16_neon| PROC + push {r4-r6, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #16] ;load *dst_ptr from stack + ldr r5, [sp, #20] ;load dst_pixels_per_line from stack + ldr r6, [sp, #24] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! + vld1.u8 {d11, d12, d13}, [r0], r1 + + bne vp8e_filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! 
;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + sub sp, sp, #256 + mov r3, sp + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +vp8e_filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5}, [r3]! + vst1.u8 {d6, d7}, [r3]! + vmov q11, q15 + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_sp16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + sub sp, sp, #528 ;reserve space on stack for temporary storage + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r3]! ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r3]! + vst1.u8 {d18, d19}, [r3]! + vst1.u8 {d20, d21}, [r3]! 
+ + bne vp8e_filt_blk2d_fpo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + sub sp, sp, #528 ;reserve space on stack for temporary storage + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + mov r3, sp + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +vp8e_filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r3]! ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r3]! + vmov q11, q15 + vst1.u8 {d6, d7}, [r3]! + vst1.u8 {d8, d9}, [r3]! + + bne vp8e_filt_blk2d_spo16x16_loop_neon + + b sub_pixel_variance16x16_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r12, #8 + +sub_pixel_variance16x16_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q2}, [r4], r5 + vld1.8 {q1}, [r3]! + vld1.8 {q3}, [r4], r5 + + vsubl.u8 q11, d0, d4 ;diff + vsubl.u8 q12, d1, d5 + vsubl.u8 q13, d2, d6 + vsubl.u8 q14, d3, d7 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + bne sub_pixel_variance16x16_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [r6] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #528 + vmov.32 r0, d0[0] ;return + + pop {r4-r6,pc} + + ENDP + +;----------------- + AREA vp8e_bilinear_taps_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm new file mode 100644 index 000000000..3d02d7c40 --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -0,0 +1,571 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance16x16s_4_0_neon| + EXPORT |vp8_sub_pixel_variance16x16s_0_4_neon| + EXPORT |vp8_sub_pixel_variance16x16s_4_4_neon| + EXPORT |vp8_sub_pixel_variance16x16s_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_4_0_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +;First Pass: output_height lines x output_width columns (16x16) +vp8_filt_fpo16x16s_4_0_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.8 {q11}, [r2], r3 + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.8 {q12}, [r2], r3 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.8 {q13}, [r2], r3 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vext.8 q3, q2, q3, #1 + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vld1.8 {q14}, [r2], r3 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + + vsubl.u8 q4, d0, d22 ;diff + vsubl.u8 q5, d1, d23 + vsubl.u8 q6, d2, d24 + vsubl.u8 q7, d3, d25 + vsubl.u8 q0, d4, d26 + vsubl.u8 q1, d5, d27 + vsubl.u8 q2, d6, d28 + vsubl.u8 q3, d7, d29 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + subs r12, r12, #1 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_fpo16x16s_4_0_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_0_4_neon| PROC + push {lr} + + mov r12, #4 ;loop counter + + vld1.u8 {q0}, [r0], r1 ;load src data + ldr lr, [sp, #4] ;load *sse from stack + + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + +vp8_filt_spo16x16s_0_4_loop_neon + vld1.u8 {q2}, [r0], r1 + vld1.8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vld1.8 {q3}, [r2], r3 + 
vld1.u8 {q6}, [r0], r1 + vld1.8 {q5}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + + vrhadd.u8 q0, q0, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q4 + vrhadd.u8 q4, q4, q6 + vrhadd.u8 q6, q6, q15 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r12, r12, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + + vmov q0, q15 + + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne vp8_filt_spo16x16s_0_4_loop_neon + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;================================================ +;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon +;( +; unsigned char *src_ptr, r0 +; int src_pixels_per_line, r1 +; unsigned char *dst_ptr, r2 +; int dst_pixels_per_line, r3 +; unsigned int *sse +;); +;================================================ +|vp8_sub_pixel_variance16x16s_4_4_neon| PROC + push {lr} + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + + ldr lr, [sp, #4] ;load *sse from stack + vmov.i8 q13, #0 ;q8 - sum + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + + vmov.i8 q14, #0 ;q9, q10 - sse + vmov.i8 q15, #0 + + mov r12, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8_filt16x16s_4_4_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vld1.8 {q5}, [r2], r3 + vrhadd.u8 q0, q0, q1 + vld1.8 {q6}, [r2], r3 + vrhadd.u8 q1, q1, q2 + vld1.8 {q7}, [r2], r3 + vrhadd.u8 q2, q2, q3 + vld1.8 {q8}, [r2], r3 + vrhadd.u8 q3, q3, q4 + + vsubl.u8 q9, d0, d10 ;diff + vsubl.u8 q10, d1, d11 + vsubl.u8 q11, d2, d12 + vsubl.u8 q12, d3, d13 + + vsubl.u8 q0, d4, d14 ;diff + vsubl.u8 q1, d5, d15 + vsubl.u8 q5, d6, d16 + vsubl.u8 q6, d7, d17 + + vpadal.s16 q13, q9 ;sum + vmlal.s16 q14, d18, d18 ;sse + vmlal.s16 q15, d19, d19 + + vpadal.s16 q13, q10 ;sum + vmlal.s16 q14, d20, d20 ;sse + vmlal.s16 q15, d21, d21 + + vpadal.s16 q13, q11 ;sum + vmlal.s16 q14, d22, d22 ;sse + vmlal.s16 q15, d23, d23 + + vpadal.s16 q13, q12 ;sum + vmlal.s16 q14, d24, d24 ;sse + vmlal.s16 q15, d25, d25 + + subs r12, r12, #1 + + vpadal.s16 q13, q0 ;sum + vmlal.s16 q14, d0, d0 ;sse + vmlal.s16 q15, d1, d1 + + vpadal.s16 q13, q1 ;sum + vmlal.s16 q14, d2, d2 ;sse + vmlal.s16 q15, d3, d3 + + vpadal.s16 q13, q5 ;sum 
+ vmlal.s16 q14, d10, d10 ;sse + vmlal.s16 q15, d11, d11 + + vmov q0, q4 + + vpadal.s16 q13, q6 ;sum + vmlal.s16 q14, d12, d12 ;sse + vmlal.s16 q15, d13, d13 + + bne vp8_filt16x16s_4_4_loop_neon + + vadd.u32 q15, q14, q15 ;accumulate sse + vpaddl.s32 q0, q13 ;accumulate sum + + vpaddl.u32 q1, q15 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {pc} + ENDP + +;============================== +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pixels_per_line, +; stack unsigned int *sse +;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step() +;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter, +;or filter coeff is {64, 64}. This simplified program only works in this situation. +;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later. + +|vp8_sub_pixel_variance16x16s_neon| PROC + push {r4, lr} + + ldr r4, [sp, #8] ;load *dst_ptr from stack + ldr r12, [sp, #12] ;load dst_pixels_per_line from stack + ldr lr, [sp, #16] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16s_only + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq firstpass_bfilter16x16s_only + + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + sub sp, sp, #256 ;reserve space on stack for temporary storage + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + mov r3, sp + mov r2, #4 ;loop counter + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + +;First Pass: output_height lines x output_width columns (17x16) +vp8e_filt_blk2d_fp16x16s_loop_neon + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q3, q2, q3, #1 ;construct src_ptr[1] + vext.8 q5, q4, q5, #1 + vext.8 q7, q6, q7, #1 + vext.8 q9, q8, q9, #1 + + vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + + vrhadd.u8 q0, q0, q1 + vrhadd.u8 q1, q1, q2 + vrhadd.u8 q2, q2, q3 + vrhadd.u8 q3, q3, q4 + + subs r2, r2, #1 + vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result + vmov q0, q4 + vst1.u8 {d4, d5, d6, d7}, [r3]! 
+ + bne vp8e_filt_blk2d_fp16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;-------------------- +firstpass_bfilter16x16s_only + mov r2, #2 ;loop counter + sub sp, sp, #256 ;reserve space on stack for temporary storage + mov r3, sp + +;First Pass: output_height lines x output_width columns (16x16) +vp8e_filt_blk2d_fpo16x16s_loop_neon + vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data + vld1.u8 {d4, d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14, d15}, [r0], r1 + + ;pld [r0] + ;pld [r0, r1] + ;pld [r0, r1, lsl #1] + + vext.8 q1, q0, q1, #1 ;construct src_ptr[1] + vld1.u8 {d16, d17, d18, d19}, [r0], r1 + vext.8 q3, q2, q3, #1 + vld1.u8 {d20, d21, d22, d23}, [r0], r1 + vext.8 q5, q4, q5, #1 + vld1.u8 {d24, d25, d26, d27}, [r0], r1 + vext.8 q7, q6, q7, #1 + vld1.u8 {d28, d29, d30, d31}, [r0], r1 + vext.8 q9, q8, q9, #1 + vext.8 q11, q10, q11, #1 + vext.8 q13, q12, q13, #1 + vext.8 q15, q14, q15, #1 + + vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 + vrhadd.u8 q1, q2, q3 + vrhadd.u8 q2, q4, q5 + vrhadd.u8 q3, q6, q7 + vrhadd.u8 q4, q8, q9 + vrhadd.u8 q5, q10, q11 + vrhadd.u8 q6, q12, q13 + vrhadd.u8 q7, q14, q15 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_fpo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;--------------------- +secondpass_bfilter16x16s_only + sub sp, sp, #256 ;reserve space on stack for temporary storage + + mov r2, #2 ;loop counter + vld1.u8 {d0, d1}, [r0], r1 ;load src data + mov r3, sp + +vp8e_filt_blk2d_spo16x16s_loop_neon + vld1.u8 {d2, d3}, [r0], r1 + vld1.u8 {d4, d5}, [r0], r1 + vld1.u8 {d6, d7}, [r0], r1 + vld1.u8 {d8, d9}, [r0], r1 + + vrhadd.u8 q0, q0, q1 + vld1.u8 {d10, d11}, [r0], r1 + vrhadd.u8 q1, q1, q2 + vld1.u8 {d12, d13}, [r0], r1 + vrhadd.u8 q2, q2, q3 + vld1.u8 {d14, d15}, [r0], r1 + vrhadd.u8 q3, q3, q4 + vld1.u8 {d16, d17}, [r0], r1 + vrhadd.u8 q4, q4, q5 + vrhadd.u8 q5, q5, q6 + vrhadd.u8 q6, q6, q7 + vrhadd.u8 q7, q7, q8 + + subs r2, r2, #1 + + vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result + vmov q0, q8 + vst1.u8 {d4, d5, d6, d7}, [r3]! + vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result + vst1.u8 {d12, d13, d14, d15}, [r3]! + + bne vp8e_filt_blk2d_spo16x16s_loop_neon + + b sub_pixel_variance16x16s_neon + +;---------------------------- +;variance16x16 +sub_pixel_variance16x16s_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + sub r3, r3, #256 + mov r2, #4 + +sub_pixel_variance16x16s_neon_loop + vld1.8 {q0}, [r3]! ;Load up source and reference + vld1.8 {q1}, [r4], r12 + vld1.8 {q2}, [r3]! + vld1.8 {q3}, [r4], r12 + vld1.8 {q4}, [r3]! + vld1.8 {q5}, [r4], r12 + vld1.8 {q6}, [r3]! 
+ vld1.8 {q7}, [r4], r12 + + vsubl.u8 q11, d0, d2 ;diff + vsubl.u8 q12, d1, d3 + vsubl.u8 q13, d4, d6 + vsubl.u8 q14, d5, d7 + vsubl.u8 q0, d8, d10 + vsubl.u8 q1, d9, d11 + vsubl.u8 q2, d12, d14 + vsubl.u8 q3, d13, d15 + + vpadal.s16 q8, q11 ;sum + vmlal.s16 q9, d22, d22 ;sse + vmlal.s16 q10, d23, d23 + + subs r2, r2, #1 + + vpadal.s16 q8, q12 + vmlal.s16 q9, d24, d24 + vmlal.s16 q10, d25, d25 + vpadal.s16 q8, q13 + vmlal.s16 q9, d26, d26 + vmlal.s16 q10, d27, d27 + vpadal.s16 q8, q14 + vmlal.s16 q9, d28, d28 + vmlal.s16 q10, d29, d29 + + vpadal.s16 q8, q0 ;sum + vmlal.s16 q9, d0, d0 ;sse + vmlal.s16 q10, d1, d1 + vpadal.s16 q8, q1 + vmlal.s16 q9, d2, d2 + vmlal.s16 q10, d3, d3 + vpadal.s16 q8, q2 + vmlal.s16 q9, d4, d4 + vmlal.s16 q10, d5, d5 + vpadal.s16 q8, q3 + vmlal.s16 q9, d6, d6 + vmlal.s16 q10, d7, d7 + + bne sub_pixel_variance16x16s_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #8 + vsub.s32 d0, d1, d10 + + add sp, sp, #256 + vmov.32 r0, d0[0] ;return + + pop {r4, pc} + ENDP + + END diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm new file mode 100644 index 000000000..bd56761fa --- /dev/null +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm @@ -0,0 +1,226 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_sub_pixel_variance8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pixels_per_line, +; stack(r6) unsigned int *sse +;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon. 
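The 8x8 routine below follows the same two-pass structure as the 16x16 version above, with a 9x8 intermediate block and one notable difference in the final reduction: the squared sum of differences is divided by the pixel count of the block, which is why this file shifts by 6 (64 pixels) where the 16x16 code shifts by 8 (256 pixels). A minimal sketch of that reduction, with an illustrative helper name:

    /* variance = sse - sum^2 / N, where N is the number of pixels in the block */
    static unsigned int variance_reduce_sketch(unsigned int sse, int sum, int log2_pixels)
    {
        return sse - (unsigned int)(((long long)sum * sum) >> log2_pixels);
    }

Here log2_pixels would be 6, matching the "vshr.s32 d10, d10, #6" at the end of this routine, while the 16x16 routines correspond to a shift of 8.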
+ +|vp8_sub_pixel_variance8x8_neon| PROC + push {r4-r5, lr} + + ldr r12, _BilinearTaps_coeff_ + ldr r4, [sp, #12] ;load *dst_ptr from stack + ldr r5, [sp, #16] ;load dst_pixels_per_line from stack + ldr lr, [sp, #20] ;load *sse from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + ;skip_secondpass_filter + beq sub_pixel_variance8x8_neon + + add r3, r12, r3, lsl #3 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q2, #7 + vqrshrn.u16 d24, q3, #7 + vqrshrn.u16 d25, q4, #7 + vqrshrn.u16 d26, q5, #7 + vqrshrn.u16 d27, q6, #7 + vqrshrn.u16 d28, q7, #7 + vqrshrn.u16 d29, q8, #7 + + b sub_pixel_variance8x8_neon + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;---------------------- +;vp8_variance8x8_neon +sub_pixel_variance8x8_neon + vmov.i8 q8, #0 ;q8 - sum + vmov.i8 q9, #0 ;q9, q10 - sse + vmov.i8 q10, #0 + + mov r12, #2 + +sub_pixel_variance8x8_neon_loop + vld1.8 {d0}, [r4], r5 ;load dst data + subs r12, r12, #1 + vld1.8 {d1}, [r4], 
r5 + vld1.8 {d2}, [r4], r5 + vsubl.u8 q4, d22, d0 ;calculate diff + vld1.8 {d3}, [r4], r5 + + vsubl.u8 q5, d23, d1 + vsubl.u8 q6, d24, d2 + + vpadal.s16 q8, q4 ;sum + vmlal.s16 q9, d8, d8 ;sse + vmlal.s16 q10, d9, d9 + + vsubl.u8 q7, d25, d3 + + vpadal.s16 q8, q5 + vmlal.s16 q9, d10, d10 + vmlal.s16 q10, d11, d11 + + vmov q11, q13 + + vpadal.s16 q8, q6 + vmlal.s16 q9, d12, d12 + vmlal.s16 q10, d13, d13 + + vmov q12, q14 + + vpadal.s16 q8, q7 + vmlal.s16 q9, d14, d14 + vmlal.s16 q10, d15, d15 + + bne sub_pixel_variance8x8_neon_loop + + vadd.u32 q10, q9, q10 ;accumulate sse + vpaddl.s32 q0, q8 ;accumulate sum + + vpaddl.u32 q1, q10 + vadd.s64 d0, d0, d1 + vadd.u64 d1, d2, d3 + + vmull.s32 q5, d0, d0 + vst1.32 {d1[0]}, [lr] ;store sse + vshr.s32 d10, d10, #6 + vsub.s32 d0, d1, d10 + + vmov.32 r0, d0[0] ;return + pop {r4-r5, pc} + + ENDP + +;----------------- + AREA bilinear_taps_dat, DATA, READWRITE ;read/write by default +;Data section with name data_area is specified. DCD reserves space in memory for 48 data. +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +_BilinearTaps_coeff_ + DCD bilinear_taps_coeff +bilinear_taps_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp8/encoder/arm/picklpf_arm.c b/vp8/encoder/arm/picklpf_arm.c new file mode 100644 index 000000000..0586e55d8 --- /dev/null +++ b/vp8/encoder/arm/picklpf_arm.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "onyxc_int.h" +#include "onyx_int.h" +#include "quantize.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12extend.h" +#include "vpx_scale/vpxscale.h" +#include "alloccommon.h" + +extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); + + +void +vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) +{ + unsigned char *src_y, *dst_y; + int yheight; + int ystride; + int border; + int yoffset; + int linestocopy; + + border = src_ybc->border; + yheight = src_ybc->y_height; + ystride = src_ybc->y_stride; + + linestocopy = (yheight >> (Fraction + 4)); + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + yoffset = ystride * ((yheight >> 5) * 16 - 8); + src_y = src_ybc->y_buffer + yoffset; + dst_y = dst_ybc->y_buffer + yoffset; + + //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16)); + vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16))); +} diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c new file mode 100644 index 000000000..46906d3a2 --- /dev/null +++ b/vp8/encoder/arm/quantize_arm.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <math.h> +#include "vpx_mem/vpx_mem.h" + +#include "quantize.h" +#include "entropy.h" +#include "predictdc.h" + +DECLARE_ALIGNED(16, const short, vp8_rvsplus1_default_zig_zag1d[16]) = +{ + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16, +}; + + +extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dqcoeff_ptr, short *dequant_ptr, const short *scan_mask, short *round_ptr, short *quant_ptr); + +void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) +{ + d->eob = vp8_fast_quantize_b_neon_func(b->coeff, &b->zbin[0][0], d->qcoeff, d->dqcoeff, d->dequant[0], vp8_rvsplus1_default_zig_zag1d, &b->round[0][0], &b->quant[0][0]); +} + +/* +//neon code is written according to the following rewritten c code +void vp8_fast_quantize_b_neon(BLOCK *b,BLOCKD *d) +{ + int i, rc, eob; + int zbin; + int x, x1, y, z, sz; + short *coeff_ptr = &b->Coeff[0]; + short *zbin_ptr = &b->Zbin[0][0]; + short *round_ptr = &b->Round[0][0]; + short *quant_ptr = &b->Quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr= d->dqcoeff; + short *dequant_ptr= &d->Dequant[0][0]; + + eob = 0; + + for(i=0;i<16;i++) + { + z = coeff_ptr[i]; + zbin = zbin_ptr[i] ; + x = abs(z); // x = abs(z) + + if(x>=zbin) + { + sz = (z>>31); // sign of z + y = ((x+round_ptr[i])*quant_ptr[i])>>16; // quantize (x) + x1 = (y^sz) - sz; // get the sign back + + qcoeff_ptr[i] = x1; // write to destination + dqcoeff_ptr[i] = x1 * dequant_ptr[i]; // dequantized value + + if(y) + { + if(eob<vp8_rvsplus1_default_zig_zag1d[i]) + eob=(int)vp8_rvsplus1_default_zig_zag1d[i]; // last nonzero coeffs + } + }else + { + qcoeff_ptr[i] = 0; // write to destination + dqcoeff_ptr[i] = 0; // dequantized value + } + } + d->eob = eob; +} +*/ diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h new file mode 100644 index 000000000..e93f0fef1 --- /dev/null +++ b/vp8/encoder/arm/quantize_arm.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef QUANTIZE_ARM_H +#define QUANTIZE_ARM_H + +#if HAVE_ARMV7 +extern prototype_quantize_block(vp8_fast_quantize_b_neon); + +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon + +#endif + +#endif diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h new file mode 100644 index 000000000..d9fc9b3e0 --- /dev/null +++ b/vp8/encoder/arm/variance_arm.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef VARIANCE_ARM_H +#define VARIANCE_ARM_H + +#if HAVE_ARMV7 +extern prototype_sad(vp8_sad4x4_neon); +extern prototype_sad(vp8_sad8x8_neon); +extern prototype_sad(vp8_sad8x16_neon); +extern prototype_sad(vp8_sad16x8_neon); +extern prototype_sad(vp8_sad16x16_neon); + +//extern prototype_variance(vp8_variance4x4_c); +extern prototype_variance(vp8_variance8x8_neon); +extern prototype_variance(vp8_variance8x16_neon); +extern prototype_variance(vp8_variance16x8_neon); +extern prototype_variance(vp8_variance16x16_neon); + +//extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_c); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon); +//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c); +//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon); + +//extern prototype_getmbss(vp8_get_mb_ss_c); +extern prototype_variance(vp8_mse16x16_neon); +extern prototype_sad(vp8_get16x16pred_error_neon); +//extern prototype_variance2(vp8_get8x8var_c); +//extern prototype_variance2(vp8_get16x16var_c); +extern prototype_sad(vp8_get4x4sse_cs_neon); + +#undef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_neon + +#undef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_neon + +#undef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_neon + +#undef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_neon + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_neon + +//#undef vp8_variance_var4x4 +//#define vp8_variance_var4x4 vp8_variance4x4_c + +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_neon + +#undef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_neon + +#undef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_neon + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_neon + +//#undef vp8_variance_subpixvar4x4 +//#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c + +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_neon + +//#undef vp8_variance_subpixvar8x16 +//#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c + +//#undef vp8_variance_subpixvar16x8 +//#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon + +//#undef vp8_variance_getmbss +//#define vp8_variance_getmbss vp8_get_mb_ss_c + +#undef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_neon + +#undef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon + +//#undef vp8_variance_get8x8var +//#define vp8_variance_get8x8var vp8_get8x8var_c + +//#undef vp8_variance_get16x16var +//#define vp8_variance_get16x16var vp8_get16x16var_c + +#undef vp8_variance_get4x4sse_cs +#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon + +#endif + +#endif diff --git a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c new file mode 100644 index 000000000..8cdf0791f --- /dev/null +++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include <stddef.h> + +#include "../treewriter.h" +#include "../tokenize.h" +#include "../onyx_int.h" + +#define ct_assert(name,cond) \ + static void assert_##name(void) UNUSED;\ + static void assert_##name(void) {switch(0){case 0:case !!(cond):;}} + +#define DEFINE(sym, val) int sym = val; + +/* +#define BLANK() asm volatile("\n->" : : ) +*/ + +/* + * int main(void) + * { + */ + +DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue)); +DEFINE(vp8_writer_range, offsetof(vp8_writer, range)); +DEFINE(vp8_writer_value, offsetof(vp8_writer, value)); +DEFINE(vp8_writer_count, offsetof(vp8_writer, count)); +DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos)); +DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer)); + +DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token)); +DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra)); +DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree)); +DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node)); +DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA)); + +DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct)); + +DEFINE(vp8_token_value, offsetof(vp8_token, value)); +DEFINE(vp8_token_len, offsetof(vp8_token, Len)); + +DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree)); +DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob)); +DEFINE(vp8_extra_bit_struct_prob_bc, offsetof(vp8_extra_bit_struct, prob_bc)); +DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len)); +DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val)); + +DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist)); +DEFINE(vp8_comp_common, offsetof(VP8_COMP, common)); +DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2)); + +DEFINE(tokenlist_start, offsetof(TOKENLIST, start)); +DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop)); +DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST)); + +DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows)); + +// These two sizes are used in vp7cx_pack_tokens. They are hard coded +// so if the size changes this will have to be adjusted. +ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20) +ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20) + +//add asserts for any offset that is not supported by assembly code +//add asserts for any size that is not supported by assembly code +/* + * return 0; + * } + */ diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c new file mode 100644 index 000000000..31ad56a2a --- /dev/null +++ b/vp8/encoder/bitstream.c @@ -0,0 +1,1719 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "header.h" +#include "encodemv.h" +#include "entropymode.h" +#include "findnearmv.h" +#include "mcomp.h" +#include "systemdependent.h" +#include <assert.h> +#include <stdio.h> +#include "pragmas.h" +#include "vpx_mem/vpx_mem.h" +#include "bitstream.h" + +const int vp8cx_base_skip_false_prob[128] = +{ + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 251, 248, 244, 240, 236, 232, 229, 225, + 221, 217, 213, 208, 204, 199, 194, 190, + 187, 183, 179, 175, 172, 168, 164, 160, + 157, 153, 149, 145, 142, 138, 134, 130, + 127, 124, 120, 117, 114, 110, 107, 104, + 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 62, 59, 56, + 53, 50, 47, 44, 41, 38, 35, 32, + 30, 28, 26, 24, 22, 20, 18, 16, +}; +#ifdef VP8REF +#define __int64 long long +#endif + +#if defined(SECTIONBITS_OUTPUT) +unsigned __int64 Sectionbits[500]; +#endif + +#ifdef ENTROPY_STATS +int intra_mode_stats[10][10][10]; +static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] [2]; +extern unsigned int active_section; +#endif + +#ifdef MODE_STATS +int count_mb_seg[4] = { 0, 0, 0, 0 }; +#endif + +#if CONFIG_BIG_ENDIAN +# define make_endian_16(a) \ + (((unsigned int)(a & 0xff)) << 8) | (((unsigned int)(a & 0xff00)) >> 8) +# define make_endian_32(a) \ + (((unsigned int)(a & 0xff)) << 24) | (((unsigned int)(a & 0xff00)) << 8) | \ + (((unsigned int)(a & 0xff0000)) >> 8) | (((unsigned int)(a & 0xff000000)) >> 24) +#else +# define make_endian_16(a) a +# define make_endian_32(a) a +#endif + +static void update_mode( + vp8_writer *const w, + int n, + vp8_token tok [/* n */], + vp8_tree tree, + vp8_prob Pnew [/* n-1 */], + vp8_prob Pcur [/* n-1 */], + unsigned int bct [/* n-1 */] [2], + const unsigned int num_events[/* n */] +) +{ + unsigned int new_b = 0, old_b = 0; + int i = 0; + + vp8_tree_probs_from_distribution( + n--, tok, tree, + Pnew, bct, num_events, + 256, 1 + ); + + do + { + new_b += vp8_cost_branch(bct[i], Pnew[i]); + old_b += vp8_cost_branch(bct[i], Pcur[i]); + } + while (++i < n); + + if (new_b + (n << 8) < old_b) + { + int i = 0; + + vp8_write_bit(w, 1); + + do + { + const vp8_prob p = Pnew[i]; + + vp8_write_literal(w, Pcur[i] = p ? 
p : 1, 8); + } + while (++i < n); + } + else + vp8_write_bit(w, 0); +} + +static void update_mbintra_mode_probs(VP8_COMP *cpi) +{ + VP8_COMMON *const x = & cpi->common; + + vp8_writer *const w = & cpi->bc; + + { + vp8_prob Pnew [VP8_YMODES-1]; + unsigned int bct [VP8_YMODES-1] [2]; + + update_mode( + w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree, + Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count + ); + } + { + vp8_prob Pnew [VP8_UV_MODES-1]; + unsigned int bct [VP8_UV_MODES-1] [2]; + + update_mode( + w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree, + Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count + ); + } +} + +static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p) +{ + vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m); +} + +static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) +{ + vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m); +} + +static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p) +{ + vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m); +} + + +static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p) +{ + vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m); +} + +static void write_split(vp8_writer *bc, int x) +{ + vp8_write_token( + bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x + ); +} + +static const unsigned int norm[256] = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) +{ + const TOKENEXTRA *const stop = p + xcount; + unsigned int split; + unsigned int shift; + int count = w->count; + unsigned int range = w->range; + unsigned int lowvalue = w->lowvalue; + + while (p < stop) + { + const int t = p->Token; + vp8_token *const a = vp8_coef_encodings + t; + const vp8_extra_bit_struct *const b = vp8_extra_bits + t; + int i = 0; + const unsigned char *pp = p->context_tree; + int v = a->value; + int n = a->Len; + + if (p->skip_eob_node) + { + n--; + i = 2; + } + + do + { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = vp8_coef_tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + + + if (b->base_val) + { + const int e = p->Extra, 
L = b->Len; + + if (L) + { + const unsigned char *pp = b->prob; + int v = e >> 1; + int n = L; /* number of bits in v, assumed nonzero */ + int i = 0; + + do + { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = b->tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + } + + + { + + split = (range + 1) >> 1; + + if (e & 1) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + range <<= 1; + + if ((lowvalue & 0x80000000)) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + + } + + lowvalue <<= 1; + + if (!++count) + { + count = -8; + w->buffer[w->pos++] = (lowvalue >> 24); + lowvalue &= 0xffffff; + } + } + + } + + ++p; + } + + w->count = count; + w->lowvalue = lowvalue; + w->range = range; + +} + +static void write_partition_size(unsigned char *cx_data, int size) +{ + signed char csize; + + csize = size & 0xff; + *cx_data = csize; + csize = (size >> 8) & 0xff; + *(cx_data + 1) = csize; + csize = (size >> 16) & 0xff; + *(cx_data + 2) = csize; + +} + +static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, int num_part, int *size) +{ + + int i; + unsigned char *ptr = cx_data; + unsigned int shift; + vp8_writer *w = &cpi->bc2; + *size = 3 * (num_part - 1); + ptr = cx_data + (*size); + + for (i = 0; i < num_part; i++) + { + vp8_start_encode(w, ptr); + { + unsigned int split; + int count = w->count; + unsigned int range = w->range; + unsigned int lowvalue = w->lowvalue; + int mb_row; + + for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part) + { + TOKENEXTRA *p = cpi->tplist[mb_row].start; + TOKENEXTRA *stop = cpi->tplist[mb_row].stop; + + while (p < stop) + { + const int t = p->Token; + vp8_token *const a = vp8_coef_encodings + t; + const vp8_extra_bit_struct *const b = vp8_extra_bits + t; + int i = 0; + const unsigned char *pp = p->context_tree; + int v = a->value; + int n = a->Len; + + if (p->skip_eob_node) + { + n--; + i = 2; + } + + do + { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = vp8_coef_tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + + + if (b->base_val) + { + const int e = p->Extra, L = b->Len; + + if (L) + { + const unsigned char *pp = b->prob; + int v = e >> 1; + int n = L; /* number of bits in v, assumed nonzero */ + int i = 0; + + do + { + const int bb = (v >> 
--n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = b->tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + } + + { + split = (range + 1) >> 1; + + if (e & 1) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + range <<= 1; + + if ((lowvalue & 0x80000000)) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + + } + + lowvalue <<= 1; + + if (!++count) + { + count = -8; + w->buffer[w->pos++] = (lowvalue >> 24); + lowvalue &= 0xffffff; + } + } + + } + + ++p; + } + } + + w->count = count; + w->lowvalue = lowvalue; + w->range = range; + + } + + vp8_stop_encode(w); + *size += w->pos; + + if (i < (num_part - 1)) + { + write_partition_size(cx_data, w->pos); + cx_data += 3; + ptr += w->pos; + } + } +} + + +static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w) +{ + + unsigned int split; + int count = w->count; + unsigned int range = w->range; + unsigned int lowvalue = w->lowvalue; + unsigned int shift; + int mb_row; + + for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++) + { + TOKENEXTRA *p = cpi->tplist[mb_row].start; + TOKENEXTRA *stop = cpi->tplist[mb_row].stop; + + while (p < stop) + { + const int t = p->Token; + vp8_token *const a = vp8_coef_encodings + t; + const vp8_extra_bit_struct *const b = vp8_extra_bits + t; + int i = 0; + const unsigned char *pp = p->context_tree; + int v = a->value; + int n = a->Len; + + if (p->skip_eob_node) + { + n--; + i = 2; + } + + do + { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = vp8_coef_tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + + + if (b->base_val) + { + const int e = p->Extra, L = b->Len; + + if (L) + { + const unsigned char *pp = b->prob; + int v = e >> 1; + int n = L; /* number of bits in v, assumed nonzero */ + int i = 0; + + do + { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i>>1]) >> 8); + i = b->tree[i+bb]; + + if (bb) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + shift = norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + w->buffer[w->pos++] = (lowvalue >> (24 - 
offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + } + while (n); + } + + { + split = (range + 1) >> 1; + + if (e & 1) + { + lowvalue += split; + range = range - split; + } + else + { + range = split; + } + + range <<= 1; + + if ((lowvalue & 0x80000000)) + { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) + { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + + } + + lowvalue <<= 1; + + if (!++count) + { + count = -8; + w->buffer[w->pos++] = (lowvalue >> 24); + lowvalue &= 0xffffff; + } + } + + } + + ++p; + } + } + + w->count = count; + w->lowvalue = lowvalue; + w->range = range; + +} + +static void write_mv_ref +( + vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p +) +{ + + assert(NEARESTMV <= m && m <= SPLITMV); + + vp8_write_token(w, vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m); +} + +static void write_sub_mv_ref +( + vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p +) +{ + assert(LEFT4X4 <= m && m <= NEW4X4); + + vp8_write_token(w, vp8_sub_mv_ref_tree, p, VP8_SUBMVREFENCODINGS + m); +} + +static void write_mv +( + vp8_writer *w, const MV *mv, const MV *ref, const MV_CONTEXT *mvc +) +{ + MV e; + e.row = mv->row - ref->row; + e.col = mv->col - ref->col; + + vp8_encode_motion_vector(w, &e, mvc); +} + +static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x) +{ + // Encode the MB segment id. + if (x->segmentation_enabled && x->update_mb_segmentation_map) + { + switch (mi->segment_id) + { + case 0: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[1]); + break; + case 1: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 1, x->mb_segment_tree_probs[1]); + break; + case 2: + vp8_write(w, 1, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[2]); + break; + case 3: + vp8_write(w, 1, x->mb_segment_tree_probs[0]); + vp8_write(w, 1, x->mb_segment_tree_probs[2]); + break; + + // TRAP.. This should not happen + default: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[1]); + break; + } + } +} + + +static void pack_inter_mode_mvs(VP8_COMP *const cpi) +{ + VP8_COMMON *const pc = & cpi->common; + vp8_writer *const w = & cpi->bc; + const MV_CONTEXT *mvc = pc->fc.mvc; + + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + MODE_INFO *m = pc->mi, *ms; + const int mis = pc->mode_info_stride; + int mb_row = -1; + + int prob_last_coded; + int prob_gf_coded; + int prob_skip_false = 0; + ms = pc->mi - 1; + + // Calculate the probabilities to be used to code the reference frame based on actual useage this frame + if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) + cpi->prob_intra_coded = 1; + + prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + if (!prob_last_coded) + prob_last_coded = 1; + + prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (!prob_gf_coded) + prob_gf_coded = 1; + + +#ifdef ENTROPY_STATS + active_section = 1; +#endif + + if (pc->mb_no_coeff_skip) + { + prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count); + + if (prob_skip_false <= 1) + prob_skip_false = 1; + + if (prob_skip_false > 255) + prob_skip_false = 255; + + cpi->prob_skip_false = prob_skip_false; + vp8_write_literal(w, prob_skip_false, 8); + } + + vp8_write_literal(w, cpi->prob_intra_coded, 8); + vp8_write_literal(w, prob_last_coded, 8); + vp8_write_literal(w, prob_gf_coded, 8); + + update_mbintra_mode_probs(cpi); + + vp8_write_mvprobs(cpi); + + while (++mb_row < pc->mb_rows) + { + int mb_col = -1; + + while (++mb_col < pc->mb_cols) + { + const MB_MODE_INFO *const mi = & m->mbmi; + const MV_REFERENCE_FRAME rf = mi->ref_frame; + const MB_PREDICTION_MODE mode = mi->mode; + + MACROBLOCKD *xd = &cpi->mb.e_mbd; + + // Distance of Mb to the various image edges. + // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3; + xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + +#ifdef ENTROPY_STATS + active_section = 9; +#endif + + if (cpi->mb.e_mbd.update_mb_segmentation_map) + write_mb_features(w, mi, &cpi->mb.e_mbd); + + if (pc->mb_no_coeff_skip) + vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false); + + if (rf == INTRA_FRAME) + { + vp8_write(w, 0, cpi->prob_intra_coded); +#ifdef ENTROPY_STATS + active_section = 6; +#endif + write_ymode(w, mode, pc->fc.ymode_prob); + + if (mode == B_PRED) + { + int j = 0; + + do + write_bmode(w, m->bmi[j].mode, pc->fc.bmode_prob); + + while (++j < 16); + } + + write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); + } + else /* inter coded */ + { + MV best_mv; + vp8_prob mv_ref_p [VP8_MVREFS-1]; + + vp8_write(w, 1, cpi->prob_intra_coded); + + if (rf == LAST_FRAME) + vp8_write(w, 0, prob_last_coded); + else + { + vp8_write(w, 1, prob_last_coded); + vp8_write(w, (rf == GOLDEN_FRAME) ? 
0 : 1, prob_gf_coded); + } + + { + MV n1, n2; + int ct[4]; + + vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias); + vp8_mv_ref_probs(mv_ref_p, ct); + +#ifdef ENTROPY_STATS + accum_mv_refs(mode, ct); +#endif + + } + +#ifdef ENTROPY_STATS + active_section = 3; +#endif + + write_mv_ref(w, mode, mv_ref_p); + + switch (mode) /* new, split require MVs */ + { + case NEWMV: + +#ifdef ENTROPY_STATS + active_section = 5; +#endif + + write_mv(w, &mi->mv.as_mv, &best_mv, mvc); + break; + + case SPLITMV: + { + int j = 0; + +#ifdef MODE_STATS + ++count_mb_seg [mi->partitioning]; +#endif + + write_split(w, mi->partitioning); + + do + { + const B_MODE_INFO *const b = mi->partition_bmi + j; + const int *const L = vp8_mbsplits [mi->partitioning]; + int k = -1; /* first block in subset j */ + int mv_contz; + + while (j != L[++k]) + if (k >= 16) + assert(0); + + mv_contz = vp8_mv_cont + (&(vp8_left_bmi(m, k)->mv.as_mv), + &(vp8_above_bmi(m, k, mis)->mv.as_mv)); + write_sub_mv_ref(w, b->mode, vp8_sub_mv_ref_prob2 [mv_contz]); //pc->fc.sub_mv_ref_prob); + + if (b->mode == NEW4X4) + { +#ifdef ENTROPY_STATS + active_section = 11; +#endif + write_mv(w, &b->mv.as_mv, &best_mv, (const MV_CONTEXT *) mvc); + } + } + while (++j < mi->partition_count); + } + break; + default: + break; + } + } + + ++m; + } + + ++m; /* skip L prediction border */ + } +} + + +static void write_kfmodes(VP8_COMP *cpi) +{ + vp8_writer *const bc = & cpi->bc; + const VP8_COMMON *const c = & cpi->common; + /* const */ + MODE_INFO *m = c->mi; + + int mb_row = -1; + int prob_skip_false = 0; + + if (c->mb_no_coeff_skip) + { + prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count); + + if (prob_skip_false <= 1) + prob_skip_false = 1; + + if (prob_skip_false >= 255) + prob_skip_false = 255; + + cpi->prob_skip_false = prob_skip_false; + vp8_write_literal(bc, prob_skip_false, 8); + } + + while (++mb_row < c->mb_rows) + { + int mb_col = -1; + + while (++mb_col < c->mb_cols) + { + const int ym = m->mbmi.mode; + + if (cpi->mb.e_mbd.update_mb_segmentation_map) + write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd); + + if (c->mb_no_coeff_skip) + vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false); + + kfwrite_ymode(bc, ym, c->kf_ymode_prob); + + if (ym == B_PRED) + { + const int mis = c->mode_info_stride; + int i = 0; + + do + { + const B_PREDICTION_MODE A = vp8_above_bmi(m, i, mis)->mode; + const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode; + const int bm = m->bmi[i].mode; + +#ifdef ENTROPY_STATS + ++intra_mode_stats [A] [L] [bm]; +#endif + + write_bmode(bc, bm, c->kf_bmode_prob [A] [L]); + } + while (++i < 16); + } + + write_uv_mode(bc, (m++)->mbmi.uv_mode, c->kf_uv_mode_prob); + } + + m++; // skip L prediction border + } +} +int vp8_estimate_entropy_savings(VP8_COMP *cpi) +{ + int i = 0; + int savings = 0; + + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + int new_intra, new_last, gf_last, oldtotal, newtotal; + int ref_frame_cost[MAX_REF_FRAMES]; + + vp8_clear_system_state(); //__asm emms; + + if (cpi->common.frame_type != KEY_FRAME) + { + if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter))) + new_intra = 1; + + new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + // new costs + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra); + ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra) + + vp8_cost_zero(new_last); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra) + + vp8_cost_one(new_last) + + vp8_cost_zero(gf_last); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra) + + vp8_cost_one(new_last) + + vp8_cost_one(gf_last); + + newtotal = + rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + + rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] + + rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] + + rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME]; + + + // old costs + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); + ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(cpi->prob_last_coded); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_zero(cpi->prob_gf_coded); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_one(cpi->prob_gf_coded); + + oldtotal = + rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + + rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] + + rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] + + rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME]; + + savings += (oldtotal - newtotal) / 256; + } + + + do + { + int j = 0; + + do + { + int k = 0; + + do + { + /* at every context */ + + /* calc probs and branch cts for this frame only */ + //vp8_prob new_p [vp8_coef_tokens-1]; + //unsigned int branch_ct [vp8_coef_tokens-1] [2]; + + int t = 0; /* token/prob index */ + + vp8_tree_probs_from_distribution( + vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree, + cpi->frame_coef_probs [i][j][k], cpi->frame_branch_ct [i][j][k], cpi->coef_counts [i][j][k], + 256, 1 + ); + + do + { + const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t]; + const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t]; + + const vp8_prob old = cpi->common.fc.coef_probs [i][j][k][t]; + const vp8_prob upd = vp8_coef_update_probs [i][j][k][t]; + + const int old_b = vp8_cost_branch(ct, old); + const int new_b = vp8_cost_branch(ct, newp); + + const int update_b = 8 + + ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8); + + const int s = old_b - new_b - update_b; + + if (s > 0) + savings += s; + + + } + while (++t < vp8_coef_tokens - 1); + + + } + while (++k < PREV_COEF_CONTEXTS); + } + while (++j < COEF_BANDS); + } + while (++i < BLOCK_TYPES); + + return savings; +} + +static void update_coef_probs(VP8_COMP *cpi) +{ + int i = 0; + vp8_writer *const w = & cpi->bc; + int savings = 0; + + vp8_clear_system_state(); //__asm emms; + + + do + { + int j = 0; + + do + { + int k = 0; + + do + { + //note: use result from vp8_estimate_entropy_savings, so no need to call vp8_tree_probs_from_distribution here. 
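/* For each branch of the coefficient token tree, the cost of this frame's
   branch counts under the old probability (old_b) is compared with the cost
   under the newly measured probability (new_b). The new value is transmitted
   only when the saving exceeds update_b -- the 8-bit literal plus the expected
   cost of the update flag -- otherwise a single "keep old probability" flag
   bit is written. */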
+ /* at every context */ + + /* calc probs and branch cts for this frame only */ + //vp8_prob new_p [vp8_coef_tokens-1]; + //unsigned int branch_ct [vp8_coef_tokens-1] [2]; + + int t = 0; /* token/prob index */ + + //vp8_tree_probs_from_distribution( + // vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree, + // new_p, branch_ct, (unsigned int *)cpi->coef_counts [i][j][k], + // 256, 1 + // ); + + do + { + const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t]; + const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t]; + + vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t; + const vp8_prob old = *Pold; + const vp8_prob upd = vp8_coef_update_probs [i][j][k][t]; + + const int old_b = vp8_cost_branch(ct, old); + const int new_b = vp8_cost_branch(ct, newp); + + const int update_b = 8 + + ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8); + + const int s = old_b - new_b - update_b; + const int u = s > 0 ? 1 : 0; + + vp8_write(w, u, upd); + + +#ifdef ENTROPY_STATS + ++ tree_update_hist [i][j][k][t] [u]; +#endif + + if (u) + { + /* send/use new probability */ + + *Pold = newp; + vp8_write_literal(w, newp, 8); + + savings += s; + + } + + } + while (++t < vp8_coef_tokens - 1); + + /* Accum token counts for generation of default statistics */ +#ifdef ENTROPY_STATS + t = 0; + + do + { + context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t]; + } + while (++t < vp8_coef_tokens); + +#endif + + } + while (++k < PREV_COEF_CONTEXTS); + } + while (++j < COEF_BANDS); + } + while (++i < BLOCK_TYPES); + +} +#ifdef PACKET_TESTING +FILE *vpxlogc = 0; +#endif + +static void put_delta_q(vp8_writer *bc, int delta_q) +{ + if (delta_q != 0) + { + vp8_write_bit(bc, 1); + vp8_write_literal(bc, abs(delta_q), 4); + + if (delta_q < 0) + vp8_write_bit(bc, 1); + else + vp8_write_bit(bc, 0); + } + else + vp8_write_bit(bc, 0); +} + +void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) +{ + int i, j; + VP8_HEADER oh; + VP8_COMMON *const pc = & cpi->common; + vp8_writer *const bc = & cpi->bc; + MACROBLOCKD *const xd = & cpi->mb.e_mbd; + int extra_bytes_packed = 0; + + unsigned char *cx_data = dest; + const int *mb_feature_data_bits; + + oh.show_frame = (int) pc->show_frame; + oh.type = (int)pc->frame_type; + oh.version = pc->version; + + mb_feature_data_bits = vp8_mb_feature_data_bits; + cx_data += 3; + +#if defined(SECTIONBITS_OUTPUT) + Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256; +#endif + + //vp8_kf_default_bmode_probs() is called in vp8_setup_key_frame() once for each + //K frame before encode frame. pc->kf_bmode_prob doesn't get changed anywhere + //else. No need to call it again here. --yw + //vp8_kf_default_bmode_probs( pc->kf_bmode_prob); + + // every keyframe send startcode, width, height, scale factor, clamp and color type + if (oh.type == KEY_FRAME) + { + int w, h, hs, vs; + + // Start / synch code + cx_data[0] = 0x9D; + cx_data[1] = 0x01; + cx_data[2] = 0x2a; + + *((unsigned short *)(cx_data + 3)) = make_endian_16((pc->horiz_scale << 14) | pc->Width); + *((unsigned short *)(cx_data + 5)) = make_endian_16((pc->vert_scale << 14) | pc->Height); + + extra_bytes_packed = 7; + cx_data += extra_bytes_packed ; + + vp8_start_encode(bc, cx_data); + + // signal clr type + vp8_write_bit(bc, pc->clr_type); + vp8_write_bit(bc, pc->clamp_type); + + } + else + vp8_start_encode(bc, cx_data); + + + // Signal whether or not Segmentation is enabled + vp8_write_bit(bc, (xd->segmentation_enabled) ? 
1 : 0); + + // Indicate which features are enabled + if (xd->segmentation_enabled) + { + // Signal whether or not the segmentation map is being updated. + vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0); + vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0); + + if (xd->update_mb_segmentation_data) + { + signed char Data; + + vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0); + + // For each segmentation feature (Quant and loop filter level) + for (i = 0; i < MB_LVL_MAX; i++) + { + // For each of the segments + for (j = 0; j < MAX_MB_SEGMENTS; j++) + { + Data = xd->segment_feature_data[i][j]; + + // Frame level data + if (Data) + { + vp8_write_bit(bc, 1); + + if (Data < 0) + { + Data = - Data; + vp8_write_literal(bc, Data, mb_feature_data_bits[i]); + vp8_write_bit(bc, 1); + } + else + { + vp8_write_literal(bc, Data, mb_feature_data_bits[i]); + vp8_write_bit(bc, 0); + } + } + else + vp8_write_bit(bc, 0); + } + } + } + + if (xd->update_mb_segmentation_map) + { + // Write the probs used to decode the segment id for each macro block. + for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) + { + int Data = xd->mb_segment_tree_probs[i]; + + if (Data != 255) + { + vp8_write_bit(bc, 1); + vp8_write_literal(bc, Data, 8); + } + else + vp8_write_bit(bc, 0); + } + } + } + + // Code to determine whether or not to update the scan order. + vp8_write_bit(bc, pc->filter_type); + vp8_write_literal(bc, pc->filter_level, 6); + vp8_write_literal(bc, pc->sharpness_level, 3); + + // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled). + vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0); + + if (xd->mode_ref_lf_delta_enabled) + { + // Do the deltas need to be updated + vp8_write_bit(bc, (xd->mode_ref_lf_delta_update) ? 
1 : 0); + + if (xd->mode_ref_lf_delta_update) + { + int Data; + + // Send update + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + { + Data = xd->ref_lf_deltas[i]; + + // Frame level data + if (Data) + { + vp8_write_bit(bc, 1); + + if (Data > 0) + { + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 0); // sign + } + else + { + Data = -Data; + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 1); // sign + } + } + else + vp8_write_bit(bc, 0); + } + + // Send update + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + { + Data = xd->mode_lf_deltas[i]; + + if (Data) + { + vp8_write_bit(bc, 1); + + if (Data > 0) + { + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 0); // sign + } + else + { + Data = -Data; + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 1); // sign + } + } + else + vp8_write_bit(bc, 0); + } + } + } + + //signal here is multi token partition is enabled + vp8_write_literal(bc, pc->multi_token_partition, 2); + + // Frame Qbaseline quantizer index + vp8_write_literal(bc, pc->base_qindex, 7); + + // Transmit Dc, Second order and Uv quantizer delta information + put_delta_q(bc, pc->y1dc_delta_q); + put_delta_q(bc, pc->y2dc_delta_q); + put_delta_q(bc, pc->y2ac_delta_q); + put_delta_q(bc, pc->uvdc_delta_q); + put_delta_q(bc, pc->uvac_delta_q); + + // When there is a key frame all reference buffers are updated using the new key frame + if (pc->frame_type != KEY_FRAME) + { + // Should the GF or ARF be updated using the transmitted frame or buffer + vp8_write_bit(bc, pc->refresh_golden_frame); + vp8_write_bit(bc, pc->refresh_alt_ref_frame); + + // If not being updated from current frame should either GF or ARF be updated from another buffer + if (!pc->refresh_golden_frame) + vp8_write_literal(bc, pc->copy_buffer_to_gf, 2); + + if (!pc->refresh_alt_ref_frame) + vp8_write_literal(bc, pc->copy_buffer_to_arf, 2); + + // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer) + vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]); + vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]); + } + + vp8_write_bit(bc, pc->refresh_entropy_probs); + + if (pc->frame_type != KEY_FRAME) + vp8_write_bit(bc, pc->refresh_last_frame); + +#ifdef ENTROPY_STATS + + if (pc->frame_type == INTER_FRAME) + active_section = 0; + else + active_section = 7; + +#endif + + vp8_clear_system_state(); //__asm emms; + + //************************************************ + // save a copy for later refresh + { + vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc)); + } + + update_coef_probs(cpi); + +#ifdef ENTROPY_STATS + active_section = 2; +#endif + + // Write out the mb_no_coeff_skip flag + vp8_write_bit(bc, pc->mb_no_coeff_skip); + + if (pc->frame_type == KEY_FRAME) + { + write_kfmodes(cpi); + +#ifdef ENTROPY_STATS + active_section = 8; +#endif + } + else + { + pack_inter_mode_mvs(cpi); + +#ifdef ENTROPY_STATS + active_section = 1; +#endif + } + + vp8_stop_encode(bc); + + + if (pc->multi_token_partition != ONE_PARTITION) + { + int num_part; + int asize; + num_part = 1 << pc->multi_token_partition; + + pack_tokens_into_partitions(cpi, cx_data + bc->pos, num_part, &asize); + + oh.first_partition_length_in_bytes = cpi->bc.pos; + + *size = cpi->bc.pos + VP8_HEADER_SIZE + asize + extra_bytes_packed; + } + else + { + vp8_start_encode(&cpi->bc2, cx_data + bc->pos); + + if (!cpi->b_multi_threaded) + pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count); + else + pack_mb_row_tokens(cpi, &cpi->bc2); + + vp8_stop_encode(&cpi->bc2); + 
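/* Single-partition layout: the token data packed into bc2 sits immediately
   after the first (mode and motion vector) partition, so the total frame size
   is uncompressed header + first partition + token partition, and the frame
   header only needs to record the first partition's length. */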
oh.first_partition_length_in_bytes = cpi->bc.pos ; + *size = cpi->bc2.pos + cpi->bc.pos + VP8_HEADER_SIZE + extra_bytes_packed; + } + +#if CONFIG_BIG_ENDIAN + { + int v = (oh.first_partition_length_in_bytes << 5) | + (oh.show_frame << 4) | + (oh.version << 1) | + oh.type; + + v = make_endian_32(v); + vpx_memcpy(dest, &v, 3); + } +#else + vpx_memcpy(dest, &oh, 3); +#endif +} + +#ifdef ENTROPY_STATS +void print_tree_update_probs() +{ + int i, j, k, l; + FILE *f = fopen("context.c", "a"); + int Sum; + fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n"); + fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] = {\n"); + + for (i = 0; i < BLOCK_TYPES; i++) + { + fprintf(f, " { \n"); + + for (j = 0; j < COEF_BANDS; j++) + { + fprintf(f, " {\n"); + + for (k = 0; k < PREV_COEF_CONTEXTS; k++) + { + fprintf(f, " {"); + + for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) + { + Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1]; + + if (Sum > 0) + { + if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0) + fprintf(f, "%3ld, ", (tree_update_hist[i][j][k][l][0] * 255) / Sum); + else + fprintf(f, "%3ld, ", 1); + } + else + fprintf(f, "%3ld, ", 128); + } + + fprintf(f, "},\n"); + } + + fprintf(f, " },\n"); + } + + fprintf(f, " },\n"); + } + + fprintf(f, "};\n"); + fclose(f); +} +#endif diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h new file mode 100644 index 000000000..ee69f66e4 --- /dev/null +++ b/vp8/encoder/bitstream.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_BITSTREAM_H +#define __INC_BITSTREAM_H + +#if HAVE_ARMV7 +void vp8cx_pack_tokens_armv7(vp8_writer *w, const TOKENEXTRA *p, int xcount, + vp8_token *, + vp8_extra_bit_struct *, + const vp8_tree_index *); +void vp8cx_pack_tokens_into_partitions_armv7(VP8_COMP *, unsigned char *, int , int *, + vp8_token *, + vp8_extra_bit_struct *, + const vp8_tree_index *); +void vp8cx_pack_mb_row_tokens_armv7(VP8_COMP *cpi, vp8_writer *w, + vp8_token *, + vp8_extra_bit_struct *, + const vp8_tree_index *); +# define pack_tokens(a,b,c) \ + vp8cx_pack_tokens_armv7(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) +# define pack_tokens_into_partitions(a,b,c,d) \ + vp8cx_pack_tokens_into_partitions_armv7(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) +# define pack_mb_row_tokens(a,b) \ + vp8cx_pack_mb_row_tokens_armv7(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree) +#else +# define pack_tokens(a,b,c) pack_tokens_c(a,b,c) +# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d) +# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b) +#endif +#endif diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h new file mode 100644 index 000000000..cc4cbe067 --- /dev/null +++ b/vp8/encoder/block.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_BLOCK_H +#define __INC_BLOCK_H + +#include "onyx.h" +#include "blockd.h" +#include "entropymv.h" +#include "entropy.h" +#include "vpx_ports/mem.h" + +// motion search site +typedef struct +{ + MV mv; + int offset; +} search_site; + +typedef struct +{ + // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries + short *src_diff; + short *coeff; + + // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries + short(*quant)[4]; + short(*zbin)[4]; + short(*zrun_zbin_boost); + short(*round)[4]; + + // Zbin Over Quant value + short zbin_extra; + + unsigned char **base_src; + int src; + int src_stride; + +// MV enc_mv; + int force_empty; + +} BLOCK; + +typedef struct +{ + DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y + DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y + + // 16 Y blocks, 4 U blocks, 4 V blocks, 1 DC 2nd order block each with 16 entries + BLOCK block[25]; + + YV12_BUFFER_CONFIG src; + + MACROBLOCKD e_mbd; + + search_site *ss; + int ss_count; + int searches_per_step; + + int errorperbit; + int sadperbit16; + int sadperbit4; + int errthresh; + int rddiv; + int rdmult; + + int mvcosts[2][MVvals+1]; + int *mvcost[2]; + int mvsadcosts[2][MVvals+1]; + int *mvsadcost[2]; + int mbmode_cost[2][MB_MODE_COUNT]; + int intra_uv_mode_cost[2][MB_MODE_COUNT]; + unsigned int bmode_costs[10][10][10]; + unsigned int inter_bmode_costs[B_MODE_COUNT]; + + // These define limits to motion vector components to prevent them from extending outside the UMV borders + int mv_col_min; + int mv_col_max; + int mv_row_min; + int mv_row_max; + + int vector_range; // Used to monitor limiting range of recent vectors to guide search. + int skip; + + int encode_breakout; + + unsigned char *active_ptr; + MV_CONTEXT *mvc; + + unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]; + int optimize; + + void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + void (*short_fdct4x4rd)(short *input, short *output, int pitch); + void (*short_fdct8x4rd)(short *input, short *output, int pitch); + void (*vp8_short_fdct4x4_ptr)(short *input, short *output, int pitch); + void (*short_walsh4x4)(short *input, short *output, int pitch); + + void (*quantize_b)(BLOCK *b, BLOCKD *d); + void (*quantize_brd)(BLOCK *b, BLOCKD *d); + + + +} MACROBLOCK; + + +#endif diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c new file mode 100644 index 000000000..c101384d9 --- /dev/null +++ b/vp8/encoder/boolhuff.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "boolhuff.h" +#include "blockd.h" + + + +#if defined(SECTIONBITS_OUTPUT) +unsigned __int64 Sectionbits[500]; + +#endif + +#ifdef ENTROPY_STATS +unsigned int active_section = 0; +#endif + +const unsigned int vp8_prob_cost[256] = +{ + 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, + 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, + 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, + 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, + 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, + 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, + 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, + 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, + 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, + 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, + 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, + 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, + 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, + 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, + 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 +}; + +void vp8_start_encode(BOOL_CODER *br, unsigned char *source) +{ + + br->lowvalue = 0; + br->range = 255; + br->value = 0; + br->count = -24; + br->buffer = source; + br->pos = 0; +} + +void vp8_stop_encode(BOOL_CODER *br) +{ + int i; + + for (i = 0; i < 32; i++) + vp8_encode_bool(br, 0, 128); +} + +DECLARE_ALIGNED(16, static const unsigned int, norm[256]) = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) +{ + unsigned int split; + int count = br->count; + unsigned int range = br->range; + unsigned int lowvalue = br->lowvalue; + register unsigned int shift; + +#ifdef ENTROPY_STATS +#if defined(SECTIONBITS_OUTPUT) + + if (bit) + Sectionbits[active_section] += vp8_prob_cost[255-probability]; + else + Sectionbits[active_section] += vp8_prob_cost[probability]; + +#endif +#endif + + split = 1 + (((range - 1) * probability) >> 8); + + range = split; + + if (bit) + { + lowvalue += split; + range = br->range - split; + } + + shift = norm[range]; + + range <<= shift; + count += shift; + + if (count >= 0) + { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) + { + int x = br->pos - 1; + + while (x >= 0 && br->buffer[x] == 0xff) + { + br->buffer[x] = (unsigned char)0; + x--; + } + + 
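/* Carry propagation: any run of trailing 0xff bytes has just been rolled over
   to 0x00 above, so the carry is absorbed by the first earlier byte that is
   not 0xff. */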
br->buffer[x] += 1; + } + + br->buffer[br->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8 ; + } + + lowvalue <<= shift; + br->count = count; + br->lowvalue = lowvalue; + br->range = range; +} + +void vp8_encode_value(BOOL_CODER *br, int data, int bits) +{ + int bit; + + for (bit = bits - 1; bit >= 0; bit--) + vp8_encode_bool(br, (1 & (data >> bit)), 0x80); + +} diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h new file mode 100644 index 000000000..0d929f067 --- /dev/null +++ b/vp8/encoder/boolhuff.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : boolhuff.h +* +* Description : Bool Coder header file. +* +****************************************************************************/ +#ifndef __INC_BOOLHUFF_H +#define __INC_BOOLHUFF_H + + +typedef struct +{ + unsigned int lowvalue; + unsigned int range; + unsigned int value; + int count; + unsigned int pos; + unsigned char *buffer; + + // Variables used to track bit costs without outputing to the bitstream + unsigned int measure_cost; + unsigned long bit_counter; +} BOOL_CODER; + +extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer); +extern void vp8_encode_bool(BOOL_CODER *bc, int x, int context); +extern void vp8_encode_value(BOOL_CODER *br, int data, int bits); +extern void vp8_stop_encode(BOOL_CODER *bc); +extern const unsigned int vp8_prob_cost[256]; + +#endif diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c new file mode 100644 index 000000000..5207e39c4 --- /dev/null +++ b/vp8/encoder/dct.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <math.h> + + +static const short dct_matrix2[4][4] = +{ + { 23170, 30274, 23170, 12540 }, + { 23170, 12540, -23170, -30274 }, + { 23170, -12540, -23170, 30274 }, + { 23170, -30274, 23170, -12540 } +}; + +static const short dct_matrix1[4][4] = +{ + { 23170, 23170, 23170, 23170 }, + { 30274, 12540, -12540, -30274 }, + { 23170, -23170, -23170, 23170 }, + { 12540, -30274, 30274, -12540 } +}; + + +#define _1STSTAGESHIFT 14 +#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1)) +#define _2NDSTAGESHIFT 16 +#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1)) + +// using matrix multiply +void vp8_short_fdct4x4_c(short *input, short *output, int pitch) +{ + int i, j, k; + short temp[4][4]; + int sumtemp; + pitch >>= 1; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + sumtemp = 0; + + for (k = 0; k < 4; k++) + { + sumtemp += input[i*pitch+k] * dct_matrix2[k][j]; + + } + + temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT); + } + } + + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + sumtemp = 0; + + for (k = 0; k < 4; k++) + { + sumtemp += dct_matrix1[i][ k] * temp[k][ j]; + } + + output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT); + } + } + +} + + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_c(input, output, pitch); + vp8_short_fdct4x4_c(input + 4, output + 16, pitch); +} + + +static const signed short x_c1 = 60547; +static const signed short x_c2 = 46341; +static const signed short x_c3 = 25080; + +void vp8_fast_fdct4x4_c(short *input, short *output, int pitch) +{ + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + + short *op = output; + int temp1, temp2; + + for (i = 0; i < 4; i++) + { + a1 = (ip[0] + ip[3]) * 2; + b1 = (ip[1] + ip[2]) * 2; + c1 = (ip[1] - ip[2]) * 2; + d1 = (ip[0] - ip[3]) * 2; + + temp1 = a1 + b1; + temp2 = a1 - b1; + + op[0] = ((temp1 * x_c2) >> 16) + temp1; + op[2] = ((temp2 * x_c2) >> 16) + temp2; + + temp1 = (c1 * x_c3) >> 16; + temp2 = ((d1 * x_c1) >> 16) + d1; + + op[1] = temp1 + temp2; + + temp1 = (d1 * x_c3) >> 16; + temp2 = ((c1 * x_c1) >> 16) + c1; + + op[3] = temp1 - temp2; + + ip += pitch / 2; + op += 4; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + + temp1 = a1 + b1; + temp2 = a1 - b1; + + a2 = ((temp1 * x_c2) >> 16) + temp1; + c2 = ((temp2 * x_c2) >> 16) + temp2; + + temp1 = (c1 * x_c3) >> 16; + temp2 = ((d1 * x_c1) >> 16) + d1; + + b2 = temp1 + temp2; + + temp1 = (d1 * x_c3) >> 16; + temp2 = ((c1 * x_c1) >> 16) + c1; + + d2 = temp1 - temp2; + + + op[0] = (a2 + 1) >> 1; + op[4] = (b2 + 1) >> 1; + op[8] = (c2 + 1) >> 1; + op[12] = (d2 + 1) >> 1; + + ip++; + op++; + } +} + +void vp8_fast_fdct8x4_c(short *input, short *output, int pitch) +{ + vp8_fast_fdct4x4_c(input, output, pitch); + vp8_fast_fdct4x4_c(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch) +{ + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[3]; + b1 = ip[1] + ip[2]; + c1 = ip[1] - ip[2]; + d1 = ip[0] - ip[3]; + + op[0] = a1 + b1; + op[1] = c1 + d1; + op[2] = a1 - b1; + op[3] = d1 - c1; + ip += pitch / 2; + op += 4; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + a2 = a1 + b1; + 
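/* Second (column) pass of the Walsh transform; the (x > 0) adjustments below
   bias positive results up by one before the final >> 1. */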
b2 = c1 + d1; + c2 = a1 - b1; + d2 = d1 - c1; + + a2 += (a2 > 0); + b2 += (b2 > 0); + c2 += (c2 > 0); + d2 += (d2 > 0); + + op[0] = (a2) >> 1; + op[4] = (b2) >> 1; + op[8] = (c2) >> 1; + op[12] = (d2) >> 1; + + ip++; + op++; + } +} diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h new file mode 100644 index 000000000..fb307cfb3 --- /dev/null +++ b/vp8/encoder/dct.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_DCT_H +#define __INC_DCT_H + +#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/dct_x86.h" +#endif + +#if ARCH_ARM +#include "arm/dct_arm.h" +#endif + +#ifndef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_c +#endif +extern prototype_fdct(vp8_fdct_short4x4); + +#ifndef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_c +#endif +extern prototype_fdct(vp8_fdct_short8x4); + +#ifndef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c +#endif +extern prototype_fdct(vp8_fdct_fast4x4); + +#ifndef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c +#endif +extern prototype_fdct(vp8_fdct_fast8x4); + +#ifndef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c +#endif +extern prototype_fdct(vp8_fdct_walsh_short4x4); + +typedef prototype_fdct(*vp8_fdct_fn_t); +typedef struct +{ + vp8_fdct_fn_t short4x4; + vp8_fdct_fn_t short8x4; + vp8_fdct_fn_t fast4x4; + vp8_fdct_fn_t fast8x4; + vp8_fdct_fn_t walsh_short4x4; +} vp8_fdct_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define FDCT_INVOKE(ctx,fn) (ctx)->fn +#else +#define FDCT_INVOKE(ctx,fn) vp8_fdct_##fn +#endif + +#endif diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c new file mode 100644 index 000000000..a4e377220 --- /dev/null +++ b/vp8/encoder/encodeframe.c @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "encodemb.h" +#include "encodemv.h" +#include "common.h" +#include "onyx_int.h" +#include "extend.h" +#include "entropymode.h" +#include "quant_common.h" +#include "segmentation_common.h" +#include "setupintrarecon.h" +#include "encodeintra.h" +#include "reconinter.h" +#include "rdopt.h" +#include "pickinter.h" +#include "findnearmv.h" +#include "reconintra.h" +#include <stdio.h> +#include <limits.h> +#include "subpixel.h" +#include "vpx_ports/vpx_timer.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#define RTCD(x) &cpi->common.rtcd.x +#define IF_RTCD(x) (x) +#else +#define RTCD(x) NULL +#define IF_RTCD(x) NULL +#endif +extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ; + +extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +extern void vp8_auto_select_speed(VP8_COMP *cpi); +extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, + MACROBLOCK *x, + MB_ROW_COMP *mbr_ei, + int mb_row, + int count); +void vp8_build_block_offsets(MACROBLOCK *x); +void vp8_setup_block_ptrs(MACROBLOCK *x); +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); + +#ifdef MODE_STATS +unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +unsigned int inter_uv_modes[4] = {0, 0, 0, 0}; +unsigned int inter_b_modes[15] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +unsigned int y_modes[5] = {0, 0, 0, 0, 0}; +unsigned int uv_modes[4] = {0, 0, 0, 0}; +unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +#endif + +// The first four entries are dummy values +static const int qrounding_factors[129] = +{ + 56, 56, 56, 56, 56, 56, 56, 56, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, +}; + +static const int qzbin_factors[129] = +{ + 64, 64, 64, 64, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, +}; + +void vp8cx_init_quantizer(VP8_COMP *cpi) +{ + int r, c; + int i; + int quant_val; + int Q; + + int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44}; + + for (Q = 0; Q < QINDEX_RANGE; Q++) + { + // dc values + quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q); + cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val; + cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][0][0] = quant_val; + cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + quant_val = vp8_dc2quant(Q, 
cpi->common.y2dc_delta_q); + cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val; + cpi->Y2zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][0][0] = quant_val; + cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q); + cpi->UVquant[Q][0][0] = (1 << 16) / quant_val; + cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;; + cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][0][0] = quant_val; + cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + // all the ac values = ; + for (i = 1; i < 16; i++) + { + int rc = vp8_default_zig_zag1d[i]; + r = (rc >> 2); + c = (rc & 3); + + quant_val = vp8_ac_yquant(Q); + cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val; + cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][r][c] = quant_val; + cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7; + + quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); + cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val; + cpi->Y2zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][r][c] = quant_val; + cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7; + + quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); + cpi->UVquant[Q][r][c] = (1 << 16) / quant_val; + cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][r][c] = quant_val; + cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7; + } + } +} + +void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) +{ + int i; + int QIndex; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mbmi; + int zbin_extra; + + // Select the baseline MB Q index. + if (xd->segmentation_enabled) + { + // Abs Value + if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) + QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; + + // Delta Value + else + { + QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; + QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? 
QIndex : MAXQ) : 0; // Clamp to valid range + } + } + else + QIndex = cpi->common.base_qindex; + + // Y + zbin_extra = (cpi->common.Y1dequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7; + + for (i = 0; i < 16; i++) + { + x->block[i].quant = cpi->Y1quant[QIndex]; + x->block[i].zbin = cpi->Y1zbin[QIndex]; + x->block[i].round = cpi->Y1round[QIndex]; + x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex]; + x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex]; + x->block[i].zbin_extra = (short)zbin_extra; + } + + // UV + zbin_extra = (cpi->common.UVdequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7; + + for (i = 16; i < 24; i++) + { + x->block[i].quant = cpi->UVquant[QIndex]; + x->block[i].zbin = cpi->UVzbin[QIndex]; + x->block[i].round = cpi->UVround[QIndex]; + x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex]; + x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex]; + x->block[i].zbin_extra = (short)zbin_extra; + } + + // Y2 + zbin_extra = (cpi->common.Y2dequant[QIndex][0][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7; + x->block[24].quant = cpi->Y2quant[QIndex]; + x->block[24].zbin = cpi->Y2zbin[QIndex]; + x->block[24].round = cpi->Y2round[QIndex]; + x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex]; + x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex]; + x->block[24].zbin_extra = (short)zbin_extra; +} + +void vp8cx_frame_init_quantizer(VP8_COMP *cpi) +{ + // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called + // when these values are not all zero. + if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q) + { + vp8cx_init_quantizer(cpi); + } + + // MB level quantizer setup + vp8cx_mb_init_quantizer(cpi, &cpi->mb); +} + + + +static +void encode_mb_row(VP8_COMP *cpi, + VP8_COMMON *cm, + int mb_row, + MACROBLOCK *x, + MACROBLOCKD *xd, + TOKENEXTRA **tp, + int *segment_counts, + int *totalrate) +{ + int i; + int recon_yoffset, recon_uvoffset; + int mb_col; + int recon_y_stride = cm->last_frame.y_stride; + int recon_uv_stride = cm->last_frame.uv_stride; + int seg_map_index = (mb_row * cpi->common.mb_cols); + + + // reset above block coeffs + xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT]; + xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ]; + xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ]; + xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT]; + + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + cpi->tplist[mb_row].start = *tp; + //printf("Main mb_row = %d\n", mb_row); + + // for each macroblock col in image + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + // Distance of Mb to the various image edges. 
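/*
 * Editor's aside: an illustrative sketch, not part of this commit.  The
 * edge distances set just below are kept in 1/8-pel units ((mb * 16) << 3),
 * while the mv_col/mv_row limits are in whole pels and keep a 16x16
 * predictor inside the extended (UMV) border of VP8BORDERINPIXELS pixels
 * around the reconstructed frame.  The hypothetical helper restates the
 * column clamps for one macroblock position.
 */
static void example_mv_limits(int mb_col, int mb_cols, int *col_min, int *col_max)
{
    /* identical arithmetic to the assignments below, columns only */
    *col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
    *col_max = ((mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
}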
+ // These specified to 8th pel as they are always compared to values that are in 1/8th pel units + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; + + // Set up limit values for motion vectors used to prevent them extending outside the UMV borders + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + // Is segmentation enabled + // MB level adjutment to quantizer + if (xd->segmentation_enabled) + { + // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking) + if (cpi->segmentation_map[seg_map_index+mb_col] <= 3) + xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col]; + else + xd->mbmi.segment_id = 0; + + vp8cx_mb_init_quantizer(cpi, x); + } + else + xd->mbmi.segment_id = 0; // Set to Segment 0 by default + + x->active_ptr = cpi->active_map + seg_map_index + mb_col; + + if (cm->frame_type == KEY_FRAME) + { + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); +#ifdef MODE_STATS + y_modes[xd->mbmi.mode] ++; +#endif + } + else + { + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + +#ifdef MODE_STATS + inter_y_modes[xd->mbmi.mode] ++; + + if (xd->mbmi.mode == SPLITMV) + { + int b; + + for (b = 0; b < xd->mbmi.partition_count; b++) + { + inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++; + } + } + +#endif + + // Count of last ref frame 0,0 useage + if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME)) + cpi->inter_zz_count ++; + + // Special case code for cyclic refresh + // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode + // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map + if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled) + { + cpi->segmentation_map[seg_map_index+mb_col] = xd->mbmi.segment_id; + + // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh): + // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0) + // else mark it as dirty (1). 
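/*
 * Editor's aside: an illustrative sketch, not part of this commit.  The
 * cyclic-refresh bookkeeping below is a three-state map per macroblock:
 * -1 means just refreshed (clean), 0 means a candidate for refresh next
 * time, 1 means dirty.  The hypothetical helper collapses the if/else
 * chain that follows into a single function.
 */
static signed char example_cyclic_refresh_update(signed char map_entry,
                                                 int segment_id,
                                                 int coded_last_zeromv)
{
    if (segment_id)                /* block was refreshed this frame        */
        return -1;

    if (coded_last_zeromv)         /* still block predicted from last frame */
        return (map_entry == 1) ? 0 : map_entry;

    return 1;                      /* everything else is marked dirty       */
}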
+ if (xd->mbmi.segment_id) + cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1; + else if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME)) + { + if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1) + cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0; + } + else + cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1; + + } + } + + cpi->tplist[mb_row].stop = *tp; + + xd->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb + + // store macroblock mode info into context array + vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi)); + + for (i = 0; i < 16; i++) + vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi)); + + // adjust to the next column of macroblocks + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + // Keep track of segment useage + segment_counts[xd->mbmi.segment_id] ++; + + // skip to next mb + xd->mode_info_context++; + + xd->above_context[Y1CONTEXT] += 4; + xd->above_context[UCONTEXT ] += 2; + xd->above_context[VCONTEXT ] += 2; + xd->above_context[Y2CONTEXT] ++; + cpi->current_mb_col_main = mb_col; + } + + //extend the recon for intra prediction + vp8_extend_mb_row( + &cm->new_frame, + xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, + xd->dst.v_buffer + 8); + + // this is to account for the border + xd->mode_info_context++; +} + + + + + +void vp8_encode_frame(VP8_COMP *cpi) +{ + int mb_row; + MACROBLOCK *const x = & cpi->mb; + VP8_COMMON *const cm = & cpi->common; + MACROBLOCKD *const xd = & x->e_mbd; + + int i; + TOKENEXTRA *tp = cpi->tok; + int segment_counts[MAX_MB_SEGMENTS]; + int totalrate; + + if (cm->frame_type != KEY_FRAME) + { + if (cm->mcomp_filter_type == SIXTAP) + { + xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap4x4); + xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x4); + xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x8); + xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap16x16); + } + else + { + xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear4x4); + xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x4); + xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x8); + xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear16x16); + } + } + + //else // Key Frame + //{ + // For key frames make sure the intra ref frame probability value + // is set to "all intra" + //cpi->prob_intra_coded = 255; + //} + + + xd->gf_active_ptr = (signed char *)cm->gf_active_flags; // Point to base of GF active flags data structure + + x->vector_range = 32; + + // Count of MBs using the alternate Q if any + cpi->alt_qcount = 0; + + // Reset frame count of inter 0,0 motion vector useage. 
+ cpi->inter_zz_count = 0; + + vpx_memset(segment_counts, 0, sizeof(segment_counts)); + + cpi->prediction_error = 0; + cpi->intra_error = 0; + cpi->skip_true_count = 0; + cpi->skip_false_count = 0; + +#if 0 + // Experimental code + cpi->frame_distortion = 0; + cpi->last_mb_distortion = 0; +#endif + + totalrate = 0; + + xd->mode_info = cm->mi - 1; + + xd->mode_info_context = cm->mi; + xd->mode_info_stride = cm->mode_info_stride; + + xd->frame_type = cm->frame_type; + + xd->frames_since_golden = cm->frames_since_golden; + xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame; + vp8_zero(cpi->MVcount); + // vp8_zero( Contexts) + vp8_zero(cpi->coef_counts); + + // reset intra mode contexts + if (cm->frame_type == KEY_FRAME) + vp8_init_mbmode_probs(cm); + + + vp8cx_frame_init_quantizer(cpi); + + if (cpi->compressor_speed == 2) + { + if (cpi->oxcf.cpu_used < 0) + cpi->Speed = -(cpi->oxcf.cpu_used); + else + vp8_auto_select_speed(cpi); + } + + vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q)); + //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) ); + vp8cx_initialize_me_consts(cpi, cm->base_qindex); + //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex); + + // Copy data over into macro block data sturctures. + + x->src = * cpi->Source; + xd->pre = cm->last_frame; + xd->dst = cm->new_frame; + + // set up frame new frame for intra coded blocks + + vp8_setup_intra_recon(&cm->new_frame); + + vp8_build_block_offsets(x); + + vp8_setup_block_dptrs(&x->e_mbd); + + vp8_setup_block_ptrs(x); + + x->rddiv = cpi->RDDIV; + x->rdmult = cpi->RDMULT; + +#if 0 + // Experimental rd code + // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics + // such as cpi->rate_correction_factor that indicate relative complexity. 
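/*
 * Editor's aside: an illustrative sketch, not part of this commit.  The
 * per-Q tables built by vp8cx_init_quantizer()/vp8cx_mb_init_quantizer()
 * above (quant as a 16.16 reciprocal, zbin, round, zrun_zbin_boost and
 * zbin_extra) are consumed by the block quantizer in quantize.c, which is
 * not shown in this hunk.  The hypothetical helper below sketches how such
 * tables are typically applied to one coefficient, assuming a
 * reciprocal-multiply quantizer; treat it as a reading aid rather than the
 * project's actual quantizer.
 */
static int example_quantize_coeff(int z,     /* transform coefficient          */
                                  int zbin,  /* dead-zone threshold            */
                                  int round, /* rounding added before multiply */
                                  int quant) /* (1 << 16) / dequant value      */
{
    int x = (z < 0) ? -z : z;

    /* coefficients inside the dead zone are dropped to zero */
    if (x < zbin)
        return 0;

    /* otherwise divide by the dequant step via the 16.16 reciprocal */
    x = ((x + round) * quant) >> 16;

    return (z < 0) ? -x : x;
}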
+ /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) ) + { + //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb; + x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor); + } + else + x->rdmult = cpi->RDMULT; */ + //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 )); +#endif + + xd->mbmi.mode = DC_PRED; + xd->mbmi.uv_mode = DC_PRED; + + xd->left_context = cm->left_context; + + vp8_zero(cpi->count_mb_ref_frame_usage) + vp8_zero(cpi->ymode_count) + vp8_zero(cpi->uv_mode_count) + + x->mvc = cm->fc.mvc; + + // vp8_zero( entropy_stats) + { + ENTROPY_CONTEXT **p = cm->above_context; + const size_t L = cm->mb_cols; + + vp8_zero_array(p [Y1CONTEXT], L * 4) + vp8_zero_array(p [ UCONTEXT], L * 2) + vp8_zero_array(p [ VCONTEXT], L * 2) + vp8_zero_array(p [Y2CONTEXT], L) + } + + + { + struct vpx_usec_timer emr_timer; + vpx_usec_timer_start(&emr_timer); + + if (!cpi->b_multi_threaded) + { + // for each macroblock row in image + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + + vp8_zero(cm->left_context) + + encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); + + // adjust to the next row of mbs + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + } + + cpi->tok_count = tp - cpi->tok; + + } + else + { +#if CONFIG_MULTITHREAD + vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count); + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) + { + int i; + cpi->current_mb_col_main = -1; + + for (i = 0; i < cpi->encoding_thread_count; i++) + { + if ((mb_row + i + 1) >= cm->mb_rows) + break; + + cpi->mb_row_ei[i].mb_row = mb_row + i + 1; + cpi->mb_row_ei[i].tp = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24); + cpi->mb_row_ei[i].current_mb_col = -1; + //SetEvent(cpi->h_event_mbrencoding[i]); + sem_post(&cpi->h_event_mbrencoding[i]); + } + + vp8_zero(cm->left_context) + + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); + + encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); + + // adjust to the next row of mbs + x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols; + + xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count; + + if (mb_row < cm->mb_rows - 1) + //WaitForSingleObject(cpi->h_event_main, INFINITE); + sem_wait(&cpi->h_event_main); + } + + /* + for( ;mb_row<cm->mb_rows; mb_row ++) + { + vp8_zero( cm->left_context) + + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); + + encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); + // adjust to the next row of mbs + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + + } + */ + cpi->tok_count = 0; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++) + { + cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start; + } + + if (xd->segmentation_enabled) + { + + int i, j; + + if (xd->segmentation_enabled) + { + + for (i = 0; i < cpi->encoding_thread_count; i++) + { + for (j = 0; j < 4; j++) + 
segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j]; + } + } + + } + + for (i = 0; i < cpi->encoding_thread_count; i++) + { + totalrate += cpi->mb_row_ei[i].totalrate; + } + +#endif + + } + + vpx_usec_timer_mark(&emr_timer); + cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); + + } + + + // Work out the segment probabilites if segmentation is enabled + if (xd->segmentation_enabled) + { + int tot_count; + int i; + + // Set to defaults + vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs)); + + tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3]; + + if (tot_count) + { + xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count; + + tot_count = segment_counts[0] + segment_counts[1]; + + if (tot_count > 0) + { + xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count; + } + + tot_count = segment_counts[2] + segment_counts[3]; + + if (tot_count > 0) + xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count; + + // Zero probabilities not allowed + for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++) + { + if (xd->mb_segment_tree_probs[i] == 0) + xd->mb_segment_tree_probs[i] = 1; + } + } + } + + // 256 rate units to the bit + cpi->projected_frame_size = totalrate >> 8; // projected_frame_size in units of BYTES + + // Make a note of the percentage MBs coded Intra. + if (cm->frame_type == KEY_FRAME) + { + cpi->this_frame_percent_intra = 100; + } + else + { + int tot_modes; + + tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME] + + cpi->count_mb_ref_frame_usage[LAST_FRAME] + + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] + + cpi->count_mb_ref_frame_usage[ALTREF_FRAME]; + + if (tot_modes) + cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes; + + } + +#if 0 + { + int cnt = 0; + int flag[2] = {0, 0}; + + for (cnt = 0; cnt < MVPcount; cnt++) + { + if (cm->fc.pre_mvc[0][cnt] != cm->fc.mvc[0][cnt]) + { + flag[0] = 1; + vpx_memcpy(cm->fc.pre_mvc[0], cm->fc.mvc[0], MVPcount); + break; + } + } + + for (cnt = 0; cnt < MVPcount; cnt++) + { + if (cm->fc.pre_mvc[1][cnt] != cm->fc.mvc[1][cnt]) + { + flag[1] = 1; + vpx_memcpy(cm->fc.pre_mvc[1], cm->fc.mvc[1], MVPcount); + break; + } + } + + if (flag[0] || flag[1]) + vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag); + } +#endif + + // Adjust the projected reference frame useage probability numbers to reflect + // what we have just seen. This may be usefull when we make multiple itterations + // of the recode loop rather than continuing to use values from the previous frame. + if ((cm->frame_type != KEY_FRAME) && !cm->refresh_alt_ref_frame && !cm->refresh_golden_frame) + { + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + if ((rf_intra + rf_inter) > 0) + { + cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); + + if (cpi->prob_intra_coded < 1) + cpi->prob_intra_coded = 1; + + if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) + { + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + if (cpi->prob_last_coded < 1) + cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (cpi->prob_gf_coded < 1) + cpi->prob_gf_coded = 1; + } + } + } + +#if 0 + // Keep record of the total distortion this time around for future use + cpi->last_frame_distortion = cpi->frame_distortion; +#endif + +} +void vp8_setup_block_ptrs(MACROBLOCK *x) +{ + int r, c; + int i; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + x->block[r*4+c].src_diff = x->src_diff + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[16 + r*2+c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4; + } + } + + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[20 + r*2+c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4; + } + } + + x->block[24].src_diff = x->src_diff + 384; + + + for (i = 0; i < 25; i++) + { + x->block[i].coeff = x->coeff + i * 16; + } +} + +void vp8_build_block_offsets(MACROBLOCK *x) +{ + int block = 0; + int br, bc; + + vp8_build_block_doffsets(&x->e_mbd); + + // y blocks + for (br = 0; br < 4; br++) + { + for (bc = 0; bc < 4; bc++) + { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->src.y_buffer; + this_block->src_stride = x->src.y_stride; + this_block->src = 4 * br * this_block->src_stride + 4 * bc; + ++block; + } + } + + // u blocks + for (br = 0; br < 2; br++) + { + for (bc = 0; bc < 2; bc++) + { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->src.u_buffer; + this_block->src_stride = x->src.uv_stride; + this_block->src = 4 * br * this_block->src_stride + 4 * bc; + ++block; + } + } + + // v blocks + for (br = 0; br < 2; br++) + { + for (bc = 0; bc < 2; bc++) + { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->src.v_buffer; + this_block->src_stride = x->src.uv_stride; + this_block->src = 4 * br * this_block->src_stride + 4 * bc; + ++block; + } + } +} + +static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) +{ + const MACROBLOCKD *xd = & x->e_mbd; + const MB_PREDICTION_MODE m = xd->mbmi.mode; + const MB_PREDICTION_MODE uvm = xd->mbmi.uv_mode; + +#ifdef MODE_STATS + const int is_key = cpi->common.frame_type == KEY_FRAME; + + ++ (is_key ? uv_modes : inter_uv_modes)[uvm]; + + if (m == B_PRED) + { + unsigned int *const bct = is_key ? b_modes : inter_b_modes; + + int b = 0; + + do + { + ++ bct[xd->block[b].bmi.mode]; + } + while (++b < 16); + } + +#endif + + ++cpi->ymode_count[m]; + ++cpi->uv_mode_count[uvm]; + +} +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) +{ + int Error4x4, Error16x16, error_uv; + B_PREDICTION_MODE intra_bmodes[16]; + int rate4x4, rate16x16, rateuv; + int dist4x4, dist16x16, distuv; + int rate = 0; + int rate4x4_tokenonly = 0; + int rate16x16_tokenonly = 0; + int rateuv_tokenonly = 0; + int i; + + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->sf.RD || cpi->compressor_speed != 2) + { + Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4); + + //save the b modes for possible later use + for (i = 0; i < 16; i++) + intra_bmodes[i] = x->e_mbd.block[i].bmi.mode; + + Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16); + + error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv); + + x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + + vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); + rate += rateuv; + + if (Error4x4 < Error16x16) + { + rate += rate4x4; + x->e_mbd.mbmi.mode = B_PRED; + + // get back the intra block modes + for (i = 0; i < 16; i++) + x->e_mbd.block[i].bmi.mode = intra_bmodes[i]; + + vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x); + cpi->prediction_error += Error4x4 ; +#if 0 + // Experimental RD code + cpi->frame_distortion += dist4x4; +#endif + } + else + { + vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); + rate += rate16x16; + +#if 0 + // Experimental RD code + cpi->prediction_error += Error16x16; + cpi->frame_distortion += dist16x16; +#endif + } + + sum_intra_stats(cpi, x); + + vp8_tokenize_mb(cpi, &x->e_mbd, t); + } + else +#endif + { + + int rate2, distortion2; + MB_PREDICTION_MODE mode, best_mode = DC_PRED; + int this_rd; + Error16x16 = INT_MAX; + + for (mode = DC_PRED; mode <= TM_PRED; mode ++) + { + x->e_mbd.mbmi.mode = mode; + vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); + rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode]; + this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + + if (Error16x16 > this_rd) + { + Error16x16 = this_rd; + best_mode = mode; + } + } + + vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2); + + if (distortion2 == INT_MAX) + Error4x4 = INT_MAX; + else + Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + + x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; + + if (Error4x4 < Error16x16) + { + x->e_mbd.mbmi.mode = B_PRED; + vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x); + cpi->prediction_error += Error4x4; + } + else + { + x->e_mbd.mbmi.mode = best_mode; + vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); + cpi->prediction_error += Error16x16; + } + + vp8_pick_intra_mbuv_mode(x); + vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); + sum_intra_stats(cpi, x); + vp8_tokenize_mb(cpi, &x->e_mbd, t); + } + + return rate; +} +#ifdef SPEEDSTATS +extern int cnt_pm; +#endif + +extern void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x); + +int vp8cx_encode_inter_macroblock +( + VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int recon_yoffset, int recon_uvoffset +) +{ + MACROBLOCKD *const xd = &x->e_mbd; + int inter_error; + int intra_error = 0; + int rate; + int distortion; + + x->skip = 0; + + if (xd->segmentation_enabled) + x->encode_breakout = cpi->segment_encode_breakout[xd->mbmi.segment_id]; + else + x->encode_breakout = cpi->oxcf.encode_breakout; + +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->sf.RD) + { + inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); + } + else +#endif + inter_error = vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); + + + cpi->prediction_error += inter_error; + cpi->intra_error += intra_error; + +#if 0 + // Experimental RD code + cpi->frame_distortion += distortion; + cpi->last_mb_distortion = distortion; +#endif + + // MB level adjutment to quantizer setup + if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled) + { + // If cyclic update enabled + if (cpi->cyclic_refresh_mode_enabled) + { + // Clear segment_id back to 0 if not coded (last frame 0,0) + if ((xd->mbmi.segment_id == 1) && + ((xd->mbmi.ref_frame != LAST_FRAME) || (xd->mbmi.mode != ZEROMV))) + { + xd->mbmi.segment_id = 0; + } + } + + // Experimental code. 
Special case for gf and arf zeromv modes. Increase zbin size to supress noise + if (cpi->zbin_mode_boost_enabled) + { + if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame != LAST_FRAME)) + cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; + else + cpi->zbin_mode_boost = 0; + } + + vp8cx_mb_init_quantizer(cpi, x); + } + + cpi->count_mb_ref_frame_usage[xd->mbmi.ref_frame] ++; + + if (xd->mbmi.ref_frame == INTRA_FRAME) + { + x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; + + vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); + + if (xd->mbmi.mode == B_PRED) + { + vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x); + } + else + { + vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); + } + + sum_intra_stats(cpi, x); + } + else + { + MV best_ref_mv; + MV nearest, nearby; + int mdcounts[4]; + + vp8_find_near_mvs(xd, xd->mode_info_context, + &nearest, &nearby, &best_ref_mv, mdcounts, xd->mbmi.ref_frame, cpi->common.ref_frame_sign_bias); + + vp8_build_uvmvs(xd, cpi->common.full_pixel); + + // store motion vectors in our motion vector list + if (xd->mbmi.ref_frame == LAST_FRAME) + { + // Set up pointers for this macro block into the previous frame recon buffer + xd->pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset; + xd->pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset; + xd->pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset; + } + else if (xd->mbmi.ref_frame == GOLDEN_FRAME) + { + // Set up pointers for this macro block into the golden frame recon buffer + xd->pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset; + xd->pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset; + xd->pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset; + } + else + { + // Set up pointers for this macro block into the alternate reference frame recon buffer + xd->pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset; + xd->pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset; + xd->pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset; + } + + if (xd->mbmi.mode == SPLITMV) + { + int i; + + for (i = 0; i < 16; i++) + { + if (xd->block[i].bmi.mode == NEW4X4) + { + cpi->MVcount[0][mv_max+((xd->block[i].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++; + cpi->MVcount[1][mv_max+((xd->block[i].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++; + } + } + } + else if (xd->mbmi.mode == NEWMV) + { + cpi->MVcount[0][mv_max+((xd->block[0].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++; + cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++; + } + + if (!x->skip && !x->e_mbd.mbmi.force_no_skip) + { + vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x); + + // Clear mb_skip_coeff if mb_no_coeff_skip is not set + if (!cpi->common.mb_no_coeff_skip) + xd->mbmi.mb_skip_coeff = 0; + + } + else + vp8_stuff_inter16x16(x); + } + + if (!x->skip) + vp8_tokenize_mb(cpi, xd, t); + else + { + if (cpi->common.mb_no_coeff_skip) + { + if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV) + xd->mbmi.dc_diff = 0; + else + xd->mbmi.dc_diff = 1; + + xd->mbmi.mb_skip_coeff = 1; + cpi->skip_true_count ++; + vp8_fix_contexts(cpi, xd); + } + else + { + vp8_stuff_mb(cpi, xd, t); + xd->mbmi.mb_skip_coeff = 0; + cpi->skip_false_count ++; + } + } + + return rate; +} diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c new file mode 100644 index 000000000..403d0204a --- /dev/null +++ b/vp8/encoder/encodeintra.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "idct.h" +#include "quantize.h" +#include "reconintra.h" +#include "reconintra4x4.h" +#include "encodemb.h" +#include "invtrans.h" +#include "recon.h" +#include "dct.h" +#include "g_common.h" +#include "encodeintra.h" + +#define intra4x4ibias_rate 128 +#define intra4x4pbias_rate 256 + + +void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode) +{ + if (i < 12) + { + abmode[i+4] = best_mode; + } + + if ((i & 3) != 3) + { + lbmode[i+1] = best_mode; + } + +} +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif +void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode) +{ + vp8_predict_intra4x4(b, best_mode, b->predictor); + + ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); + + x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); + + x->quantize_b(be, b); + + x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob); + + vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32); + + RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} + +void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode) +{ + vp8_predict_intra4x4(b, best_mode, b->predictor); + + ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); + + x->short_fdct4x4rd(be->src_diff, be->coeff, 32); + + x->quantize_brd(be, b); + + x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob); + + IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32); + + RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); +} + +void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) +{ + int i; + + MACROBLOCKD *x = &mb->e_mbd; + vp8_intra_prediction_down_copy(x); + + for (i = 0; i < 16; i++) + { + BLOCK *be = &mb->block[i]; + BLOCKD *b = &x->block[i]; + + vp8_encode_intra4x4block(rtcd, mb, be, b, b->bmi.mode); + } + + return; +} + +void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + int b; + + vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); + + vp8_transform_intra_mby(x); + + vp8_quantize_mby(x); + +#if !(CONFIG_REALTIME_ONLY) +#if 1 + + if (x->optimize && x->rddiv > 1) + vp8_optimize_mby(x, rtcd); + +#endif +#endif + + vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd); + + // make sure block modes are set the way we want them for context updates + for (b = 0; b < 16; b++) + { + BLOCKD *d = &x->e_mbd.block[b]; + + switch (x->e_mbd.mbmi.mode) + { + + case DC_PRED: + d->bmi.mode = B_DC_PRED; + break; + case V_PRED: + d->bmi.mode = B_VE_PRED; + break; + case H_PRED: + d->bmi.mode = B_HE_PRED; + break; + case TM_PRED: + d->bmi.mode = B_TM_PRED; + break; + default: + d->bmi.mode = B_DC_PRED; + break; + + } + } +} + +void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + int b; + + vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, 
x->src.y_stride); + + vp8_transform_intra_mbyrd(x); + + x->e_mbd.mbmi.mb_skip_coeff = 1; + + vp8_quantize_mbyrd(x); + + + vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd); + + // make sure block modes are set the way we want them for context updates + for (b = 0; b < 16; b++) + { + BLOCKD *d = &x->e_mbd.block[b]; + + switch (x->e_mbd.mbmi.mode) + { + + case DC_PRED: + d->bmi.mode = B_DC_PRED; + break; + case V_PRED: + d->bmi.mode = B_VE_PRED; + break; + case H_PRED: + d->bmi.mode = B_HE_PRED; + break; + case TM_PRED: + d->bmi.mode = B_TM_PRED; + break; + default: + d->bmi.mode = B_DC_PRED; + break; + + } + } +} + +void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_intra_predictors_mbuv(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + + vp8_transform_mbuv(x); + + vp8_quantize_mbuv(x); + +#if !(CONFIG_REALTIME_ONLY) +#if 1 + + if (x->optimize && x->rddiv > 1) + vp8_optimize_mbuv(x, rtcd); + +#endif +#endif + + vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); +} + +void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_intra_predictors_mbuv(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + + vp8_transform_mbuvrd(x); + + vp8_quantize_mbuvrd(x); + + + + vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); +} diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h new file mode 100644 index 000000000..4a43ab275 --- /dev/null +++ b/vp8/encoder/encodeintra.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef _ENCODEINTRA_H_ +#define _ENCODEINTRA_H_ +#include "onyx_int.h" + +void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x); +void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x); +void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb); +void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode); +void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode); +void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode); +void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x); +void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x); + +#endif diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c new file mode 100644 index 000000000..d82513318 --- /dev/null +++ b/vp8/encoder/encodemb.c @@ -0,0 +1,1129 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "encodemb.h" +#include "reconinter.h" +#include "quantize.h" +#include "invtrans.h" +#include "recon.h" +#include "reconintra.h" +#include "dct.h" +#include "vpx_mem/vpx_mem.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif +void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *src_ptr = (*(be->base_src) + be->src); + short *diff_ptr = be->src_diff; + unsigned char *pred_ptr = bd->predictor; + int src_stride = be->src_stride; + + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + diff_ptr[c] = src_ptr[c] - pred_ptr[c]; + } + + diff_ptr += pitch; + pred_ptr += pitch; + src_ptr += src_stride; + } +} + +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +{ + short *udiff = diff + 256; + short *vdiff = diff + 320; + unsigned char *upred = pred + 256; + unsigned char *vpred = pred + 320; + + int r, c; + + for (r = 0; r < 8; r++) + { + for (c = 0; c < 8; c++) + { + udiff[c] = usrc[c] - upred[c]; + } + + udiff += 8; + upred += 8; + usrc += stride; + } + + for (r = 0; r < 8; r++) + { + for (c = 0; c < 8; c++) + { + vdiff[c] = vsrc[c] - vpred[c]; + } + + vdiff += 8; + vpred += 8; + vsrc += stride; + } +} + +void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) +{ + int r, c; + + for (r = 0; r < 16; r++) + { + for (c = 0; c < 16; c++) + { + diff[c] = src[c] - pred[c]; + } + + diff += 16; + pred += 16; + src += stride; + } +} + +static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); +} + +void vp8_build_dcblock(MACROBLOCK *x) +{ + short *src_diff_ptr = &x->src_diff[384]; + int i; + + for (i = 0; i < 16; i++) + { + src_diff_ptr[i] = x->coeff[i * 16]; + } +} + +void vp8_transform_mbuv(MACROBLOCK *x) +{ + int i; + + for (i = 16; i < 24; i += 2) + { + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } +} + +void vp8_transform_mbuvrd(MACROBLOCK *x) +{ + int i; + + for (i = 16; i < 24; i += 2) + { + x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } +} + +void vp8_transform_intra_mby(MACROBLOCK *x) +{ + int i; + + for (i = 0; i < 16; i += 2) + { + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + // build dc block from 16 y dc values + vp8_build_dcblock(x); + + // do 2nd order transform on the dc block + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + +} + +void vp8_transform_intra_mbyrd(MACROBLOCK *x) +{ + int i; + + for (i = 0; i < 16; i += 2) + { + x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + // build dc block from 16 y dc values + vp8_build_dcblock(x); + + // do 2nd order transform on the dc block + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); +} + +void vp8_transform_mb(MACROBLOCK *x) +{ + int i; + + for (i = 0; i < 16; i += 2) + { + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + // build dc block from 16 y dc values + if (x->e_mbd.mbmi.mode != SPLITMV) + vp8_build_dcblock(x); + + for (i = 16; i < 24; i += 2) + { + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } + + 
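/*
 * Editor's aside: an illustrative sketch, not part of this commit.  The
 * transform loops in this file walk the macroblock's 25 blocks in the
 * layout established by vp8_setup_block_ptrs() in encodeframe.c: blocks
 * 0-15 are 4x4 tiles of the 16x16 luma diff, 16-19 and 20-23 are the two
 * 8x8 chroma planes at offsets 256 and 320, and block 24 is the 4x4
 * second-order (Y2) block of luma DC terms at offset 384.  The hypothetical
 * helper reproduces that mapping.
 */
static int example_src_diff_offset(int b)
{
    if (b < 16)        /* Y: 4x4 blocks in a 16-short-wide plane */
        return (b >> 2) * 4 * 16 + (b & 3) * 4;

    if (b < 20)        /* U: 4x4 blocks in an 8-short-wide plane */
        return 256 + ((b - 16) >> 1) * 4 * 8 + ((b - 16) & 1) * 4;

    if (b < 24)        /* V: same layout as U */
        return 320 + ((b - 20) >> 1) * 4 * 8 + ((b - 20) & 1) * 4;

    return 384;        /* Y2: the 16 luma DC coefficients */
}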
// do 2nd order transform on the dc block + if (x->e_mbd.mbmi.mode != SPLITMV) + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + +} + +void vp8_transform_mby(MACROBLOCK *x) +{ + int i; + + for (i = 0; i < 16; i += 2) + { + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + // build dc block from 16 y dc values + if (x->e_mbd.mbmi.mode != SPLITMV) + { + vp8_build_dcblock(x); + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + } +} + +void vp8_transform_mbrd(MACROBLOCK *x) +{ + int i; + + for (i = 0; i < 16; i += 2) + { + x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + // build dc block from 16 y dc values + if (x->e_mbd.mbmi.mode != SPLITMV) + vp8_build_dcblock(x); + + for (i = 16; i < 24; i += 2) + { + x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } + + // do 2nd order transform on the dc block + if (x->e_mbd.mbmi.mode != SPLITMV) + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); +} + +void vp8_stuff_inter16x16(MACROBLOCK *x) +{ + vp8_build_inter_predictors_mb_s(&x->e_mbd); + /* + // recon = copy from predictors to destination + { + BLOCKD *b = &x->e_mbd.block[0]; + unsigned char *pred_ptr = b->predictor; + unsigned char *dst_ptr = *(b->base_dst) + b->dst; + int stride = b->dst_stride; + + int i; + for(i=0;i<16;i++) + vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16); + + b = &x->e_mbd.block[16]; + pred_ptr = b->predictor; + dst_ptr = *(b->base_dst) + b->dst; + stride = b->dst_stride; + + for(i=0;i<8;i++) + vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); + + b = &x->e_mbd.block[20]; + pred_ptr = b->predictor; + dst_ptr = *(b->base_dst) + b->dst; + stride = b->dst_stride; + + for(i=0;i<8;i++) + vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); + } + */ +} + +#if !(CONFIG_REALTIME_ONLY) +extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2]; +extern const TOKENEXTRA *vp8_dct_value_tokens_ptr; +extern int vp8_dct_value_cost[DCT_MAX_VALUE*2]; +extern int *vp8_dct_value_cost_ptr; + +static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) +{ + int c = !type; /* start at coef 0, unless Y with Y2 */ + int eob = b->eob; + int pt ; /* surrounding block/prev coef predictor */ + int cost = 0; + short *qcoeff_ptr = b->qcoeff; + + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + +# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] ) + + for (; c < eob; c++) + { + int v = QC(c); + int t = vp8_dct_value_tokens_ptr[v].Token; + cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t]; + cost += vp8_dct_value_cost_ptr[v]; + pt = vp8_prev_token_class[t]; + } + +# undef QC + + if (c < 16) + cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN]; + + return cost; +} + +static int mbycost_coeffs(MACROBLOCK *mb) +{ + int cost = 0; + int b; + TEMP_CONTEXT t; + int type = 0; + + MACROBLOCKD *x = &mb->e_mbd; + + vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4); + + if (x->mbmi.mode == SPLITMV) + type = 3; + + for (b = 0; b < 16; b++) + cost += cost_coeffs(mb, x->block + b, type, + t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + + return cost; +} + +#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) + +void vp8_optimize_b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd) +{ + BLOCK *b = &x->block[i]; + BLOCKD *bd = &x->e_mbd.block[i]; + short *dequant_ptr = &bd->dequant[0][0]; 
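/*
 * Editor's aside: an illustrative sketch, not part of this commit.  The
 * RDFUNC macro defined above combines a rate term (in the 1/256-bit units
 * noted by the "256 rate units to the bit" comment in encodeframe.c) with
 * a distortion term; its target_rd argument is unused in this version.
 * The hypothetical helper spells the macro out as a function.
 */
static int example_rd_cost(int rdmult, int rddiv, int rate, int distortion)
{
    /* same arithmetic as RDFUNC(rdmult, rddiv, rate, distortion, ...) */
    return ((128 + rate * rdmult) >> 8) + rddiv * distortion;
}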
+ int nzpos[16] = {0}; + short saved_qcoefs[16]; + short saved_dqcoefs[16]; + int baserate, baseerror, baserd; + int rate, error, thisrd; + int k; + int nzcoefcount = 0; + int nc, bestnc = 0; + int besteob; + + // count potential coefficient to be optimized + for (k = !type; k < 16; k++) + { + int qcoef = abs(bd->qcoeff[k]); + int coef = abs(b->coeff[k]); + int dq = dequant_ptr[k]; + + if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq)) + { + nzpos[nzcoefcount] = k; + nzcoefcount++; + } + } + + // if nothing here, do nothing for this block. + if (!nzcoefcount) + { + *a = *l = (bd->eob != !type); + return; + } + + // save a copy of quantized coefficients + vpx_memcpy(saved_qcoefs, bd->qcoeff, 32); + vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32); + + besteob = bd->eob; + baserate = cost_coeffs(x, bd, type, a, l); + baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2; + baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100); + + for (nc = 1; nc < (1 << nzcoefcount); nc++) + { + //reset coefficients + vpx_memcpy(bd->qcoeff, saved_qcoefs, 32); + vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32); + + for (k = 0; k < nzcoefcount; k++) + { + int pos = nzpos[k]; + + if ((nc & (1 << k))) + { + int cur_qcoef = bd->qcoeff[pos]; + + if (cur_qcoef < 0) + { + bd->qcoeff[pos]++; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + else + { + bd->qcoeff[pos]--; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + } + } + + { + int eob = -1; + int rc; + int m; + + for (m = 0; m < 16; m++) + { + rc = vp8_default_zig_zag1d[m]; + + if (bd->qcoeff[rc]) + eob = m; + } + + bd->eob = eob + 1; + } + + rate = cost_coeffs(x, bd, type, a, l); + error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2; + thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100); + + if (thisrd < baserd) + { + baserd = thisrd; + bestnc = nc; + besteob = bd->eob; + } + } + + //reset coefficients + vpx_memcpy(bd->qcoeff, saved_qcoefs, 32); + vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32); + + if (bestnc) + { + for (k = 0; k < nzcoefcount; k++) + { + int pos = nzpos[k]; + + if (bestnc & (1 << k)) + { + int cur_qcoef = bd->qcoeff[pos]; + + if (cur_qcoef < 0) + { + bd->qcoeff[pos]++; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + else + { + bd->qcoeff[pos]--; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + } + } + +#if 0 + { + int eob = -1; + int rc; + int m; + + for (m = 0; m < 16; m++) + { + rc = vp8_default_zig_zag1d[m]; + + if (bd->qcoeff[rc]) + eob = m; + } + + bd->eob = eob + 1; + } +#endif + } + +#if 1 + bd->eob = besteob; +#endif +#if 0 + { + int eob = -1; + int rc; + int m; + + for (m = 0; m < 16; m++) + { + rc = vp8_default_zig_zag1d[m]; + + if (bd->qcoeff[rc]) + eob = m; + } + + bd->eob = eob + 1; + } + +#endif + *a = *l = (bd->eob != !type); + return; +} + +void vp8_optimize_bplus(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd) +{ + BLOCK *b = &x->block[i]; + BLOCKD *bd = &x->e_mbd.block[i]; + short *dequant_ptr = &bd->dequant[0][0]; + int nzpos[16] = {0}; + short saved_qcoefs[16]; + short saved_dqcoefs[16]; + int baserate, baseerror, baserd; + int rate, error, thisrd; + int k; + int nzcoefcount = 0; + int nc, bestnc = 0; + int besteob; + + // count potential coefficient to be optimized + for (k = !type; k < 16; k++) + { + int qcoef = abs(bd->qcoeff[k]); + int coef = abs(b->coeff[k]); + int dq = dequant_ptr[k]; + + if (qcoef && (qcoef * dq < coef) && (coef < (qcoef * dq + dq))) + { 
+ nzpos[nzcoefcount] = k; + nzcoefcount++; + } + } + + // if nothing here, do nothing for this block. + if (!nzcoefcount) + { + //do not update context, we need do the other half. + //*a = *l = (bd->eob != !type); + return; + } + + // save a copy of quantized coefficients + vpx_memcpy(saved_qcoefs, bd->qcoeff, 32); + vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32); + + besteob = bd->eob; + baserate = cost_coeffs(x, bd, type, a, l); + baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2; + baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100); + + for (nc = 1; nc < (1 << nzcoefcount); nc++) + { + //reset coefficients + vpx_memcpy(bd->qcoeff, saved_qcoefs, 32); + vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32); + + for (k = 0; k < nzcoefcount; k++) + { + int pos = nzpos[k]; + + if ((nc & (1 << k))) + { + int cur_qcoef = bd->qcoeff[pos]; + + if (cur_qcoef < 0) + { + bd->qcoeff[pos]--; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + else + { + bd->qcoeff[pos]++; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + } + } + + { + int eob = -1; + int rc; + int m; + + for (m = 0; m < 16; m++) + { + rc = vp8_default_zig_zag1d[m]; + + if (bd->qcoeff[rc]) + eob = m; + } + + bd->eob = eob + 1; + } + + rate = cost_coeffs(x, bd, type, a, l); + error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2; + thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100); + + if (thisrd < baserd) + { + baserd = thisrd; + bestnc = nc; + besteob = bd->eob; + } + } + + //reset coefficients + vpx_memcpy(bd->qcoeff, saved_qcoefs, 32); + vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32); + + if (bestnc) + { + for (k = 0; k < nzcoefcount; k++) + { + int pos = nzpos[k]; + + if (bestnc & (1 << k)) + { + int cur_qcoef = bd->qcoeff[pos]; + + if (cur_qcoef < 0) + { + bd->qcoeff[pos]++; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + else + { + bd->qcoeff[pos]--; + bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos]; + } + } + } + } + + bd->eob = besteob; + //do not update context, we need do the other half. 
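/*
 * Editor's aside: an illustrative sketch, not part of this commit.
 * vp8_optimize_b() above nudges coefficients that the quantizer rounded up
 * (qcoef * dq > coef) one step toward zero, while vp8_optimize_bplus()
 * tries the opposite direction for coefficients that were rounded down;
 * after every trial both rescan for the end-of-block marker.  That rescan
 * is repeated in several places in this file; the hypothetical helper is
 * the same loop written once: eob is one past the last non-zero
 * coefficient in zig-zag order.
 */
static int example_recompute_eob(const short *qcoeff)
{
    int m, eob = -1;

    for (m = 0; m < 16; m++)
        if (qcoeff[vp8_default_zig_zag1d[m]])
            eob = m;

    return eob + 1;
}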
+ //*a = *l = (bd->eob != !type); + return; +} + +void vp8_optimize_y2b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd) +{ + + BLOCK *b = &x->block[i]; + BLOCKD *bd = &x->e_mbd.block[i]; + short *dequant_ptr = &bd->dequant[0][0]; + + int baserate, baseerror, baserd; + int rate, error, thisrd; + int k; + + if (bd->eob == 0) + return; + + baserate = cost_coeffs(x, bd, type, a, l); + baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4; + baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100); + + for (k = 0; k < 16; k++) + { + int cur_qcoef = bd->qcoeff[k]; + + if (!cur_qcoef) + continue; + + if (cur_qcoef < 0) + { + bd->qcoeff[k]++; + bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k]; + } + else + { + bd->qcoeff[k]--; + bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k]; + } + + if (bd->qcoeff[k] == 0) + { + int eob = -1; + int rc; + int l; + + for (l = 0; l < 16; l++) + { + rc = vp8_default_zig_zag1d[l]; + + if (bd->qcoeff[rc]) + eob = l; + } + + bd->eob = eob + 1; + } + + rate = cost_coeffs(x, bd, type, a, l); + error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4; + thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100); + + if (thisrd > baserd) + { + bd->qcoeff[k] = cur_qcoef; + bd->dqcoeff[k] = cur_qcoef * dequant_ptr[k]; + } + else + { + baserd = thisrd; + } + + } + + { + int eob = -1; + int rc; + + for (k = 0; k < 16; k++) + { + rc = vp8_default_zig_zag1d[k]; + + if (bd->qcoeff[rc]) + eob = k; + } + + bd->eob = eob + 1; + } + + return; +} + + +void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) +{ + int cost = 0; + int b; + TEMP_CONTEXT t, t2; + int type = 0; + + vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4); + + if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED) + type = 3; + + for (b = 0; b < 16; b++) + { + //vp8_optimize_bplus(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd); + } + + vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2); + vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2); + + for (b = 16; b < 20; b++) + { + //vp8_optimize_bplus(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd); + } + + for (b = 20; b < 24; b++) + { + //vp8_optimize_bplus(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]); + vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd); + } +} + + + +void vp8_super_slow_yquant_optimization(MACROBLOCK *x, int type, const VP8_ENCODER_RTCD *rtcd) +{ + BLOCK *b = &x->block[0]; + BLOCKD *bd = &x->e_mbd.block[0]; + short *dequant_ptr = &bd->dequant[0][0]; + struct + { + int block; + int pos; + } nzpos[256]; + short saved_qcoefs[256]; + short saved_dqcoefs[256]; + short *coef_ptr = x->coeff; + short *qcoef_ptr = x->e_mbd.qcoeff; + short *dqcoef_ptr = x->e_mbd.dqcoeff; + + int baserate, baseerror, baserd; + int rate, error, thisrd; + int i, k; + int nzcoefcount = 0; + int nc, bestnc = 0; + int besteob; + + //this code has assumption in macroblock coeff buffer layout + for (i = 0; i < 16; i++) + { + // count potential coefficient to be optimized + for (k = !type; k < 16; k++) + { + int qcoef = 
abs(qcoef_ptr[i*16 + k]); + int coef = abs(coef_ptr[i*16 + k]); + int dq = dequant_ptr[k]; + + if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq)) + { + nzpos[nzcoefcount].block = i; + nzpos[nzcoefcount].pos = k; + nzcoefcount++; + } + } + } + + // if nothing here, do nothing for this macro_block. + if (!nzcoefcount || nzcoefcount > 15) + { + return; + } + + /****************************************************************************** + looking from each coeffient's perspective, each identifed coefficent above could + have 2 values:roundeddown(x) and roundedup(x). Therefore the total number of + different states is less than 2**nzcoefcount. + ******************************************************************************/ + // save the qunatized coefficents and dequantized coefficicents + vpx_memcpy(saved_qcoefs, x->e_mbd.qcoeff, 256); + vpx_memcpy(saved_dqcoefs, x->e_mbd.dqcoeff, 256); + + baserate = mbycost_coeffs(x); + baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type); + baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100); + + for (nc = 1; nc < (1 << nzcoefcount); nc++) + { + //reset coefficients + vpx_memcpy(x->e_mbd.qcoeff, saved_qcoefs, 256); + vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256); + + for (k = 0; k < nzcoefcount; k++) + { + int bk = nzpos[k].block; + int pos = nzpos[k].pos; + int mbkpos = bk * 16 + pos; + + if ((nc & (1 << k))) + { + int cur_qcoef = x->e_mbd.qcoeff[mbkpos]; + + if (cur_qcoef < 0) + { + x->e_mbd.qcoeff[mbkpos]++; + x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos]; + } + else + { + x->e_mbd.qcoeff[mbkpos]--; + x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos]; + } + } + } + + for (i = 0; i < 16; i++) + { + BLOCKD *bd = &x->e_mbd.block[i]; + { + int eob = -1; + int rc; + int l; + + for (l = 0; l < 16; l++) + { + rc = vp8_default_zig_zag1d[l]; + + if (bd->qcoeff[rc]) + eob = l; + } + + bd->eob = eob + 1; + } + } + + rate = mbycost_coeffs(x); + error = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type);; + thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100); + + if (thisrd < baserd) + { + baserd = thisrd; + bestnc = nc; + besteob = bd->eob; + } + } + + //reset coefficients + vpx_memcpy(x->e_mbd.qcoeff, saved_qcoefs, 256); + vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256); + + if (bestnc) + { + for (k = 0; k < nzcoefcount; k++) + { + int bk = nzpos[k].block; + int pos = nzpos[k].pos; + int mbkpos = bk * 16 + pos; + + if ((nc & (1 << k))) + { + int cur_qcoef = x->e_mbd.qcoeff[mbkpos]; + + if (cur_qcoef < 0) + { + x->e_mbd.qcoeff[mbkpos]++; + x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos]; + } + else + { + x->e_mbd.qcoeff[mbkpos]--; + x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos]; + } + } + } + } + + for (i = 0; i < 16; i++) + { + BLOCKD *bd = &x->e_mbd.block[i]; + { + int eob = -1; + int rc; + int l; + + for (l = 0; l < 16; l++) + { + rc = vp8_default_zig_zag1d[l]; + + if (bd->qcoeff[rc]) + eob = l; + } + + bd->eob = eob + 1; + } + } + + return; +} + +static void vp8_find_mb_skip_coef(MACROBLOCK *x) +{ + int i; + + x->e_mbd.mbmi.mb_skip_coeff = 1; + + if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV) + { + for (i = 0; i < 16; i++) + { + x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2); + } + + for (i = 16; i < 25; i++) + { + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } + else + { + for (i = 0; i < 24; i++) + { + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } +} + + +void 
vp8_optimize_mb_slow(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) +{ + int cost = 0; + int b; + TEMP_CONTEXT t, t2; + int type = 0; + + + vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4); + + if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED) + type = 3; + + vp8_super_slow_yquant_optimization(x, type, rtcd); + /* + for(b=0;b<16;b++) + { + vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + } + */ + + vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2); + + for (b = 16; b < 20; b++) + { + vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd); + } + + vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2); + + for (b = 20; b < 24; b++) + { + vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd); + } +} + + +void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) +{ + int cost = 0; + int b; + TEMP_CONTEXT t; + int type = 0; + + if (!x->e_mbd.above_context[Y1CONTEXT]) + return; + + if (!x->e_mbd.left_context[Y1CONTEXT]) + return; + + vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4); + + if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED) + type = 3; + + for (b = 0; b < 16; b++) + { + vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd); + } + +} + +void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) +{ + int cost = 0; + int b; + TEMP_CONTEXT t, t2; + int type = 0; + + if (!x->e_mbd.above_context[UCONTEXT]) + return; + + if (!x->e_mbd.left_context[UCONTEXT]) + return; + + if (!x->e_mbd.above_context[VCONTEXT]) + return; + + if (!x->e_mbd.left_context[VCONTEXT]) + return; + + + vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2); + vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2); + + for (b = 16; b < 20; b++) + { + vp8_optimize_b(x, b, vp8_block2type[b], + t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd); + + } + + for (b = 20; b < 24; b++) + { + vp8_optimize_b(x, b, vp8_block2type[b], + t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd); + } + +} +#endif + +void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_inter_predictors_mb(&x->e_mbd); + + vp8_subtract_mb(rtcd, x); + + vp8_transform_mb(x); + + vp8_quantize_mb(x); + +#if !(CONFIG_REALTIME_ONLY) +#if 1 + + if (x->optimize && x->rddiv > 1) + { + vp8_optimize_mb(x, rtcd); + vp8_find_mb_skip_coef(x); + } + +#endif +#endif + + vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd); +} + + +/* this funciton is used by first pass only */ +void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_inter_predictors_mby(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); + + vp8_transform_mby(x); + + vp8_quantize_mby(x); + + vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd); +} + + +void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_inter_predictors_mbuv(&x->e_mbd); + + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, 
x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + + vp8_transform_mbuv(x); + + vp8_quantize_mbuv(x); + + vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + + vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); +} + + +void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) +{ + vp8_build_inter_predictors_mbuv(&x->e_mbd); + ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); + + vp8_transform_mbuvrd(x); + + vp8_quantize_mbuvrd(x); + +} diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h new file mode 100644 index 000000000..91ca8f552 --- /dev/null +++ b/vp8/encoder/encodemb.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_ENCODEMB_H +#define __INC_ENCODEMB_H + +#include "vpx_ports/config.h" +#include "block.h" + +#define prototype_mberr(sym) \ + int (sym)(MACROBLOCK *mb, int dc) + +#define prototype_berr(sym) \ + int (sym)(short *coeff, short *dqcoeff) + +#define prototype_mbuverr(sym) \ + int (sym)(MACROBLOCK *mb) + +#define prototype_subb(sym) \ + void (sym)(BLOCK *be,BLOCKD *bd, int pitch) + +#define prototype_submby(sym) \ + void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride) + +#define prototype_submbuv(sym) \ + void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\ + unsigned char *pred, int stride) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/encodemb_x86.h" +#endif + +#if ARCH_ARM +#include "arm/encodemb_arm.h" +#endif + +#ifndef vp8_encodemb_berr +#define vp8_encodemb_berr vp8_block_error_c +#endif +extern prototype_berr(vp8_encodemb_berr); + +#ifndef vp8_encodemb_mberr +#define vp8_encodemb_mberr vp8_mbblock_error_c +#endif +extern prototype_mberr(vp8_encodemb_mberr); + +#ifndef vp8_encodemb_mbuverr +#define vp8_encodemb_mbuverr vp8_mbuverror_c +#endif +extern prototype_mbuverr(vp8_encodemb_mbuverr); + +#ifndef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_c +#endif +extern prototype_subb(vp8_encodemb_subb); + +#ifndef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_c +#endif +extern prototype_submby(vp8_encodemb_submby); + +#ifndef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_c +#endif +extern prototype_submbuv(vp8_encodemb_submbuv); + + +typedef struct +{ + prototype_berr(*berr); + prototype_mberr(*mberr); + prototype_mbuverr(*mbuverr); + prototype_subb(*subb); + prototype_submby(*submby); + prototype_submbuv(*submbuv); +} vp8_encodemb_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn +#else +#define ENCODEMB_INVOKE(ctx,fn) vp8_encodemb_##fn +#endif + + + +#include "onyx_int.h" +struct VP8_ENCODER_RTCD; +void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x); + +extern void vp8_stuff_inter16x16(MACROBLOCK *x); + +void vp8_build_dcblock(MACROBLOCK *b); +void vp8_transform_mb(MACROBLOCK *mb); +void vp8_transform_mbuv(MACROBLOCK *x); +void vp8_transform_mbuvrd(MACROBLOCK *x); +void vp8_transform_intra_mby(MACROBLOCK *x); +void vp8_transform_intra_mbyrd(MACROBLOCK *x); +void Encode16x16Y(MACROBLOCK *x); +void Encode16x16UV(MACROBLOCK *x); 
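+/* Call sites resolve the vtable entries above through ENCODEMB_INVOKE; a
+ * representative use in the encoder is:
+ *
+ *   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer,
+ *                                            x->e_mbd.predictor,
+ *                                            x->src.y_stride);
+ *
+ * With CONFIG_RUNTIME_CPU_DETECT enabled this indirects through a
+ * vp8_encodemb_rtcd_vtable_t instance; otherwise the macro collapses to a
+ * direct call to the vp8_encodemb_* default bound above (vp8_subtract_mby_c
+ * here, unless a platform header overrides it).
+ */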
+void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x); +void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x); +void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd); +void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd); +void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x); +#endif diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c new file mode 100644 index 000000000..f287edc18 --- /dev/null +++ b/vp8/encoder/encodemv.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "common.h" +#include "encodemv.h" +#include "entropymode.h" +#include "systemdependent.h" + +#include <math.h> + +#ifdef ENTROPY_STATS +extern unsigned int active_section; +#endif + +static void encode_mvcomponent( + vp8_writer *const w, + const int v, + const struct mv_context *mvc +) +{ + const vp8_prob *p = mvc->prob; + const int x = v < 0 ? -v : v; + + if (x < mvnum_short) // Small + { + vp8_write(w, 0, p [mvpis_short]); + vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); + + if (!x) + return; // no sign bit + } + else // Large + { + int i = 0; + + vp8_write(w, 1, p [mvpis_short]); + + do + vp8_write(w, (x >> i) & 1, p [MVPbits + i]); + + while (++i < 3); + + i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ + + do + vp8_write(w, (x >> i) & 1, p [MVPbits + i]); + + while (--i > 3); + + if (x & 0xFFF0) + vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]); + } + + vp8_write(w, v < 0, p [MVPsign]); +} +#if 0 +static int max_mv_r = 0; +static int max_mv_c = 0; +#endif +void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc) +{ + +#if 0 + { + if (abs(mv->row >> 1) > max_mv_r) + { + FILE *f = fopen("maxmv.stt", "a"); + max_mv_r = abs(mv->row >> 1); + fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1)); + + if ((abs(mv->row) / 2) != max_mv_r) + fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2); + + fclose(f); + } + + if (abs(mv->col >> 1) > max_mv_c) + { + FILE *f = fopen("maxmv.stt", "a"); + fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1)); + max_mv_c = abs(mv->col >> 1); + fclose(f); + } + } +#endif + + encode_mvcomponent(w, mv->row >> 1, &mvc[0]); + encode_mvcomponent(w, mv->col >> 1, &mvc[1]); +} + + +static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc) +{ + const vp8_prob *p = mvc->prob; + const int x = v; //v<0? 
-v:v; + unsigned int cost; + + if (x < mvnum_short) + { + cost = vp8_cost_zero(p [mvpis_short]) + + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3); + + if (!x) + return cost; + } + else + { + int i = 0; + cost = vp8_cost_one(p [mvpis_short]); + + do + cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1); + + while (++i < 3); + + i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ + + do + cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1); + + while (--i > 3); + + if (x & 240) + cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1); + } + + return cost; // + vp8_cost_bit( p [MVPsign], v < 0); +} +//#define M_LOG2_E 0.693147180559945309417 +//#define log2f(x) (log (x) / (float) M_LOG2_E) + +void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]) +{ + int i = 1; //-mv_max; + unsigned int cost0 = 0; + unsigned int cost1 = 0; + + vp8_clear_system_state(); +#if 0 + mvsadcost [0] [0] = 300; + mvsadcost [1] [0] = 300; + + do + { + double z = 256 * (2 * (log2f(2 * i) + .6)); + mvsadcost [0][i] = (int) z; + mvsadcost [1][i] = (int) z; + mvsadcost [0][-i] = (int) z; + mvsadcost [1][-i] = (int) z; + } + while (++i <= mv_max); + +#endif + + i = 1; + + if (mvc_flag[0]) + { + mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]); + + do + { + //mvcost [0] [i] = cost_mvcomponent( i, &mvc[0]); + cost0 = cost_mvcomponent(i, &mvc[0]); + + mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]); + mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]); + } + while (++i <= mv_max); + } + + i = 1; + + if (mvc_flag[1]) + { + mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]); + + do + { + //mvcost [1] [i] = cost_mvcomponent( i, mvc[1]); + cost1 = cost_mvcomponent(i, &mvc[1]); + + mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]); + mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]); + } + while (++i <= mv_max); + } + + /* + i=-mv_max; + do + { + mvcost [0] [i] = cost_mvcomponent( i, mvc[0]); + mvcost [1] [i] = cost_mvcomponent( i, mvc[1]); + } + while( ++i <= mv_max); + */ +} + + +// Motion vector probability table update depends on benefit. +// Small correction allows for the fact that an update to an MV probability +// may have benefit in subsequent frames as well as the current one. + +#define MV_PROB_UPDATE_CORRECTION -1 + + +__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2]) +{ + const unsigned int tot = ct[0] + ct[1]; + + if (tot) + { + const vp8_prob x = ((ct[0] * 255) / tot) & -2; + *p = x ? 
x : 1; + } +} + +static void update( + vp8_writer *const w, + const unsigned int ct[2], + vp8_prob *const cur_p, + const vp8_prob new_p, + const vp8_prob update_p, + int *updated +) +{ + const int cur_b = vp8_cost_branch(ct, *cur_p); + const int new_b = vp8_cost_branch(ct, new_p); + const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8); + + if (cur_b - new_b > cost) + { + *cur_p = new_p; + vp8_write(w, 1, update_p); + vp8_write_literal(w, new_p >> 1, 7); + *updated = 1; + + } + else + vp8_write(w, 0, update_p); +} + +static void write_component_probs( + vp8_writer *const w, + struct mv_context *cur_mvc, + const struct mv_context *default_mvc_, + const struct mv_context *update_mvc, + const unsigned int events [MVvals], + unsigned int rc, + int *updated +) +{ + vp8_prob *Pcur = cur_mvc->prob; + const vp8_prob *default_mvc = default_mvc_->prob; + const vp8_prob *Pupdate = update_mvc->prob; + unsigned int is_short_ct[2], sign_ct[2]; + + unsigned int bit_ct [mvlong_width] [2]; + + unsigned int short_ct [mvnum_short]; + unsigned int short_bct [mvnum_short-1] [2]; + + vp8_prob Pnew [MVPcount]; + + (void) rc; + vp8_copy_array(Pnew, default_mvc, MVPcount); + + vp8_zero(is_short_ct) + vp8_zero(sign_ct) + vp8_zero(bit_ct) + vp8_zero(short_ct) + vp8_zero(short_bct) + + + //j=0 + { + int j = 0; + + const int c = events [mv_max]; + + is_short_ct [0] += c; // Short vector + short_ct [0] += c; // Magnitude distribution + } + + //j: 1 ~ mv_max (1023) + { + int j = 1; + + do + { + const int c1 = events [mv_max + j]; //positive + const int c2 = events [mv_max - j]; //negative + const int c = c1 + c2; + int a = j; + + sign_ct [0] += c1; + sign_ct [1] += c2; + + if (a < mvnum_short) + { + is_short_ct [0] += c; // Short vector + short_ct [a] += c; // Magnitude distribution + } + else + { + int k = mvlong_width - 1; + is_short_ct [1] += c; // Long vector + + /* bit 3 not always encoded. */ + do + bit_ct [k] [(a >> k) & 1] += c; + + while (--k >= 0); + } + } + while (++j <= mv_max); + } + + /* + { + int j = -mv_max; + do + { + + const int c = events [mv_max + j]; + int a = j; + + if( j < 0) + { + sign_ct [1] += c; + a = -j; + } + else if( j) + sign_ct [0] += c; + + if( a < mvnum_short) + { + is_short_ct [0] += c; // Short vector + short_ct [a] += c; // Magnitude distribution + } + else + { + int k = mvlong_width - 1; + is_short_ct [1] += c; // Long vector + + // bit 3 not always encoded. 
+ + do + bit_ct [k] [(a >> k) & 1] += c; + while( --k >= 0); + } + } while( ++j <= mv_max); + } + */ + + calc_prob(Pnew + mvpis_short, is_short_ct); + + calc_prob(Pnew + MVPsign, sign_ct); + + { + vp8_prob p [mvnum_short - 1]; /* actually only need branch ct */ + int j = 0; + + vp8_tree_probs_from_distribution( + 8, vp8_small_mvencodings, vp8_small_mvtree, + p, short_bct, short_ct, + 256, 1 + ); + + do + calc_prob(Pnew + MVPshort + j, short_bct[j]); + + while (++j < mvnum_short - 1); + } + + { + int j = 0; + + do + calc_prob(Pnew + MVPbits + j, bit_ct[j]); + + while (++j < mvlong_width); + } + + update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated); + + update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated); + + { + const vp8_prob *const new_p = Pnew + MVPshort; + vp8_prob *const cur_p = Pcur + MVPshort; + + int j = 0; + + do + + update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); + + while (++j < mvnum_short - 1); + } + + { + const vp8_prob *const new_p = Pnew + MVPbits; + vp8_prob *const cur_p = Pcur + MVPbits; + + int j = 0; + + do + + update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); + + while (++j < mvlong_width); + } +} + +void vp8_write_mvprobs(VP8_COMP *cpi) +{ + vp8_writer *const w = & cpi->bc; + MV_CONTEXT *mvc = cpi->common.fc.mvc; + int flags[2] = {0, 0}; +#ifdef ENTROPY_STATS + active_section = 4; +#endif + write_component_probs( + w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0] + ); + write_component_probs( + w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1] + ); + + if (flags[0] || flags[1]) + vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags); + +#ifdef ENTROPY_STATS + active_section = 5; +#endif +} diff --git a/vp8/encoder/encodemv.h b/vp8/encoder/encodemv.h new file mode 100644 index 000000000..1c1f450a0 --- /dev/null +++ b/vp8/encoder/encodemv.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_ENCODEMV_H +#define __INC_ENCODEMV_H + +#include "onyx_int.h" + +void vp8_write_mvprobs(VP8_COMP *); +void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *); +void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]); + +#endif diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c new file mode 100644 index 000000000..a0b50d2a1 --- /dev/null +++ b/vp8/encoder/ethreading.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "onyx_int.h" +#include "threading.h" +#include "common.h" +#include "extend.h" + + +extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); +extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); +extern void vp8_build_block_offsets(MACROBLOCK *x); +extern void vp8_setup_block_ptrs(MACROBLOCK *x); + +static +THREAD_FUNCTION thread_encoding_proc(void *p_data) +{ +#if CONFIG_MULTITHREAD + int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread; + VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1); + MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2); + ENTROPY_CONTEXT mb_row_left_context[4][4]; + + //printf("Started thread %d\n", ithread); + + while (1) + { + if (cpi->b_multi_threaded == 0) + break; + + //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0) + if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0) + { + if (cpi->b_multi_threaded == FALSE) // we're shutting down + break; + else + { + VP8_COMMON *cm = &cpi->common; + int mb_row = mbri->mb_row; + MACROBLOCK *x = &mbri->mb; + MACROBLOCKD *xd = &x->e_mbd; + TOKENEXTRA **tp = &mbri->tp; + int *segment_counts = mbri->segment_counts; + int *totalrate = &mbri->totalrate; + + { + int i; + int recon_yoffset, recon_uvoffset; + int mb_col; + int recon_y_stride = cm->last_frame.y_stride; + int recon_uv_stride = cm->last_frame.uv_stride; + volatile int *last_row_current_mb_col; + + if (ithread > 0) + last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col; + else + last_row_current_mb_col = &cpi->current_mb_col_main; + + // reset above block coeffs + xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT]; + xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ]; + xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ]; + xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT]; + xd->left_context = mb_row_left_context; + + vp8_zero(mb_row_left_context); + + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + + cpi->tplist[mb_row].start = *tp; + + //printf("Thread mb_row = %d\n", mb_row); + + // for each macroblock col in image + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int seg_map_index = (mb_row * cm->mb_cols); + + while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1) + { + x86_pause_hint(); + thread_sleep(0); + } + + // Distance of Mb to the various image edges. 
+ // These specified to 8th pel as they are always compared to values that are in 1/8th pel units + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; + + // Set up limit values for motion vectors used to prevent them extending outside the UMV borders + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + // Is segmentation enabled + // MB level adjutment to quantizer + if (xd->segmentation_enabled) + { + // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking) + if (cpi->segmentation_map[seg_map_index+mb_col] <= 3) + xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col]; + else + xd->mbmi.segment_id = 0; + + vp8cx_mb_init_quantizer(cpi, x); + } + else + xd->mbmi.segment_id = 0; // Set to Segment 0 by default + + + if (cm->frame_type == KEY_FRAME) + { + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); +#ifdef MODE_STATS + y_modes[xd->mbmi.mode] ++; +#endif + } + else + { + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + +#ifdef MODE_STATS + inter_y_modes[xd->mbmi.mode] ++; + + if (xd->mbmi.mode == SPLITMV) + { + int b; + + for (b = 0; b < xd->mbmi.partition_count; b++) + { + inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++; + } + } + +#endif + + // Count of last ref frame 0,0 useage + if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME)) + cpi->inter_zz_count ++; + + } + + cpi->tplist[mb_row].stop = *tp; + + xd->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb + + // store macroblock mode info into context array + vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi)); + + for (i = 0; i < 16; i++) + vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi)); + + // adjust to the next column of macroblocks + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + // Keep track of segment useage + segment_counts[xd->mbmi.segment_id] ++; + + // skip to next mb + xd->mode_info_context++; + + xd->above_context[Y1CONTEXT] += 4; + xd->above_context[UCONTEXT ] += 2; + xd->above_context[VCONTEXT ] += 2; + xd->above_context[Y2CONTEXT] ++; + + cpi->mb_row_ei[ithread].current_mb_col = mb_col; + + } + + //extend the recon for intra prediction + vp8_extend_mb_row( + &cm->new_frame, + xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, + xd->dst.v_buffer + 8); + + // this is to account for the border + xd->mode_info_context++; + + x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols; + + xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count; + + if (ithread == (cpi->encoding_thread_count - 1) || 
mb_row == cm->mb_rows - 1) + { + //SetEvent(cpi->h_event_main); + sem_post(&cpi->h_event_main); + } + + } + + } + } + } + +#else + (void) p_data; +#endif + + //printf("exit thread %d\n", ithread); + return 0; +} + +static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) +{ + + MACROBLOCK *x = mbsrc; + MACROBLOCK *z = mbdst; + int i; + + z->ss = x->ss; + z->ss_count = x->ss_count; + z->searches_per_step = x->searches_per_step; + z->errorperbit = x->errorperbit; + + z->sadperbit16 = x->sadperbit16; + z->sadperbit4 = x->sadperbit4; + z->errthresh = x->errthresh; + z->rddiv = x->rddiv; + z->rdmult = x->rdmult; + + /* + z->mv_col_min = x->mv_col_min; + z->mv_col_max = x->mv_col_max; + z->mv_row_min = x->mv_row_min; + z->mv_row_max = x->mv_row_max; + z->vector_range = x->vector_range ; + */ + + z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4; + z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4; + z->short_fdct4x4rd = x->short_fdct4x4rd; + z->short_fdct8x4rd = x->short_fdct8x4rd; + z->short_fdct8x4rd = x->short_fdct8x4rd; + z->vp8_short_fdct4x4_ptr = x->vp8_short_fdct4x4_ptr; + z->short_walsh4x4 = x->short_walsh4x4; + z->quantize_b = x->quantize_b; + z->quantize_brd = x->quantize_brd; + + /* + z->mvc = x->mvc; + z->src.y_buffer = x->src.y_buffer; + z->src.u_buffer = x->src.u_buffer; + z->src.v_buffer = x->src.v_buffer; + */ + + + vpx_memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts)); + z->mvcost[0] = &z->mvcosts[0][mv_max+1]; + z->mvcost[1] = &z->mvcosts[1][mv_max+1]; + z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1]; + z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1]; + + + vpx_memcpy(z->token_costs, x->token_costs, sizeof(x->token_costs)); + vpx_memcpy(z->inter_bmode_costs, x->inter_bmode_costs, sizeof(x->inter_bmode_costs)); + //memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts)); + //memcpy(z->mvcost, x->mvcost, sizeof(x->mvcost)); + vpx_memcpy(z->mbmode_cost, x->mbmode_cost, sizeof(x->mbmode_cost)); + vpx_memcpy(z->intra_uv_mode_cost, x->intra_uv_mode_cost, sizeof(x->intra_uv_mode_cost)); + vpx_memcpy(z->bmode_costs, x->bmode_costs, sizeof(x->bmode_costs)); + + for (i = 0; i < 25; i++) + { + z->block[i].quant = x->block[i].quant; + z->block[i].zbin = x->block[i].zbin; + z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost; + z->block[i].round = x->block[i].round; + /* + z->block[i].src = x->block[i].src; + */ + z->block[i].src_stride = x->block[i].src_stride; + z->block[i].force_empty = x->block[i].force_empty; + + } + + { + MACROBLOCKD *xd = &x->e_mbd; + MACROBLOCKD *zd = &z->e_mbd; + + /* + zd->mode_info_context = xd->mode_info_context; + zd->mode_info = xd->mode_info; + + zd->mode_info_stride = xd->mode_info_stride; + zd->frame_type = xd->frame_type; + zd->up_available = xd->up_available ; + zd->left_available = xd->left_available; + zd->left_context = xd->left_context; + zd->last_frame_dc = xd->last_frame_dc; + zd->last_frame_dccons = xd->last_frame_dccons; + zd->gold_frame_dc = xd->gold_frame_dc; + zd->gold_frame_dccons = xd->gold_frame_dccons; + zd->mb_to_left_edge = xd->mb_to_left_edge; + zd->mb_to_right_edge = xd->mb_to_right_edge; + zd->mb_to_top_edge = xd->mb_to_top_edge ; + zd->mb_to_bottom_edge = xd->mb_to_bottom_edge; + zd->gf_active_ptr = xd->gf_active_ptr; + zd->frames_since_golden = xd->frames_since_golden; + zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame; + */ + zd->subpixel_predict = xd->subpixel_predict; + zd->subpixel_predict8x4 = xd->subpixel_predict8x4; + zd->subpixel_predict8x8 = xd->subpixel_predict8x8; + zd->subpixel_predict16x16 = 
xd->subpixel_predict16x16; + zd->segmentation_enabled = xd->segmentation_enabled; + zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); + + /* + memcpy(zd->above_context, xd->above_context, sizeof(xd->above_context)); + memcpy(zd->mb_segment_tree_probs, xd->mb_segment_tree_probs, sizeof(xd->mb_segment_tree_probs)); + memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); + */ + for (i = 0; i < 25; i++) + { + zd->block[i].dequant = xd->block[i].dequant; + } + } +} + + +void vp8cx_init_mbrthread_data(VP8_COMP *cpi, + MACROBLOCK *x, + MB_ROW_COMP *mbr_ei, + int mb_row, + int count + ) +{ + + VP8_COMMON *const cm = & cpi->common; + MACROBLOCKD *const xd = & x->e_mbd; + int i; + (void) mb_row; + + for (i = 0; i < count; i++) + { + MACROBLOCK *mb = & mbr_ei[i].mb; + MACROBLOCKD *mbd = &mb->e_mbd; + + mbd->subpixel_predict = xd->subpixel_predict; + mbd->subpixel_predict8x4 = xd->subpixel_predict8x4; + mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; + mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; +#if CONFIG_RUNTIME_CPU_DETECT + mbd->rtcd = xd->rtcd; +#endif + mbd->gf_active_ptr = xd->gf_active_ptr; + + mb->vector_range = 32; + + vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts)); + mbr_ei[i].totalrate = 0; + + mbd->mode_info = cm->mi - 1; + mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1); + mbd->mode_info_stride = cm->mode_info_stride; + + mbd->frame_type = cm->frame_type; + + mbd->frames_since_golden = cm->frames_since_golden; + mbd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame; + + mb->src = * cpi->Source; + mbd->pre = cm->last_frame; + mbd->dst = cm->new_frame; + + mb->src.y_buffer += 16 * x->src.y_stride * (i + 1); + mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1); + mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1); + + + vp8_build_block_offsets(mb); + + vp8_setup_block_dptrs(mbd); + + vp8_setup_block_ptrs(mb); + + mb->rddiv = cpi->RDDIV; + mb->rdmult = cpi->RDMULT; + + mbd->mbmi.mode = DC_PRED; + mbd->mbmi.uv_mode = DC_PRED; + + mbd->left_context = cm->left_context; + mb->mvc = cm->fc.mvc; + + setup_mbby_copy(&mbr_ei[i].mb, x); + + } +} + + +void vp8cx_create_encoder_threads(VP8_COMP *cpi) +{ + cpi->b_multi_threaded = 0; + + cpi->processor_core_count = 32; //vp8_get_proc_core_count(); + + CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows)); + +#if CONFIG_MULTITHREAD + + if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) + { + int ithread; + + if (cpi->oxcf.multi_threaded > cpi->processor_core_count) + cpi->encoding_thread_count = cpi->processor_core_count - 1; + else + cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1; + + + CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count)); + CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count)); + CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count)); + vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count); + CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count)); + //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL); + sem_init(&cpi->h_event_main, 0, 0); + + cpi->b_multi_threaded = 1; + + //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", 
(cpi->encoding_thread_count +1)); + + for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++) + { + //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL); + sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0); + cpi->en_thread_data[ithread].ithread = ithread; + cpi->en_thread_data[ithread].ptr1 = (void *)cpi; + cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread]; + + //printf(" call begin thread %d \n", ithread); + + //cpi->h_encoding_thread[ithread] = (HANDLE)_beginthreadex( + // NULL, // security + // 0, // stksize + // thread_encoding_proc, + // (&cpi->en_thread_data[ithread]), // Thread data + // 0, + // NULL); + + pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread])); + + } + + } + +#endif +} + +void vp8cx_remove_encoder_threads(VP8_COMP *cpi) +{ +#if CONFIG_MULTITHREAD + + if (cpi->b_multi_threaded) + { + //shutdown other threads + cpi->b_multi_threaded = 0; + { + int i; + + for (i = 0; i < cpi->encoding_thread_count; i++) + { + //SetEvent(cpi->h_event_mbrencoding[i]); + sem_post(&cpi->h_event_mbrencoding[i]); + pthread_join(cpi->h_encoding_thread[i], 0); + } + + for (i = 0; i < cpi->encoding_thread_count; i++) + sem_destroy(&cpi->h_event_mbrencoding[i]); + } + //free thread related resources + vpx_free(cpi->h_event_mbrencoding); + vpx_free(cpi->h_encoding_thread); + vpx_free(cpi->mb_row_ei); + vpx_free(cpi->en_thread_data); + } + +#endif + vpx_free(cpi->tplist); +} diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c new file mode 100644 index 000000000..c519080b2 --- /dev/null +++ b/vp8/encoder/firstpass.c @@ -0,0 +1,2512 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "math.h" +#include "limits.h" +#include "block.h" +#include "onyx_int.h" +#include "variance.h" +#include "encodeintra.h" +#include "setupintrarecon.h" +#include "mcomp.h" +#include "vpx_scale/vpxscale.h" +#include "encodemb.h" +#include "extend.h" +#include "systemdependent.h" +#include "vpx_scale/yv12extend.h" +#include "vpx_mem/vpx_mem.h" +#include "swapyv12buffer.h" +#include <stdio.h> +#include "rdopt.h" +#include "quant_common.h" +#include "encodemv.h" + +//#define OUTPUT_FPF 1 +//#define FIRSTPASS_MM 1 + +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif + +extern void vp8_build_block_offsets(MACROBLOCK *x); +extern void vp8_setup_block_ptrs(MACROBLOCK *x); +extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi); +extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv); +extern void vp8_alloc_compressor_data(VP8_COMP *cpi); + +//#define GFQ_ADJUSTMENT (40 + ((15*Q)/10)) +//#define GFQ_ADJUSTMENT (80 + ((15*Q)/10)) +#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q] +extern int vp8_kf_boost_qadjustment[QINDEX_RANGE]; + +extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE]; + +#define IIFACTOR 1.4 +#define IIKFACTOR1 1.40 +#define IIKFACTOR2 1.5 +#define RMAX 14.0 +#define GF_RMAX 48.0 // 128.0 + +#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001) + +#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0 +#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0 + +static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3}; +static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3}; + + +void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame); +int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps); + +int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) +{ + + int i; + int intra_pred_var = 0; + (void) cpi; + + if (use_dc_pred) + { + x->e_mbd.mbmi.mode = DC_PRED; + x->e_mbd.mbmi.uv_mode = DC_PRED; + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + + vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); + } + else + { + for (i = 0; i < 16; i++) + { + BLOCKD *b = &x->e_mbd.block[i]; + BLOCK *be = &x->block[i]; + + vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, be, b, B_DC_PRED); + } + } + + intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff); + + return intra_pred_var; +} + +// Resets the first pass file to the given position using a relative seek from the current position +static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position) +{ + cpi->stats_in = Position; +} + +static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) +{ + /*FIRSTPASS_STATS * start_pos; + int ret_val; + + start_pos = cpi->stats_in; + ret_val = vp8_input_stats(cpi, next_frame); + reset_fpf_position(cpi, start_pos); + + return ret_val;*/ + + if (cpi->stats_in >= cpi->stats_in_end) + return EOF; + + *next_frame = *cpi->stats_in; + return 1; +} + +// Calculate a modified Error used in distributing bits between easier and harder frames +static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) +{ + double av_err = cpi->total_stats.ssim_weighted_pred_err; + double this_err = this_frame->ssim_weighted_pred_err; + double modified_err; + + //double relative_next_iiratio; + //double next_iiratio; + //double sum_iiratio; + //int i; + + //FIRSTPASS_STATS next_frame; + //FIRSTPASS_STATS *start_pos; + + /*start_pos = cpi->stats_in; + sum_iiratio = 0.0; + i = 0; + while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF ) + { + + 
next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error); + next_iiratio = ( next_iiratio < 1.0 ) ? 1.0 : (next_iiratio > 20.0) ? 20.0 : next_iiratio; + sum_iiratio += next_iiratio; + i++; + } + if ( i > 0 ) + { + relative_next_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK(cpi->avg_iiratio * (double)i); + } + else + { + relative_next_iiratio = 1.0; + } + reset_fpf_position(cpi, start_pos);*/ + + if (this_err > av_err) + modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1); + else + modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2); + + /* + relative_next_iiratio = pow(relative_next_iiratio,0.25); + modified_err = modified_err * relative_next_iiratio; + */ + + return modified_err; +} + +double vp8_simple_weight(YV12_BUFFER_CONFIG *source) +{ + int i, j; + int Total = 0; + + unsigned char *src = source->y_buffer; + unsigned char value; + double sum_weights = 0.0; + double Weight; + + // Loop throught the Y plane raw examining levels and creating a weight for the image + for (i = 0; i < source->y_height; i++) + { + for (j = 0; j < source->y_width; j++) + { + value = src[j]; + + if (value >= 64) + Weight = 1.0; + else if (value > 32) + Weight = (value - 32.0f) / 32.0f; + else + Weight = 0.02; + + sum_weights += Weight; + } + + src += source->y_stride; + } + + sum_weights /= (source->y_height * source->y_width); + + return sum_weights; +} + +// This function returns the current per frame maximum bitrate target +int frame_max_bits(VP8_COMP *cpi) +{ + // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left + int max_bits; + + // For CBR we need to also consider buffer fullness. + // If we are running below the optimal level then we need to gradually tighten up on max_bits. + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + double buffer_fullness_ratio = (double)DOUBLE_DIVIDE_CHECK(cpi->buffer_level) / (double)cpi->oxcf.optimal_buffer_level; + + // For CBR base this on the target average bits per frame plus the maximum sedction rate passed in by the user + max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + + // If our buffer is below the optimum level + if (buffer_fullness_ratio < 1.0) + { + // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4. + int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2; + + max_bits = (int)(max_bits * buffer_fullness_ratio); + + if (max_bits < min_max_bits) + max_bits = min_max_bits; // Lowest value we will set ... which should allow the buffer to refil. 
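+            // Illustration with made-up numbers: av_per_frame_bandwidth =
+            // 4000 bits, two_pass_vbrmax_section = 400 (%) and a buffer at
+            // half of its optimal level give max_bits = 4000 * 4.0 = 16000,
+            // scaled to 16000 * 0.5 = 8000, with a floor of
+            // min(4000 >> 2, 16000 >> 2) = 1000, so 8000 bits is allowed.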
+ } + } + // VBR + else + { + // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user + max_bits = (int)(((double)cpi->bits_left / (cpi->total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + } + + // Trap case where we are out of bits + if (max_bits < 0) + max_bits = 0; + + return max_bits; +} + +void vp8_output_stats(struct vpx_codec_pkt_list *pktlist, + FIRSTPASS_STATS *stats) +{ + struct vpx_codec_cx_pkt pkt; + pkt.kind = VPX_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(*stats); + vpx_codec_pkt_list_add(pktlist, &pkt); + +// TEMP debug code +#ifdef OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n", + stats->frame, + stats->intra_error, + stats->coded_error, + stats->ssim_weighted_pred_err, + stats->pcnt_inter, + stats->pcnt_motion, + stats->pcnt_second_ref, + stats->MVr, + stats->mvr_abs, + stats->MVc, + stats->mvc_abs, + stats->MVrv, + stats->MVcv, + stats->mv_in_out_count, + stats->count); + fclose(fpfile); + } +#endif +} + +int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) +{ + if (cpi->stats_in >= cpi->stats_in_end) + return EOF; + + *fps = *cpi->stats_in++; + return 1; +} + +void vp8_zero_stats(FIRSTPASS_STATS *section) +{ + section->frame = 0.0; + section->intra_error = 0.0; + section->coded_error = 0.0; + section->ssim_weighted_pred_err = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->count = 0.0; + section->duration = 1.0; +} +void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) +{ + section->frame += frame->frame; + section->intra_error += frame->intra_error; + section->coded_error += frame->coded_error; + section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->count += frame->count; + section->duration += frame->duration; +} +void vp8_avg_stats(FIRSTPASS_STATS *section) +{ + if (section->count < 1.0) + return; + + section->intra_error /= section->count; + section->coded_error /= section->count; + section->ssim_weighted_pred_err /= section->count; + section->pcnt_inter /= section->count; + section->pcnt_second_ref /= section->count; + section->pcnt_motion /= section->count; + section->MVr /= section->count; + section->mvr_abs /= section->count; + section->MVc /= section->count; + section->mvc_abs /= section->count; + section->MVrv /= section->count; + section->MVcv /= section->count; + section->mv_in_out_count /= section->count; + section->duration /= section->count; +} + +int vp8_fpmm_get_pos(VP8_COMP *cpi) +{ + return ftell(cpi->fp_motion_mapfile); +} +void vp8_fpmm_reset_pos(VP8_COMP *cpi, int target_pos) +{ + int Offset; + + if (cpi->fp_motion_mapfile) + { + Offset = 
ftell(cpi->fp_motion_mapfile) - target_pos; + fseek(cpi->fp_motion_mapfile, (int) - Offset, SEEK_CUR); + } +} + +void vp8_advance_fpmm(VP8_COMP *cpi, int count) +{ +#ifdef FIRSTPASS_MM + fseek(cpi->fp_motion_mapfile, (int)(count * cpi->common.MBs), SEEK_CUR); +#endif +} + +void vp8_input_fpmm(VP8_COMP *cpi, int count) +{ +#ifdef FIRSTPASS_MM + + unsigned char *tmp_motion_map; + int i, j; + + if (!cpi->fp_motion_mapfile) + return; // Error + + // Create the first pass motion map structure and set to 0 + CHECK_MEM_ERROR(tmp_motion_map, vpx_calloc(cpi->common.MBs, 1)); + + // Reset the state of the global map + vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs); + + // Read the specified number of frame maps and set the global map to the highest value seen for each mb. + for (i = 0; i < count; i++) + { + if (fread(tmp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile) == cpi->common.MBs) + { + for (j = 0; j < cpi->common.MBs; j++) + { + if (tmp_motion_map[j] > 1) + cpi->fp_motion_map[j] += 5; // Intra is flagged + else + cpi->fp_motion_map[j] += tmp_motion_map[j]; + } + } + else + break; // Read error + + } + + if (tmp_motion_map != 0) + vpx_free(tmp_motion_map); + +#endif + +} + +void vp8_init_first_pass(VP8_COMP *cpi) +{ + vp8_zero_stats(&cpi->total_stats); + +#ifdef FIRSTPASS_MM + cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "wb"); +#endif + +// TEMP debug code +#ifdef OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "w"); + fclose(fpfile); + } +#endif + +} + +void vp8_end_first_pass(VP8_COMP *cpi) +{ + vp8_output_stats(cpi->output_pkt_list, &cpi->total_stats); + +#ifdef FIRSTPASS_MM + + if (cpi->fp_motion_mapfile) + fclose(cpi->fp_motion_mapfile); + +#endif + +} +void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset ) +{ + MACROBLOCKD * const xd = & x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + + unsigned char *src_ptr = (*(b->base_src) + b->src); + int src_stride = b->src_stride; + unsigned char *ref_ptr; + int ref_stride=d->pre_stride; + + // Set up pointers for this macro block recon buffer + xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; + + ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre ); + + VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err)); +} + + +void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset ) +{ + MACROBLOCKD *const xd = & x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + int num00; + + MV tmp_mv = {0, 0}; + + int tmp_err; + int step_param = 3; //3; // Dont search over full range for first pass + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; //3; + int n; + vp8_variance_fn_ptr_t v_fn_ptr; + int new_mv_mode_penalty = 256; + + v_fn_ptr.vf = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16); + v_fn_ptr.sdf = cpi->fn_ptr.sdf; + v_fn_ptr.sdx4df = cpi->fn_ptr.sdx4df; + + // Set up pointers for this macro block recon buffer + xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; + + // Initial step/diamond search centred on best mv + tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost); + if ( tmp_err < INT_MAX-new_mv_mode_penalty ) + tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) + { + 
*best_motion_err = tmp_err; + best_mv->row = tmp_mv.row; + best_mv->col = tmp_mv.col; + } + + // Further step/diamond searches as necessary + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost); + if ( tmp_err < INT_MAX-new_mv_mode_penalty ) + tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) + { + *best_motion_err = tmp_err; + best_mv->row = tmp_mv.row; + best_mv->col = tmp_mv.col; + } + } + } +} + +void vp8_first_pass(VP8_COMP *cpi) +{ + int mb_row, mb_col; + MACROBLOCK *const x = & cpi->mb; + VP8_COMMON *const cm = & cpi->common; + MACROBLOCKD *const xd = & x->e_mbd; + + int col_blocks = 4 * cm->mb_cols; + int recon_yoffset, recon_uvoffset; + int recon_y_stride = cm->last_frame.y_stride; + int recon_uv_stride = cm->last_frame.uv_stride; + int intra_error = 0; + int coded_error = 0; + + int sum_mvr = 0, sum_mvc = 0; + int sum_mvr_abs = 0, sum_mvc_abs = 0; + int sum_mvrs = 0, sum_mvcs = 0; + int mvcount = 0; + int intercount = 0; + int second_ref_count = 0; + int intrapenalty = 256; + + int sum_in_vectors = 0; + + MV best_ref_mv = {0, 0}; + MV zero_ref_mv = {0, 0}; + + unsigned char *fp_motion_map_ptr = cpi->fp_motion_map; + + vp8_clear_system_state(); //__asm emms; + + x->src = * cpi->Source; + xd->pre = cm->last_frame; + xd->dst = cm->new_frame; + + vp8_build_block_offsets(x); + + vp8_setup_block_dptrs(&x->e_mbd); + + vp8_setup_block_ptrs(x); + + // set up frame new frame for intra coded blocks + vp8_setup_intra_recon(&cm->new_frame); + vp8cx_frame_init_quantizer(cpi); + + // Initialise the MV cost table to the defaults + //if( cm->current_video_frame == 0) + //if ( 0 ) + { + int flag[2] = {1, 1}; + vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q)); + vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context)); + vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag); + } + + // for each macroblock row in image + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + MV best_ref_mv = {0, 0}; + + // reset above block coeffs + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + // for each macroblock col in image + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int this_error; + int gf_motion_error = INT_MAX; + int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + + xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + // do intra 16x16 prediction + this_error = vp8_encode_intra(cpi, x, use_dc_pred); + + // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame) + // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv. + // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. 
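+            // (For reference, the fixed overhead used here, intrapenalty =
+            // 256, appears to mirror new_mv_mode_penalty = 256 added in
+            // vp8_first_pass_motion_search(), so intra and non-zero-mv inter
+            // candidates are compared on a like-for-like basis against the
+            // penalty-free 0,0 search.)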
+ this_error += intrapenalty; + + // Cumulative intra error total + intra_error += this_error; + + // Indicate default assumption of intra in the motion map + *fp_motion_map_ptr = 2; + + // Set up limit values for motion vectors to prevent them extending outside the UMV borders + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + // Other than for the first frame do a motion search + if (cm->current_video_frame > 0) + { + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MV tmp_mv = {0, 0}; + int tmp_err; + int motion_error = INT_MAX; + + // Simple 0,0 motion with no mv overhead + vp8_zz_motion_search( cpi, x, &cm->last_frame, &motion_error, recon_yoffset ); + d->bmi.mv.as_mv.row = 0; + d->bmi.mv.as_mv.col = 0; + + // Test last reference frame using the previous best mv as the starting point (best reference) for the search + vp8_first_pass_motion_search(cpi, x, &best_ref_mv, &d->bmi.mv.as_mv, &cm->last_frame, &motion_error, recon_yoffset); + + // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well + if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0)) + { + tmp_err = INT_MAX; + vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->last_frame, &motion_error, recon_yoffset); + + if ( tmp_err < motion_error ) + { + motion_error = tmp_err; + d->bmi.mv.as_mv.row = tmp_mv.row; + d->bmi.mv.as_mv.col = tmp_mv.col; + } + + } + + // Experimental search in a second reference frame ((0,0) based only) + if (cm->current_video_frame > 1) + { + vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->golden_frame, &gf_motion_error, recon_yoffset); + + if ((gf_motion_error < motion_error) && (gf_motion_error < this_error)) + { + second_ref_count++; + //motion_error = gf_motion_error; + //d->bmi.mv.as_mv.row = tmp_mv.row; + //d->bmi.mv.as_mv.col = tmp_mv.col; + } + /*else + { + xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset; + xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset; + xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset; + }*/ + + + // Reset to last frame as reference buffer + xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset; + xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset; + xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset; + } + + if (motion_error <= this_error) + { + d->bmi.mv.as_mv.row <<= 3; + d->bmi.mv.as_mv.col <<= 3; + this_error = motion_error; + vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv.as_mv); + vp8_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x); + sum_mvr += d->bmi.mv.as_mv.row; + sum_mvr_abs += abs(d->bmi.mv.as_mv.row); + sum_mvc += d->bmi.mv.as_mv.col; + sum_mvc_abs += abs(d->bmi.mv.as_mv.col); + sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row; + sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col; + intercount++; + + best_ref_mv.row = d->bmi.mv.as_mv.row; + best_ref_mv.col = d->bmi.mv.as_mv.col; + //best_ref_mv.row = 0; + //best_ref_mv.col = 0; + + // Was the vector non-zero + if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col) + { + mvcount++; + + *fp_motion_map_ptr = 1; + + // Does the Row vector point inwards or outwards + if (mb_row < cm->mb_rows / 2) + { + if (d->bmi.mv.as_mv.row > 0) + sum_in_vectors--; + else if (d->bmi.mv.as_mv.row < 0) + sum_in_vectors++; + } + else if (mb_row > cm->mb_rows / 2) + { + if 
(d->bmi.mv.as_mv.row > 0) + sum_in_vectors++; + else if (d->bmi.mv.as_mv.row < 0) + sum_in_vectors--; + } + + // Does the Row vector point inwards or outwards + if (mb_col < cm->mb_cols / 2) + { + if (d->bmi.mv.as_mv.col > 0) + sum_in_vectors--; + else if (d->bmi.mv.as_mv.col < 0) + sum_in_vectors++; + } + else if (mb_col > cm->mb_cols / 2) + { + if (d->bmi.mv.as_mv.col > 0) + sum_in_vectors++; + else if (d->bmi.mv.as_mv.col < 0) + sum_in_vectors--; + } + } + else + *fp_motion_map_ptr = 0; // 0,0 mv was best + } + else + { + best_ref_mv.row = 0; + best_ref_mv.col = 0; + } + } + + coded_error += this_error; + + // adjust to the next column of macroblocks + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + // Update the motion map + fp_motion_map_ptr++; + } + + // adjust to the next row of mbs + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + + //extend the recon for intra prediction + vp8_extend_mb_row(&cm->new_frame, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); + vp8_clear_system_state(); //__asm emms; + } + + vp8_clear_system_state(); //__asm emms; + { + double weight = 0.0; + double weigth2 = 0.0; + + FIRSTPASS_STATS fps; + + fps.frame = cm->current_video_frame ; + fps.intra_error = intra_error >> 8; + fps.coded_error = coded_error >> 8; + weight = vp8_simple_weight(cpi->Source); + + if (weight < 0.1) + weight = 0.1; + + fps.ssim_weighted_pred_err = fps.coded_error * weight; + + fps.pcnt_inter = 0.0; + fps.pcnt_motion = 0.0; + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.count = 1.0; + + fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs; + fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs; + + if (mvcount > 0) + { + fps.MVr = (double)sum_mvr / (double)mvcount; + fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount; + fps.MVc = (double)sum_mvc / (double)mvcount; + fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount; + fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount; + fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount; + fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2); + + fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs; + } + + // TODO: handle the case when duration is set to 0, or something less + // than the full time between subsequent cpi->source_time_stamp s . + fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp; + + // don't want to do outputstats with a stack variable! 
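Editor's note: the statistics assembled above include mv_in_out_count, the normalised tally of the inward/outward votes collected in the macroblock loop. The sketch below is not part of the commit; it reproduces the same sign convention and normalisation with made-up block positions and vectors.

#include <stdio.h>

/* One vote per vector component, mirroring the tally in the loop above:
   'pos' is the macroblock row (or column), 'half' is mb_rows/2 (or
   mb_cols/2) and 'mv' the corresponding vector component. */
static int in_out_vote(int pos, int half, int mv)
{
    if (pos < half)                                /* first half of frame  */
        return (mv > 0) ? -1 : (mv < 0) ? 1 : 0;
    if (pos > half)                                /* second half of frame */
        return (mv > 0) ? 1 : (mv < 0) ? -1 : 0;
    return 0;                                      /* centre line: no vote */
}

int main(void)
{
    /* Hypothetical 8x8-macroblock frame, four blocks with non-zero vectors. */
    int half = 4, sum_in_vectors = 0, mvcount = 0;
    int rows[4] = {1, 1, 6, 7}, cols[4] = {2, 6, 1, 7};
    int mvr[4]  = {-8, 4, 8, -4}, mvc[4] = {4, -8, -4, -8};

    for (int i = 0; i < 4; i++)
    {
        if (mvr[i] || mvc[i])
        {
            mvcount++;
            sum_in_vectors += in_out_vote(rows[i], half, mvr[i]);
            sum_in_vectors += in_out_vote(cols[i], half, mvc[i]);
        }
    }

    /* Same normalisation as fps.mv_in_out_count above; prints -0.250 here. */
    printf("mv_in_out_count = %.3f\n",
           (double)sum_in_vectors / (double)(mvcount * 2));
    return 0;
}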
+ cpi->this_frame_stats = fps; + vp8_output_stats(cpi->output_pkt_list, &cpi->this_frame_stats); + vp8_accumulate_stats(&cpi->total_stats, &fps); + +#ifdef FIRSTPASS_MM + fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile); +#endif + } + + // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met + if ((cm->current_video_frame > 0) && + (cpi->this_frame_stats.pcnt_inter > 0.20) && + ((cpi->this_frame_stats.intra_error / cpi->this_frame_stats.coded_error) > 2.0)) + { + vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame); + } + + // swap frame pointers so last frame refers to the frame we just compressed + vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame); + vp8_yv12_extend_frame_borders(&cm->last_frame); + + // Special case for the first frame. Copy into the GF buffer as a second reference. + if (cm->current_video_frame == 0) + { + vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame); + } + + + // use this to see what the first pass reconstruction looks like + if (0) + { + char filename[512]; + FILE *recon_file; + sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); + + if (cm->current_video_frame == 0) + recon_file = fopen(filename, "wb"); + else + recon_file = fopen(filename, "ab"); + + fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file); + fclose(recon_file); + } + + cm->current_video_frame++; + +} +extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; + +#define BASE_ERRPERMB 150 +static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +{ + int Q; + int num_mbs = ((Height * Width) / (16 * 16)); + int target_norm_bits_per_mb; + + double err_per_mb = section_err / num_mbs; + double correction_factor; + double corr_high; + double speed_correction = 1.0; + double rolling_ratio; + + double pow_highq = 0.90; + double pow_lowq = 0.40; + + if (section_target_bandwitdh <= 0) + return MAXQ; + + target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); + + // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits + if ((cpi->rolling_target_bits > 0.0) && (cpi->active_worst_quality < cpi->worst_quality)) + { + //double adjustment_rate = 0.985 + (0.00005 * cpi->active_worst_quality); + double adjustment_rate = 0.99; + + rolling_ratio = (double)cpi->rolling_actual_bits / (double)cpi->rolling_target_bits; + + //if ( cpi->est_max_qcorrection_factor > rolling_ratio ) + if (rolling_ratio < 0.95) + //cpi->est_max_qcorrection_factor *= adjustment_rate; + cpi->est_max_qcorrection_factor -= 0.005; + //else if ( cpi->est_max_qcorrection_factor < rolling_ratio ) + else if (rolling_ratio > 1.05) + cpi->est_max_qcorrection_factor += 0.005; + + //cpi->est_max_qcorrection_factor /= adjustment_rate; + + cpi->est_max_qcorrection_factor = (cpi->est_max_qcorrection_factor < 0.1) ? 0.1 : (cpi->est_max_qcorrection_factor > 10.0) ? 10.0 : cpi->est_max_qcorrection_factor; + } + + // Corrections for higher compression speed settings (reduced compression expected) + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) + { + if (cpi->oxcf.cpu_used <= 5) + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + else + speed_correction = 1.25; + } + + // Correction factor used for Q values >= 20 + corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq); + corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 
5.0 : corr_high; + + // Try and pick a Q that should be high enough to encode the content at the given rate. + for (Q = 0; Q < MAXQ; Q++) + { + int bits_per_mb_at_this_q; + + if (Q < 50) + { + correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01)); + correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor; + } + else + correction_factor = corr_high; + + bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * cpi->section_max_qfactor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0); + //bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) + break; + } + + return Q; +} +static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +{ + int Q; + int num_mbs = ((Height * Width) / (16 * 16)); + int target_norm_bits_per_mb; + + double err_per_mb = section_err / num_mbs; + double correction_factor; + double corr_high; + double speed_correction = 1.0; + double pow_highq = 0.90; + double pow_lowq = 0.40; + + target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); + + // Corrections for higher compression speed settings (reduced compression expected) + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) + { + if (cpi->oxcf.cpu_used <= 5) + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + else + speed_correction = 1.25; + } + + // Correction factor used for Q values >= 20 + corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq); + corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high; + + // Try and pick a Q that can encode the content at the given rate. + for (Q = 0; Q < MAXQ; Q++) + { + int bits_per_mb_at_this_q; + + if (Q < 50) + { + correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01)); + correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor; + } + else + correction_factor = corr_high; + + bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) + break; + } + + return Q; +} + +// Estimate a worst case Q for a KF group +static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio) +{ + int Q; + int num_mbs = ((Height * Width) / (16 * 16)); + int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs; + int bits_per_mb_at_this_q; + + double err_per_mb = section_err / num_mbs; + double err_correction_factor; + double corr_high; + double speed_correction = 1.0; + double current_spend_ratio = 1.0; + + double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90; + double pow_lowq = (POW1 < 0.7) ? 
POW1 + 0.1 : 0.80; + + double iiratio_correction_factor = 1.0; + + double combined_correction_factor; + + // Trap special case where the target is <= 0 + if (target_norm_bits_per_mb <= 0) + return MAXQ * 2; + + // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits + // This is clamped to the range 0.1 to 10.0 + if (cpi->long_rolling_target_bits <= 0) + current_spend_ratio = 10.0; + else + { + current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; + } + + // Calculate a correction factor based on the quality of prediction in the sequence as indicated by intra_inter error score ratio (IIRatio) + // The idea here is to favour subsampling in the hardest sections vs the easyest. + iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1); + + if (iiratio_correction_factor < 0.5) + iiratio_correction_factor = 0.5; + + // Corrections for higher compression speed settings (reduced compression expected) + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) + { + if (cpi->oxcf.cpu_used <= 5) + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + else + speed_correction = 1.25; + } + + // Combine the various factors calculated above + combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio; + + // Correction factor used for Q values >= 20 + corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq); + corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high; + + // Try and pick a Q that should be high enough to encode the content at the given rate. + for (Q = 0; Q < MAXQ; Q++) + { + // Q values < 20 treated as a special case + if (Q < 20) + { + err_correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01)); + err_correction_factor = (err_correction_factor < 0.05) ? 0.05 : (err_correction_factor > 5.0) ? 
5.0 : err_correction_factor; + } + else + err_correction_factor = corr_high; + + bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * combined_correction_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q]); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) + break; + } + + // If we could not hit the target even at Max Q then estimate what Q would have bee required + while ((bits_per_mb_at_this_q > target_norm_bits_per_mb) && (Q < (MAXQ * 2))) + { + + bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q); + Q++; + } + + if (0) + { + FILE *f = fopen("estkf_q.stt", "a"); + fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q, + target_norm_bits_per_mb, err_per_mb, err_correction_factor, + current_spend_ratio, group_iiratio, iiratio_correction_factor, + (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q); + fclose(f); + } + + return Q; +} +extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate); + +void vp8_init_second_pass(VP8_COMP *cpi) +{ + FIRSTPASS_STATS this_frame; + FIRSTPASS_STATS *start_pos; + + double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + + vp8_zero_stats(&cpi->total_stats); + + if (!cpi->stats_in_end) + return; + + cpi->total_stats = *cpi->stats_in_end; + + cpi->total_error_left = cpi->total_stats.ssim_weighted_pred_err; + cpi->total_intra_error_left = cpi->total_stats.intra_error; + cpi->total_coded_error_left = cpi->total_stats.coded_error; + cpi->start_tot_err_left = cpi->total_error_left; + + //cpi->bits_left = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate)); + //cpi->bits_left -= (long long)(cpi->total_stats.count * two_pass_min_rate / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate)); + + // each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However the sum duration is not. + // Its calculated based on the actual durations of all frames from the first + // pass. + vp8_new_frame_rate(cpi, 10000000.0 * cpi->total_stats.count / cpi->total_stats.duration); + + cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->bits_left = (long long)(cpi->total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ; + cpi->bits_left -= (long long)(cpi->total_stats.duration * two_pass_min_rate / 10000000.0); + + vp8_avg_stats(&cpi->total_stats); + + // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence + { + double sum_iiratio = 0.0; + double IIRatio; + + start_pos = cpi->stats_in; // Note starting "file" position + + while (vp8_input_stats(cpi, &this_frame) != EOF) + { + IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); + IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 
20.0 : IIRatio; + sum_iiratio += IIRatio; + } + + cpi->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->total_stats.count); + + // Reset file position + reset_fpf_position(cpi, start_pos); + } + + // Scan the first pass file and calculate a modified total error based upon the bias/power function + // used to allocate bits + { + start_pos = cpi->stats_in; // Note starting "file" position + + cpi->modified_total_error_left = 0.0; + + while (vp8_input_stats(cpi, &this_frame) != EOF) + { + cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame); + } + + reset_fpf_position(cpi, start_pos); // Reset file position + + } + +#ifdef FIRSTPASS_MM + cpi->fp_motion_mapfile = 0; + cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "rb"); +#endif + +} + +void vp8_end_second_pass(VP8_COMP *cpi) +{ +#ifdef FIRSTPASS_MM + + if (cpi->fp_motion_mapfile) + fclose(cpi->fp_motion_mapfile); + +#endif +} + +// Analyse and define a gf/arf group . +static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) +{ + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS *start_pos; + int i; + int count = 0; + int image_size = cpi->common.last_frame.y_width * cpi->common.last_frame.y_height; + double boost_score = 0.0; + double old_boost_score = 0.0; + double gf_group_err = 0.0; + double gf_first_frame_err = 0.0; + double mod_frame_err = 0.0; + + double mv_accumulator_rabs = 0.0; + double mv_accumulator_cabs = 0.0; + double this_mv_rabs; + double this_mv_cabs; + double mv_ratio_accumulator = 0.0; + double distance_factor = 0.0; + double decay_accumulator = 1.0; + + double boost_factor = IIFACTOR; + double loop_decay_rate = 1.00; // Starting decay rate + + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + double mod_err_per_mb_accumulator = 0.0; + + int max_bits = frame_max_bits(cpi); // Max for a single frame + +#ifdef FIRSTPASS_MM + int fpmm_pos; +#endif + + cpi->gf_group_bits = 0; + cpi->gf_decay_rate = 0; + + vp8_clear_system_state(); //__asm emms; + +#ifdef FIRSTPASS_MM + fpmm_pos = vp8_fpmm_get_pos(cpi); +#endif + + start_pos = cpi->stats_in; + + // Preload the stats for the next frame. + mod_frame_err = calculate_modified_err(cpi, this_frame); + + // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf + gf_first_frame_err = mod_frame_err; + + // Special treatment if the current frame is a key frame (which is also a gf). + // If it is then its error score (and hence bit allocation) need to be subtracted out + // from the calculation for the GF group + if (cpi->common.frame_type == KEY_FRAME) + gf_group_err -= gf_first_frame_err; + + // Scan forward to try and work out how many frames the next gf group should contain and + // what level of boost is appropriate for the GF or ARF that will be coded with the group + i = 0; + + while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key)) + { + double r; + double motion_factor; + double this_frame_mvr_ratio; + double this_frame_mvc_ratio; + + i++; // Increment the loop counter + + // Accumulate error score of frames in this gf group + mod_frame_err = calculate_modified_err(cpi, this_frame); + + gf_group_err += mod_frame_err; + + mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs); + + if (EOF == vp8_input_stats(cpi, &next_frame)) + break; + + // Accumulate motion stats. 
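Editor's note: the loop continuing below accumulates a boost score for the candidate golden-frame group from each frame's intra/coded error ratio, damped by a decay factor that tracks prediction quality. The sketch below is not from the commit: it keeps only that core accumulation, omits the motion-magnitude and in/out-of-frame adjustments, and uses placeholder values for IIFACTOR and GF_RMAX (the real constants are defined earlier in this file) together with invented per-frame stats.

#include <stdio.h>

/* Hypothetical per-frame first-pass stats for the sketch. */
typedef struct
{
    double intra_error;
    double coded_error;
    double pcnt_inter;
    double pcnt_motion;
} SKETCH_STATS;

int main(void)
{
    /* Placeholder constants: the real IIFACTOR and GF_RMAX may differ. */
    const double iifactor = 1.4, gf_rmax = 9.0;

    SKETCH_STATS frames[5] =
    {
        {4000.0,  900.0, 0.95, 0.30},
        {4100.0, 1000.0, 0.93, 0.35},
        {3900.0, 1400.0, 0.88, 0.40},
        {4200.0, 2500.0, 0.70, 0.55},
        {4300.0, 3600.0, 0.55, 0.70},
    };

    double decay_accumulator = 1.0, boost_score = 0.0;

    for (int i = 0; i < 5; i++)
    {
        /* Underlying boost for this frame: intra vs inter error ratio. */
        double r = iifactor * frames[i].intra_error / frames[i].coded_error;
        double loop_decay_rate;

        if (r > gf_rmax)
            r = gf_rmax;

        /* Decay follows prediction quality: a low inter percentage or a
           lot of motion pulls down all later contributions. */
        loop_decay_rate = frames[i].pcnt_inter;
        if ((1.0 - frames[i].pcnt_motion / 10.0) < loop_decay_rate)
            loop_decay_rate = 1.0 - frames[i].pcnt_motion / 10.0;

        decay_accumulator *= loop_decay_rate;
        if (decay_accumulator < 0.1)
            decay_accumulator = 0.1;

        boost_score += decay_accumulator * r;
        printf("frame %d: r=%.2f decay=%.3f boost=%.2f\n",
               i, r, decay_accumulator, boost_score);
    }

    return 0;
}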
+ motion_factor = next_frame.pcnt_motion; + this_mv_rabs = fabs(next_frame.mvr_abs * motion_factor); + this_mv_cabs = fabs(next_frame.mvc_abs * motion_factor); + + mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_factor); + mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_factor); + + //Accumulate Motion In/Out of frame stats + this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion; + mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion; + abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + // If there is a significant amount of motion + if (motion_factor > 0.05) + { + this_frame_mvr_ratio = fabs(next_frame.mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVr)); + this_frame_mvc_ratio = fabs(next_frame.mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVc)); + + mv_ratio_accumulator += (this_frame_mvr_ratio < next_frame.mvr_abs) ? (this_frame_mvr_ratio * motion_factor) : next_frame.mvr_abs * motion_factor; + mv_ratio_accumulator += (this_frame_mvc_ratio < next_frame.mvc_abs) ? (this_frame_mvc_ratio * motion_factor) : next_frame.mvc_abs * motion_factor; + } + else + { + mv_ratio_accumulator += 0.0; + this_frame_mvr_ratio = 1.0; + this_frame_mvc_ratio = 1.0; + } + + // Underlying boost factor is based on inter intra error ratio + r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error))); + + // Increase boost for frames where new data coming into frame (eg zoom out) + // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in) + // The range for this_frame_mv_in_out is -1.0 to +1.0 + if (this_frame_mv_in_out > 0.0) + r += r * (this_frame_mv_in_out * 2.0); + else + r += r * (this_frame_mv_in_out / 2.0); // In extreme case boost is halved + + if (r > GF_RMAX) + r = GF_RMAX; + + // Adjust loop decay rate + //if ( next_frame.pcnt_inter < loop_decay_rate ) + loop_decay_rate = next_frame.pcnt_inter; + + // High % motion -> somewhat higher decay rate + if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate) + loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0)); + + distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + (this_mv_cabs * this_mv_cabs)) / 300.0; + distance_factor = ((distance_factor > 1.0) ? 0.0 : (1.0 - distance_factor)); + + if (distance_factor < loop_decay_rate) + loop_decay_rate = distance_factor; + + // Cumulative effect of decay + decay_accumulator = decay_accumulator * loop_decay_rate; + decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; + //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator; + + boost_score += (decay_accumulator * r); + + // Break out conditions. + if ( /* i>4 || */ + ( + (i > MIN_GF_INTERVAL) && // Dont break out with a very short interval + ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) && // Dont break out very close to a key frame + ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0) + ) + ) + ) + { + boost_score = old_boost_score; + break; + } + + vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame)); + + old_boost_score = boost_score; + } + + cpi->gf_decay_rate = (i > 0) ? 
(int)(100.0 * (1.0 - decay_accumulator)) / i : 0; + + // When using CBR apply additional buffer related upper limits + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + double max_boost; + + // For cbr apply buffer related limits + if (cpi->drop_frames_allowed) + { + int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100); + + if (cpi->buffer_level > df_buffer_level) + max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + else + max_boost = 0.0; + } + else if (cpi->buffer_level > 0) + { + max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } + else + { + max_boost = 0.0; + } + + if (boost_score > max_boost) + boost_score = max_boost; + } + + cpi->gfu_boost = (int)(boost_score * 100.0) >> 4; + + // Should we use the alternate refernce frame + if (cpi->oxcf.play_alternate && + (i >= MIN_GF_INTERVAL) && + (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) && // dont use ARF very near next kf + (((next_frame.pcnt_inter > 0.75) && + ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && + //(cpi->gfu_boost>150) && + (cpi->gfu_boost > 100) && + //(cpi->gfu_boost>AF_THRESH2) && + //((cpi->gfu_boost/i)>AF_THRESH) && + //(decay_accumulator > 0.5) && + (cpi->gf_decay_rate <= (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))) + ) + ) + ) + { + int Boost; + int allocation_chunks; + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int tmp_q; + int arf_frame_bits = 0; + int group_bits; + + // Estimate the bits to be allocated to the group as a whole + if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0)) + group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left)); + else + group_bits = 0; + + // Boost for arf frame + Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); + Boost += (cpi->baseline_gf_interval * 50); + allocation_chunks = (i * 100) + Boost; + + // Normalize Altboost and allocations chunck down to prevent overflow + while (Boost > 1000) + { + Boost /= 2; + allocation_chunks /= 2; + } + + // Calculate the number of bits to be spent on the arf based on the boost number + arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks)); + + // Estimate if there are enough bits available to make worthwhile use of an arf. + tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width); + + // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames. + if (tmp_q < cpi->worst_quality) + { + cpi->source_alt_ref_pending = TRUE; + + // For alt ref frames the error score for the end frame of the group (the alt ref frame) should not contribute to the group total and hence + // the number of bit allocated to the group. Rather it forms part of the next group (it is the GF at the start of the next group) + gf_group_err -= mod_frame_err; + + // Set the interval till the next gf or arf. For ARFs this is the number of frames to be coded before the future frame that is coded as an ARF. 
+ // The future frame itself is part of the next group + cpi->baseline_gf_interval = i - 1; + +#ifdef FIRSTPASS_MM + // Read through the motion map to load up the entry for the ARF + { + int j; + + // Advance to the region of interest + // Current default 2 frames before to 2 frames after the ARF frame itsef + vp8_fpmm_reset_pos(cpi, cpi->fpmm_pos); + + for (j = 0; j < cpi->baseline_gf_interval - 2; j++) + vp8_advance_fpmm(cpi, 1); + + // Read / create a motion map for the region of interest + vp8_input_fpmm(cpi, 5); + } +#endif + } + else + { + cpi->source_alt_ref_pending = FALSE; + cpi->baseline_gf_interval = i; + } + } + else + { + cpi->source_alt_ref_pending = FALSE; + cpi->baseline_gf_interval = i; + } + + // Conventional GF + if (!cpi->source_alt_ref_pending) + { + // Dont allow conventional gf too near the next kf + if ((cpi->frames_to_key - cpi->baseline_gf_interval) < MIN_GF_INTERVAL) + { + while (cpi->baseline_gf_interval < cpi->frames_to_key) + { + if (EOF == vp8_input_stats(cpi, this_frame)) + break; + + cpi->baseline_gf_interval++; + + if (cpi->baseline_gf_interval < cpi->frames_to_key) + gf_group_err += calculate_modified_err(cpi, this_frame); + } + } + } + + // Now decide how many bits should be allocated to the GF group as a proportion of those remaining in the kf group. + // The final key frame group in the clip is treated as a special case where cpi->kf_group_bits is tied to cpi->bits_left. + // This is also important for short clips where there may only be one key frame. + if (cpi->frames_to_key >= (int)(cpi->total_stats.count - cpi->common.current_video_frame)) + { + cpi->kf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0; + } + + // Calculate the bits to be allocated to the group as a whole + if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0)) + cpi->gf_group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left)); + else + cpi->gf_group_bits = 0; + + cpi->gf_group_bits = (cpi->gf_group_bits < 0) ? 0 : (cpi->gf_group_bits > cpi->kf_group_bits) ? cpi->kf_group_bits : cpi->gf_group_bits; + + // Clip cpi->gf_group_bits based on user supplied data rate variability limit (cpi->oxcf.two_pass_vbrmax_section) + if (cpi->gf_group_bits > max_bits * cpi->baseline_gf_interval) + cpi->gf_group_bits = max_bits * cpi->baseline_gf_interval; + + // Reset the file position + reset_fpf_position(cpi, start_pos); + + // Assign bits to the arf or gf. + { + int Boost; + int frames_in_section; + int allocation_chunks; + int Q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + + // For ARF frames + if (cpi->source_alt_ref_pending) + { + Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); + //Boost += (cpi->baseline_gf_interval * 25); + Boost += (cpi->baseline_gf_interval * 50); + + // Set max and minimum boost and hence minimum allocation + if (Boost > ((cpi->baseline_gf_interval + 1) * 200)) + Boost = ((cpi->baseline_gf_interval + 1) * 200); + else if (Boost < 125) + Boost = 125; + + frames_in_section = cpi->baseline_gf_interval + 1; + allocation_chunks = (frames_in_section * 100) + Boost; + } + // Else for standard golden frames + else + { + // boost based on inter / intra ratio of subsequent frames + Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100; + + // Set max and minimum boost and hence minimum allocation + if (Boost > (cpi->baseline_gf_interval * 150)) + Boost = (cpi->baseline_gf_interval * 150); + else if (Boost < 125) + Boost = 125; + + frames_in_section = cpi->baseline_gf_interval; + allocation_chunks = (frames_in_section * 100) + (Boost - 100); + } + + // Normalize Altboost and allocations chunck down to prevent overflow + while (Boost > 1000) + { + Boost /= 2; + allocation_chunks /= 2; + } + + // Calculate the number of bits to be spent on the gf or arf based on the boost number + cpi->gf_bits = (int)((double)Boost * (cpi->gf_group_bits / (double)allocation_chunks)); + + // If the frame that is to be boosted is simpler than the average for the gf/arf group then use an alternative calculation + // based on the error score of the frame itself + if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) + { + double alt_gf_grp_bits; + int alt_gf_bits; + + alt_gf_grp_bits = ((double)cpi->kf_group_bits * (mod_frame_err * (double)cpi->baseline_gf_interval) / (double)cpi->kf_group_error_left) ; + alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits / (double)allocation_chunks)); + + if (cpi->gf_bits > alt_gf_bits) + { + cpi->gf_bits = alt_gf_bits; + } + } + // Else if it is harder than other frames in the group make sure it at least receives an allocation in keeping with + // its relative error score, otherwise it may be worse off than an "un-boosted" frame + else + { + int alt_gf_bits = (int)((double)cpi->kf_group_bits * (mod_frame_err / (double)cpi->kf_group_error_left)); + + if (alt_gf_bits > cpi->gf_bits) + { + cpi->gf_bits = alt_gf_bits; + } + } + + // Apply an additional limit for CBR + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + if (cpi->gf_bits > (cpi->buffer_level >> 1)) + cpi->gf_bits = cpi->buffer_level >> 1; + } + + // Dont allow a negative value for gf_bits + if (cpi->gf_bits < 0) + cpi->gf_bits = 0; + + // Adjust KF group bits and error remainin + cpi->kf_group_error_left -= gf_group_err; + cpi->kf_group_bits -= cpi->gf_group_bits; + + if (cpi->kf_group_bits < 0) + cpi->kf_group_bits = 0; + + // Note the error score left in the remaining frames of the group. 
+ // For normal GFs we want to remove the error score for the first frame of the group (except in Key frame case where this has already happened) + if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME) + cpi->gf_group_error_left = gf_group_err - gf_first_frame_err; + else + cpi->gf_group_error_left = gf_group_err; + + cpi->gf_group_bits -= cpi->gf_bits; + + if (cpi->gf_group_bits < 0) + cpi->gf_group_bits = 0; + + // Set aside some bits for a mid gf sequence boost + if ((cpi->gfu_boost > 150) && (cpi->baseline_gf_interval > 5)) + { + int pct_extra = (cpi->gfu_boost - 100) / 50; + pct_extra = (pct_extra > 10) ? 10 : pct_extra; + + cpi->mid_gf_extra_bits = (cpi->gf_group_bits * pct_extra) / 100; + cpi->gf_group_bits -= cpi->mid_gf_extra_bits; + } + else + cpi->mid_gf_extra_bits = 0; + + cpi->gf_bits += cpi->min_frame_bandwidth; // Add in minimum for a frame + } + + if (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) // Normal GF and not a KF + { + cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for this frame + } + + // Adjustment to estimate_max_q based on a measure of complexity of the section + if (cpi->common.frame_type != KEY_FRAME) + { + FIRSTPASS_STATS sectionstats; + double Ratio; + + vp8_zero_stats(§ionstats); + reset_fpf_position(cpi, start_pos); + + for (i = 0 ; i < cpi->baseline_gf_interval ; i++) + { + vp8_input_stats(cpi, &next_frame); + vp8_accumulate_stats(§ionstats, &next_frame); + } + + vp8_avg_stats(§ionstats); + + if (sectionstats.pcnt_motion < .17) + cpi->section_is_low_motion = 1; + else + cpi->section_is_low_motion = 0; + + if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45) + cpi->section_is_fast_motion = 1; + else + cpi->section_is_fast_motion = 0; + + cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + + Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) ) + //{ + cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025); + + if (cpi->section_max_qfactor < 0.80) + cpi->section_max_qfactor = 0.80; + + //} + //else + // cpi->section_max_qfactor = 1.0; + + reset_fpf_position(cpi, start_pos); + } + +#ifdef FIRSTPASS_MM + // Reset the First pass motion map file position + vp8_fpmm_reset_pos(cpi, fpmm_pos); +#endif +} + +// Allocate bits to a normal frame that is neither a gf an arf or a key frame. +static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) +{ + int target_frame_size; // gf_group_error_left + + double modified_err; + double err_fraction; // What portion of the remaining GF group error is used by this frame + + int max_bits = frame_max_bits(cpi); // Max for a single frame + + // The final few frames have special treatment + if (cpi->frames_till_gf_update_due >= (int)(cpi->total_stats.count - cpi->common.current_video_frame)) + { + cpi->gf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;; + } + + // Calculate modified prediction error used in bit allocation + modified_err = calculate_modified_err(cpi, this_frame); + + if (cpi->gf_group_error_left > 0) + err_fraction = modified_err / cpi->gf_group_error_left; // What portion of the remaining GF group error is used by this frame + else + err_fraction = 0.0; + + target_frame_size = (int)((double)cpi->gf_group_bits * err_fraction); // How many of those bits available for allocation should we give it? 
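Editor's note: the allocation rule that starts at the line above and is completed just below amounts to a proportional share of the remaining group bits, clipped and topped up with the per-frame minimum. This condensed sketch is not part of the commit; it leaves out the mid-group bonus and the running-total updates, and all numbers in main() are hypothetical.

#include <stdio.h>

/* Condensed restatement of assign_std_frame_bits() for one frame.
   All inputs are hypothetical; the real values live in VP8_COMP. */
static int std_frame_bits(int gf_group_bits, double modified_err,
                          double gf_group_error_left, int max_bits,
                          int min_frame_bandwidth)
{
    double err_fraction = (gf_group_error_left > 0.0)
                              ? modified_err / gf_group_error_left
                              : 0.0;
    int target = (int)((double)gf_group_bits * err_fraction);

    /* Clip to 0 .. min(max_bits, gf_group_bits). */
    if (target < 0)
        target = 0;
    if (target > max_bits)
        target = max_bits;
    if (target > gf_group_bits)
        target = gf_group_bits;

    /* Every frame also gets the reserved minimum. */
    return target + min_frame_bandwidth;
}

int main(void)
{
    printf("per-frame target = %d bits\n",
           std_frame_bits(400000, 120.0, 1500.0, 60000, 2000));
    return 0;
}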
+ + // Clip to target size to 0 - max_bits (or cpi->gf_group_bits) at the top end. + if (target_frame_size < 0) + target_frame_size = 0; + else + { + if (target_frame_size > max_bits) + target_frame_size = max_bits; + + if (target_frame_size > cpi->gf_group_bits) + target_frame_size = cpi->gf_group_bits; + } + + cpi->gf_group_error_left -= modified_err; // Adjust error remaining + cpi->gf_group_bits -= target_frame_size; // Adjust bits remaining + + if (cpi->gf_group_bits < 0) + cpi->gf_group_bits = 0; + + target_frame_size += cpi->min_frame_bandwidth; // Add in the minimum number of bits that is set aside for every frame. + + // Special case for the frame that lies half way between two gfs + if (cpi->common.frames_since_golden == cpi->baseline_gf_interval / 2) + target_frame_size += cpi->mid_gf_extra_bits; + + cpi->per_frame_bandwidth = target_frame_size; // Per frame bit target for this frame +} + +void vp8_second_pass(VP8_COMP *cpi) +{ + int tmp_q; + int frames_left = (int)(cpi->total_stats.count - cpi->common.current_video_frame); + + FIRSTPASS_STATS this_frame; + FIRSTPASS_STATS this_frame_copy; + + VP8_COMMON *cm = &cpi->common; + + double this_frame_error; + double this_frame_intra_error; + double this_frame_coded_error; + + FIRSTPASS_STATS *start_pos; + + if (!cpi->stats_in) + { + return ; + } + + vp8_clear_system_state(); + + if (EOF == vp8_input_stats(cpi, &this_frame)) + return; + +#ifdef FIRSTPASS_MM + vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs); + cpi->fpmm_pos = vp8_fpmm_get_pos(cpi); + vp8_advance_fpmm(cpi, 1); // Read this frame's first pass motion map +#endif + + this_frame_error = this_frame.ssim_weighted_pred_err; + this_frame_intra_error = this_frame.intra_error; + this_frame_coded_error = this_frame.coded_error; + + // Store information regarding level of motion etc for use mode decisions. + cpi->motion_speed = (int)(fabs(this_frame.MVr) + fabs(this_frame.MVc)); + cpi->motion_var = (int)(fabs(this_frame.MVrv) + fabs(this_frame.MVcv)); + cpi->inter_lvl = (int)(this_frame.pcnt_inter * 100); + cpi->intra_lvl = (int)((1.0 - this_frame.pcnt_inter) * 100); + cpi->motion_lvl = (int)(this_frame.pcnt_motion * 100); + + start_pos = cpi->stats_in; + + // keyframe and section processing ! + if (cpi->frames_to_key == 0) + { + // Define next KF group and assign bits to it + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + vp8_find_next_key_frame(cpi, &this_frame_copy); + + // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop + // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups. + // This is temporary code till we decide what should really happen in this case. + if (cpi->oxcf.error_resilient_mode) + { + cpi->gf_group_bits = cpi->kf_group_bits; + cpi->gf_group_error_left = cpi->kf_group_error_left; + cpi->baseline_gf_interval = cpi->frames_to_key; + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + cpi->source_alt_ref_pending = FALSE; + } + + } + + // Is this a GF / ARF (Note that a KF is always also a GF) + if (cpi->frames_till_gf_update_due == 0) + { + // Define next gf group and assign bits to it + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + define_gf_group(cpi, &this_frame_copy); + + // If we are going to code an altref frame at the end of the group and the current frame is not a key frame.... 
+ // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits + // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well + if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) + { + // Assign a standard frames worth of bits from those allocated to the GF group + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + + // If appropriate (we are switching into ARF active but it was not previously active) apply a boost for the gf at the start of the group. + //if ( !cpi->source_alt_ref_active && (cpi->gfu_boost > 150) ) + if (FALSE) + { + int extra_bits; + int pct_extra = (cpi->gfu_boost - 100) / 50; + + pct_extra = (pct_extra > 20) ? 20 : pct_extra; + + extra_bits = (cpi->gf_group_bits * pct_extra) / 100; + cpi->gf_group_bits -= extra_bits; + cpi->per_frame_bandwidth += extra_bits; + } + } + } + + // Otherwise this is an ordinary frame + else + { + // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop + // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups. + // This is temporary code till we decide what should really happen in this case. + if (cpi->oxcf.error_resilient_mode) + { + cpi->frames_till_gf_update_due = cpi->frames_to_key; + + if (cpi->common.frame_type != KEY_FRAME) + { + // Assign bits from those allocated to the GF group + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + } + } + else + { + // Assign bits from those allocated to the GF group + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + } + } + + // Set nominal per second bandwidth for this frame + cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate; + if (cpi->target_bandwidth < 0) + cpi->target_bandwidth = 0; + + if (cpi->common.current_video_frame == 0) + { + // guess at 2nd pass q + cpi->est_max_qcorrection_factor = 1.0; + tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width); + + if (tmp_q < cpi->worst_quality) + { + cpi->active_worst_quality = tmp_q; + cpi->ni_av_qi = tmp_q; + } + else + { + cpi->active_worst_quality = cpi->worst_quality; + cpi->ni_av_qi = cpi->worst_quality; + } + } + else + { + if (frames_left < 1) + frames_left = 1; + + tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width); + + // Move active_worst_quality but in a damped way + if (tmp_q > cpi->active_worst_quality) + cpi->active_worst_quality ++; + else if (tmp_q < cpi->active_worst_quality) + cpi->active_worst_quality --; + + cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4; + + // Clamp to user set limits + if (cpi->active_worst_quality > cpi->worst_quality) + cpi->active_worst_quality = cpi->worst_quality; + else if (cpi->active_worst_quality < cpi->best_quality) + cpi->active_worst_quality = cpi->best_quality; + + } + + cpi->frames_to_key --; + cpi->total_error_left -= this_frame_error; + cpi->total_intra_error_left -= this_frame_intra_error; + cpi->total_coded_error_left -= this_frame_coded_error; +} + + +static BOOL test_candidate_kf(VP8_COMP *cpi, 
FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) +{ + BOOL is_viable_kf = FALSE; + + // Does the frame satisfy the primary criteria of a key frame + // If so, then examine how well it predicts subsequent frames + if ((this_frame->pcnt_second_ref < 0.10) && + (next_frame->pcnt_second_ref < 0.10) && + ((this_frame->pcnt_inter < 0.05) || + ( + (this_frame->pcnt_inter < .25) && + ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && + ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) || + (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) || + ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5) + ) + ) + ) + ) + { + int i; + FIRSTPASS_STATS *start_pos; + + FIRSTPASS_STATS local_next_frame; + + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + double next_iiratio; + + vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame)); + + // Note the starting file position so we can reset to it + start_pos = cpi->stats_in; + + // Examine how well the key frame predicts subsequent frames + for (i = 0 ; i < 16; i++) + { + next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ; + + if (next_iiratio > RMAX) + next_iiratio = RMAX; + + // Cumulative effect of decay in prediction quality + if (local_next_frame.pcnt_inter > 0.85) + decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; + else + decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0); + + //decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; + + // Keep a running total + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses + if ((local_next_frame.pcnt_inter < 0.05) || + (next_iiratio < 1.5) || + ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 0.5) || + (local_next_frame.intra_error < 200) + ) + { + break; + } + + old_boost_score = boost_score; + + // Get the next frame details + if (EOF == vp8_input_stats(cpi, &local_next_frame)) + break; + } + + // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on + if (boost_score > 5.0 && (i > 3)) + is_viable_kf = TRUE; + else + { + // Reset the file position + reset_fpf_position(cpi, start_pos); + + is_viable_kf = FALSE; + } + } + + return is_viable_kf; +} +void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) +{ + int i; + FIRSTPASS_STATS last_frame; + FIRSTPASS_STATS first_frame; + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS *start_position; + + double decay_accumulator = 0; + double boost_score = 0; + double old_boost_score = 0.0; + double loop_decay_rate; + + double kf_mod_err = 0.0; + double kf_group_err = 0.0; + double kf_group_intra_err = 0.0; + double kf_group_coded_err = 0.0; + double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + + vp8_clear_system_state(); //__asm emms; + start_position = cpi->stats_in; + + cpi->common.frame_type = KEY_FRAME; + + // Clear the alt ref active flag as this can never be active on a key frame + cpi->source_alt_ref_active = FALSE; + + // Kf is always a gf so clear frames till next gf counter + cpi->frames_till_gf_update_due = 0; + + cpi->frames_to_key 
= 1; + + // Take a copy of the initial frame details + vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame)); + + cpi->kf_group_bits = 0; // Estimate of total bits avaialable to kf group + cpi->kf_group_error_left = 0; // Group modified error score. + + kf_mod_err = calculate_modified_err(cpi, this_frame); + + // find the next keyframe + while (cpi->stats_in < cpi->stats_in_end) + { + // Accumulate kf group error + kf_group_err += calculate_modified_err(cpi, this_frame); + + // These figures keep intra and coded error counts for all frames including key frames in the group. + // The effect of the key frame itself can be subtracted out using the first_frame data collected above + kf_group_intra_err += this_frame->intra_error; + kf_group_coded_err += this_frame->coded_error; + + vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame)); + + // Provided that we are not at the end of the file... + if (EOF != vp8_input_stats(cpi, this_frame)) + { + if (lookup_next_frame_stats(cpi, &next_frame) != EOF) + { + if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) + break; + } + } + + // Step on to the next frame + cpi->frames_to_key ++; + + // If we don't have a real key frame within the next two + // forcekeyframeevery intervals then break out of the loop. + if (cpi->frames_to_key >= 2 *(int)cpi->key_frame_frequency) + break; + + } + + // If there is a max kf interval set by the user we must obey it. + // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural + // interval is between 1x and 2x + if ( cpi->frames_to_key > (int)cpi->key_frame_frequency ) + { + cpi->frames_to_key /= 2; + + // Estimate corrected kf group error + kf_group_err /= 2.0; + kf_group_intra_err /= 2.0; + kf_group_coded_err /= 2.0; + } + + // Special case for the last frame of the file + if (cpi->stats_in >= cpi->stats_in_end) + { + // Accumulate kf group error + kf_group_err += calculate_modified_err(cpi, this_frame); + + // These figures keep intra and coded error counts for all frames including key frames in the group. + // The effect of the key frame itself can be subtracted out using the first_frame data collected above + kf_group_intra_err += this_frame->intra_error; + kf_group_coded_err += this_frame->coded_error; + } + + // Calculate the number of bits that should be assigned to the kf group. + if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0)) + { + int max_bits = frame_max_bits(cpi); // Max for a single normal frame (not key frame) + + // Default allocation based on bits left and relative complexity of the section + cpi->kf_group_bits = (int)(cpi->bits_left * (kf_group_err / cpi->modified_total_error_left)); + + // Clip based on maximum per frame rate defined by the user. + if (cpi->kf_group_bits > max_bits * cpi->frames_to_key) + cpi->kf_group_bits = max_bits * cpi->frames_to_key; + + // Additional special case for CBR if buffer is getting full. + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + // If the buffer is near or above the optimal and this kf group is not being allocated much + // then increase the allocation a bit. + if (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) + { + int high_water_mark = (cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1; + int min_group_bits; + + // We are at or above the maximum. 
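Editor's note: a condensed restatement (not part of the commit) of the CBR buffer-fullness adjustment applied at this point: at or above the high water mark the key frame group is guaranteed at least the average spend plus the surplus, and between the optimal level and the mark the shortfall is closed in proportion to buffer fullness. All input values below are hypothetical.

#include <stdio.h>

static int adjust_kf_group_bits(int kf_group_bits, int buffer_level,
                                int optimal_buffer_level, int maximum_buffer_size,
                                int av_per_frame_bandwidth, int frames_to_key)
{
    if (buffer_level >= optimal_buffer_level)
    {
        int high_water_mark = (optimal_buffer_level + maximum_buffer_size) >> 1;
        int group_av_bits = av_per_frame_bandwidth * frames_to_key;

        if (buffer_level >= high_water_mark)
        {
            /* At or above the high water mark: guarantee at least the
               average spend plus the excess above the mark. */
            int min_group_bits = group_av_bits + (buffer_level - high_water_mark);

            if (kf_group_bits < min_group_bits)
                kf_group_bits = min_group_bits;
        }
        else if (kf_group_bits < group_av_bits)
        {
            /* Between optimal and the high water mark: close part of the
               gap, in proportion to how full the buffer is. */
            int bits_below_av = group_av_bits - kf_group_bits;

            kf_group_bits += (int)((double)bits_below_av *
                                   (double)(buffer_level - optimal_buffer_level) /
                                   (double)(high_water_mark - optimal_buffer_level));
        }
    }

    return kf_group_bits;
}

int main(void)
{
    printf("adjusted kf_group_bits = %d\n",
           adjust_kf_group_bits(900000, 700000, 600000, 1000000, 20000, 60));
    return 0;
}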
+ if (cpi->buffer_level >= high_water_mark) + { + min_group_bits = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) + (cpi->buffer_level - high_water_mark); + + if (cpi->kf_group_bits < min_group_bits) + cpi->kf_group_bits = min_group_bits; + } + // We are above optimal but below the maximum + else if (cpi->kf_group_bits < (cpi->av_per_frame_bandwidth * cpi->frames_to_key)) + { + int bits_below_av = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) - cpi->kf_group_bits; + cpi->kf_group_bits += (int)((double)bits_below_av * (double)(cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / + (double)(high_water_mark - cpi->oxcf.optimal_buffer_level)); + } + } + } + } + else + cpi->kf_group_bits = 0; + + // Reset the first pass file position + reset_fpf_position(cpi, start_position); + + // determine how big to make this keyframe based on how well the subsequent frames use inter blocks + decay_accumulator = 1.0; + boost_score = 0.0; + loop_decay_rate = 1.00; // Starting decay rate + + for (i = 0 ; i < cpi->frames_to_key ; i++) + { + double r; + + if (EOF == vp8_input_stats(cpi, &next_frame)) + break; + + r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)) ; + + if (r > RMAX) + r = RMAX; + + // Adjust loop decay rate + //if ( next_frame.pcnt_inter < loop_decay_rate ) + loop_decay_rate = next_frame.pcnt_inter; + + if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate) + loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0)); + + decay_accumulator = decay_accumulator * loop_decay_rate; + + boost_score += (decay_accumulator * r); + + if ((i > MIN_GF_INTERVAL) && + ((boost_score - old_boost_score) < 1.0)) + { + break; + } + + old_boost_score = boost_score; + } + + if (1) + { + FIRSTPASS_STATS sectionstats; + double Ratio; + + vp8_zero_stats(§ionstats); + reset_fpf_position(cpi, start_position); + + for (i = 0 ; i < cpi->frames_to_key ; i++) + { + vp8_input_stats(cpi, &next_frame); + vp8_accumulate_stats(§ionstats, &next_frame); + } + + vp8_avg_stats(§ionstats); + + if (sectionstats.pcnt_motion < .17) + cpi->section_is_low_motion = 1; + else + cpi->section_is_low_motion = 0; + + if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45) + cpi->section_is_fast_motion = 1; + else + cpi->section_is_fast_motion = 0; + + cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + + Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) ) + //{ + cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025); + + if (cpi->section_max_qfactor < 0.80) + cpi->section_max_qfactor = 0.80; + + //} + //else + // cpi->section_max_qfactor = 1.0; + } + + // When using CBR apply additional buffer fullness related upper limits + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + double max_boost; + + if (cpi->drop_frames_allowed) + { + int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100); + + if (cpi->buffer_level > df_buffer_level) + max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + else + max_boost = 0.0; + } + else if (cpi->buffer_level > 0) + { + max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } + else + { + max_boost = 0.0; + } + + if (boost_score > max_boost) + boost_score = max_boost; + } + + // Reset the first pass file position + 
reset_fpf_position(cpi, start_position); + + // Work out how many bits to allocate for the key frame itself + if (1) + { + int kf_boost = boost_score; + int allocation_chunks; + int Counter = cpi->frames_to_key; + int alt_kf_bits; + + // Min boost based on kf interval +#if 0 + + while ((kf_boost < 48) && (Counter > 0)) + { + Counter -= 2; + kf_boost ++; + } + +#endif + + if (kf_boost < 48) + { + kf_boost += ((Counter + 1) >> 1); + + if (kf_boost > 48) kf_boost = 48; + } + + // bigger frame sizes need larger kf boosts, smaller frames smaller boosts... + if ((cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) > (320 * 240)) + kf_boost += 2 * (cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) / (320 * 240); + else if ((cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) < (320 * 240)) + kf_boost -= 4 * (320 * 240) / (cpi->common.last_frame.y_width * cpi->common.last_frame.y_height); + + kf_boost = (int)((double)kf_boost * 100.0) >> 4; // Scale 16 to 100 + + // Adjustment to boost based on recent average q + kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100; + + if (kf_boost < 250) // Min KF boost + kf_boost = 250; + + // We do three calculations for kf size. + // The first is based on the error score for the whole kf group. + // The second (optionaly) on the key frames own error if this is smaller than the average for the group. + // The final one insures that the frame receives at least the allocation it would have received based on its own error score vs the error score remaining + + allocation_chunks = ((cpi->frames_to_key - 1) * 100) + kf_boost; // cpi->frames_to_key-1 because key frame itself is taken care of by kf_boost + + // Normalize Altboost and allocations chunck down to prevent overflow + while (kf_boost > 1000) + { + kf_boost /= 2; + allocation_chunks /= 2; + } + + cpi->kf_group_bits = (cpi->kf_group_bits < 0) ? 0 : cpi->kf_group_bits; + + // Calculate the number of bits to be spent on the key frame + cpi->kf_bits = (int)((double)kf_boost * ((double)cpi->kf_group_bits / (double)allocation_chunks)); + + // Apply an additional limit for CBR + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + if (cpi->kf_bits > ((3 * cpi->buffer_level) >> 2)) + cpi->kf_bits = (3 * cpi->buffer_level) >> 2; + } + + // If the key frame is actually easier than the average for the kf group (which does sometimes happen... eg a blank intro frame) + // Then use an alternate calculation based on the kf error score which should give a smaller key frame. 
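Editor's note: the key-frame share worked out above treats the group as (frames_to_key - 1) ordinary frames worth 100 chunks each plus a key frame worth kf_boost chunks. The sketch below is not from the commit; it keeps only that share-out and the overflow normalisation, leaving out the CBR cap and the easy-key-frame alternate calculation that follow. Inputs are hypothetical.

#include <stdio.h>

static int kf_bits_from_boost(int kf_boost, int frames_to_key, int kf_group_bits)
{
    /* frames_to_key - 1 because the key frame itself is covered by kf_boost. */
    int allocation_chunks = ((frames_to_key - 1) * 100) + kf_boost;

    /* Keep the intermediate values small enough to avoid overflow. */
    while (kf_boost > 1000)
    {
        kf_boost /= 2;
        allocation_chunks /= 2;
    }

    return (int)((double)kf_boost *
                 ((double)kf_group_bits / (double)allocation_chunks));
}

int main(void)
{
    /* e.g. a 60-frame group with a boost of 400: the key frame receives
       400 / (59 * 100 + 400) of the group budget. */
    printf("kf_bits = %d\n", kf_bits_from_boost(400, 60, 1200000));
    return 0;
}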
+ if (kf_mod_err < kf_group_err / cpi->frames_to_key) + { + double alt_kf_grp_bits = ((double)cpi->bits_left * (kf_mod_err * (double)cpi->frames_to_key) / cpi->modified_total_error_left) ; + + alt_kf_bits = (int)((double)kf_boost * (alt_kf_grp_bits / (double)allocation_chunks)); + + if (cpi->kf_bits > alt_kf_bits) + { + cpi->kf_bits = alt_kf_bits; + } + } + // Else if it is much harder than other frames in the group make sure it at least receives an allocation in keeping with its relative error score + else + { + alt_kf_bits = (int)((double)cpi->bits_left * (kf_mod_err / cpi->modified_total_error_left)); + + if (alt_kf_bits > cpi->kf_bits) + { + cpi->kf_bits = alt_kf_bits; + } + } + + cpi->kf_group_bits -= cpi->kf_bits; + cpi->kf_bits += cpi->min_frame_bandwidth; // Add in the minimum frame allowance + + cpi->per_frame_bandwidth = cpi->kf_bits; // Peer frame bit target for this frame + cpi->target_bandwidth = cpi->kf_bits * cpi->output_frame_rate; // Convert to a per second bitrate + } + + // Note the total error score of the kf group minus the key frame itself + cpi->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame sizes + cpi->modified_total_error_left -= kf_group_err; + + if (cpi->oxcf.allow_spatial_resampling) + { + int resample_trigger = FALSE; + int last_kf_resampled = FALSE; + int kf_q; + int scale_val = 0; + int hr, hs, vr, vs; + int new_width = cpi->oxcf.Width; + int new_height = cpi->oxcf.Height; + + int projected_buffer_level = cpi->buffer_level; + int tmp_q; + + double projected_bits_perframe; + double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error); + double err_per_frame = kf_group_err / cpi->frames_to_key; + double bits_per_frame; + double av_bits_per_frame; + double effective_size_ratio; + + if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height)) + last_kf_resampled = TRUE; + + // Set back to unscaled by defaults + cpi->common.horiz_scale = NORMAL; + cpi->common.vert_scale = NORMAL; + + // Calculate Average bits per frame. + //av_bits_per_frame = cpi->bits_left/(double)(cpi->total_stats.count - cpi->common.current_video_frame); + av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate); + //if ( av_bits_per_frame < 0.0 ) + // av_bits_per_frame = 0.0 + + // CBR... Use the clip average as the target for deciding resample + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + bits_per_frame = av_bits_per_frame; + } + + // In VBR we want to avoid downsampling in easy section unless we are under extreme pressure + // So use the larger of target bitrate for this sectoion or average bitrate for sequence + else + { + bits_per_frame = cpi->kf_group_bits / cpi->frames_to_key; // This accounts for how hard the section is... 
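Editor's note: the per-frame bit target used for the resampling decision is picked differently for CBR and VBR, and the minimum-rate floor is applied just below. This sketch is not part of the commit; the cbr_mode flag stands in for the end_usage check and all inputs are hypothetical.

#include <stdio.h>

static double resample_target_bits_per_frame(int cbr_mode,
                                             double target_bandwidth,
                                             double frame_rate,
                                             double kf_group_bits,
                                             double frames_to_key,
                                             double vbrmin_section_pct)
{
    double av_bits_per_frame = target_bandwidth / frame_rate;
    double bits_per_frame;

    if (cbr_mode)
    {
        /* Streaming: judge against the clip average. */
        bits_per_frame = av_bits_per_frame;
    }
    else
    {
        /* VBR: judge against this section's own budget, but never treat an
           easy (lightly funded) section as a reason to downsample. */
        bits_per_frame = kf_group_bits / frames_to_key;

        if (bits_per_frame < av_bits_per_frame)
            bits_per_frame = av_bits_per_frame;
    }

    /* Respect the two-pass minimum section rate. */
    if (bits_per_frame < target_bandwidth * vbrmin_section_pct / 100.0)
        bits_per_frame = target_bandwidth * vbrmin_section_pct / 100.0;

    return bits_per_frame;
}

int main(void)
{
    printf("VBR target: %.0f bits/frame\n",
           resample_target_bits_per_frame(0, 500000.0, 30.0, 400000.0, 60.0, 2.0));
    return 0;
}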
+ + if (bits_per_frame < av_bits_per_frame) // Dont turn to resampling in easy sections just because they have been assigned a small number of bits + bits_per_frame = av_bits_per_frame; + } + + // bits_per_frame should comply with our minimum + if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100)) + bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + + // Work out if spatial resampling is necessary + kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio); + + // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section + projected_bits_perframe = bits_per_frame; + tmp_q = kf_q; + + while (tmp_q > cpi->worst_quality) + { + projected_bits_perframe *= 1.04; + tmp_q--; + } + + // Guess at buffer level at the end of the section + projected_buffer_level = cpi->buffer_level - (int)((projected_bits_perframe - av_bits_per_frame) * cpi->frames_to_key); + + if (0) + { + FILE *f = fopen("Subsamle.stt", "a"); + fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width); + fclose(f); + } + + // The trigger for spatial resampling depends on the various parameters such as whether we are streaming (CBR) or VBR. + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + // Trigger resample if we are projected to fall below down sample level or + // resampled last time and are projected to remain below the up sample level + if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) || + (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)))) + //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) && + // ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) )) + resample_trigger = TRUE; + else + resample_trigger = FALSE; + } + else + { + long long clip_bits = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate)); + long long over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; + long long over_spend2 = cpi->oxcf.starting_buffer_level - projected_buffer_level; + + if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced + ((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ... + (over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits + resample_trigger = TRUE; + else + resample_trigger = FALSE; + + } + + if (resample_trigger) + { + while ((kf_q >= cpi->worst_quality) && (scale_val < 6)) + { + scale_val ++; + + cpi->common.vert_scale = vscale_lookup[scale_val]; + cpi->common.horiz_scale = hscale_lookup[scale_val]; + + Scale2Ratio(cpi->common.horiz_scale, &hr, &hs); + Scale2Ratio(cpi->common.vert_scale, &vr, &vs); + + new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; + new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs; + + // Reducing the area to 1/4 does not reduce the complexity (err_per_frame) to 1/4... 
+ // effective_sizeratio attempts to provide a crude correction for this + effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height); + effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0; + + // Now try again and see what Q we get with the smaller image size + kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio); + + if (0) + { + FILE *f = fopen("Subsamle.stt", "a"); + fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width); + fclose(f); + } + } + } + + if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height)) + { + cpi->common.Width = new_width; + cpi->common.Height = new_height; + vp8_alloc_compressor_data(cpi); + } + } +} diff --git a/vp8/encoder/firstpass.h b/vp8/encoder/firstpass.h new file mode 100644 index 000000000..d7b52f3f3 --- /dev/null +++ b/vp8/encoder/firstpass.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#if !defined __INC_FIRSTPASS_H +#define __INC_FIRSTPASS_H + +extern void vp8_init_first_pass(VP8_COMP *cpi); +extern void vp8_first_pass(VP8_COMP *cpi); +extern void vp8_end_first_pass(VP8_COMP *cpi); + +extern void vp8_init_second_pass(VP8_COMP *cpi); +extern void vp8_second_pass(VP8_COMP *cpi); +extern void vp8_end_second_pass(VP8_COMP *cpi); + +#endif diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c new file mode 100644 index 000000000..52aab6642 --- /dev/null +++ b/vp8/encoder/generic/csystemdependent.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "variance.h" +#include "onyx_int.h" + + +void vp8_arch_x86_encoder_init(VP8_COMP *cpi); + + +void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); +extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); + +void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); + +void vp8_cmachine_specific_config(VP8_COMP *cpi) +{ +#if CONFIG_RUNTIME_CPU_DETECT + cpi->rtcd.common = &cpi->common.rtcd; + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c; + + cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_c; + cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_c; + cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_c; + cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c; + cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c; + + cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c; + cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c; + cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c; + cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_c; + cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_c; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_c; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c; + cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_c; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; + + cpi->rtcd.encodemb.berr = vp8_block_error_c; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c; + cpi->rtcd.encodemb.subb = vp8_subtract_b_c; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_c; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c; + + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c; + + cpi->rtcd.search.full_search = vp8_full_search_sad; + cpi->rtcd.search.diamond_search = vp8_diamond_search_sad; +#endif + + // Pure C: + vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; + + +#if ARCH_X86 || ARCH_X86_64 + vp8_arch_x86_encoder_init(cpi); +#endif + +} diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c new file mode 100644 index 000000000..d80059d37 --- /dev/null +++ b/vp8/encoder/mcomp.c @@ -0,0 +1,1467 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "mcomp.h" +#include "vpx_mem/vpx_mem.h" + +#include <stdio.h> +#include <limits.h> +#include <math.h> + +#ifdef ENTROPY_STATS +static int mv_ref_ct [31] [4] [2]; +static int mv_mode_cts [4] [2]; +#endif + +static int mv_bits_sadcost[256]; + +void vp8cx_init_mv_bits_sadcost() +{ + int i; + + for (i = 0; i < 256; i++) + { + mv_bits_sadcost[i] = (int)sqrt(i * 16); + } +} + + +int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight) +{ + // MV costing is based on the distribution of vectors in the previous frame and as such will tend to + // over state the cost of vectors. In addition coding a new vector can have a knock on effect on the + // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks. + // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors. + return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7; +} + +int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit) +{ + //int i; + //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8; + //return ( (vp8_mv_bit_cost(mv, ref, mvcost, 100) + 128) * error_per_bit) >> 8; + + //i = (vp8_mv_bit_cost(mv, ref, mvcost, 100) * error_per_bit + 128) >> 8; + return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8; + //return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8; +} + + +static int mv_bits(MV *mv, MV *ref, int *mvcost[2]) +{ + // get the estimated number of bits for a motion vector, to be used for costing in SAD based + // motion estimation + return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8; +} + +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride) +{ + int Len; + int search_site_count = 0; + + + // Generate offsets for 4 search sites per step. + Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) + { + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + // Contract. + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 4; +} + +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) +{ + int Len; + int search_site_count = 0; + + // Generate offsets for 8 search sites per step. 
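+    // As in the 4-site pattern above, sites are generated largest step first:
+    // each pass adds the four compass points plus the four diagonals at distance
+    // Len, then halves Len, giving eight candidate positions per step.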
+ Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) + { + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride - Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride + Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride - Len; + search_site_count++; + + // Compute offsets for search sites. + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride + Len; + search_site_count++; + + + // Contract. + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 8; +} + + +#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) +#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector +#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc +#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. 
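+// The macros below fold these into ERR (rate + distortion) and CHECK_BETTER,
+// with IFMVCV restricting evaluation to motion vectors inside the allowed range.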
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; +#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost +#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best +#define MIN(x,y) (((x)<(y))?(x):(y)) +#define MAX(x,y) (((x)>(y))?(x):(y)) + +//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; } + +int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + + int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1; + int br = bestmv->row << 2, bc = bestmv->col << 2; + int tr = br, tc = bc; + unsigned int besterr = INT_MAX; + unsigned int left, right, up, down, diag; + unsigned int sse; + unsigned int whichdir; + unsigned int halfiters = 4; + unsigned int quarteriters = 4; + + int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1)); + int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1)); + int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1)); + int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1)); + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + + // calculate central point error + besterr = vf(y, d->pre_stride, z, b->src_stride, &sse); + besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected) + while (--halfiters) + { + // 1/2 pel + CHECK_BETTER(left, tr, tc - 2); + CHECK_BETTER(right, tr, tc + 2); + CHECK_BETTER(up, tr - 2, tc); + CHECK_BETTER(down, tr + 2, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) + { + case 0: + CHECK_BETTER(diag, tr - 2, tc - 2); + break; + case 1: + CHECK_BETTER(diag, tr - 2, tc + 2); + break; + case 2: + CHECK_BETTER(diag, tr + 2, tc - 2); + break; + case 3: + CHECK_BETTER(diag, tr + 2, tc + 2); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected) + // 1/4 pel + while (--quarteriters) + { + CHECK_BETTER(left, tr, tc - 1); + CHECK_BETTER(right, tr, tc + 1); + CHECK_BETTER(up, tr - 1, tc); + CHECK_BETTER(down, tr + 1, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) + { + case 0: + CHECK_BETTER(diag, tr - 1, tc - 1); + break; + case 1: + CHECK_BETTER(diag, tr - 1, tc + 1); + break; + case 2: + CHECK_BETTER(diag, tr + 1, tc - 1); + break; + case 3: + CHECK_BETTER(diag, tr + 1, tc + 1); + break; + } + + // no reason to check the same one again. 
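+        // (if the best point did not move this pass, another iteration around the
+        // same centre cannot improve the result)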
+ if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + bestmv->row = br << 1; + bestmv->col = bc << 1; + + if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL)) + return INT_MAX; + + return besterr; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef ERR +#undef CHECK_BETTER +#undef MIN +#undef MAX +int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + int bestmse = INT_MAX; + MV startmv; + //MV this_mv; + MV this_mv; + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + int whichdir ; + + + // Trap uncodable vectors + if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL)) + { + bestmv->row <<= 3; + bestmv->col <<= 3; + return INT_MAX; + } + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + startmv = *bestmv; + + // calculate central point error + bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // go left then right and check error + this_mv.row = startmv.row; + this_mv.col = ((startmv.col - 8) | 4); + left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse); + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 8; + right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + this_mv.row = ((startmv.row - 8) | 4); + up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse); + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + this_mv.row += 8; + down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + + // now check 1 more diagonal + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + // whichdir must be 0-4. Therefore, one of the cases below + // must run through. 
However, because there is no default + // and diag is not set elsewhere, we get a compile warning + diag = 0; + //for(whichdir =0;whichdir<4;whichdir++) + //{ + this_mv = startmv; + + switch (whichdir) + { + case 0: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 1: + this_mv.col += 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 2: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row += 4; + diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 3: + this_mv.col += 4; + this_mv.row += 4; + diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +// } + + + // time to check quarter pels. + if (bestmv->row < startmv.row) + y -= d->pre_stride; + + if (bestmv->col < startmv.col) + y--; + + startmv = *bestmv; + + + + // go left then right and check error + this_mv.row = startmv.row; + + if (startmv.col & 7) + { + this_mv.col = startmv.col - 2; + left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse); + } + + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 4; + right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + + if (startmv.row & 7) + { + this_mv.row = startmv.row - 2; + up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.row = (startmv.row - 8) | 6; + up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + this_mv.row += 4; + down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + + // now check 1 more diagonal + whichdir = (left < right ? 0 : 1) + (up < down ? 
0 : 2); + +// for(whichdir=0;whichdir<4;whichdir++) +// { + this_mv = startmv; + + switch (whichdir) + { + case 0: + + if (startmv.row & 7) + { + this_mv.row -= 2; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + } + } + else + { + this_mv.row = (startmv.row - 8) | 6; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse); + } + } + + break; + case 1: + this_mv.col += 2; + + if (startmv.row & 7) + { + this_mv.row -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.row = (startmv.row - 8) | 6; + diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + } + + break; + case 2: + this_mv.row += 2; + + if (startmv.col & 7) + { + this_mv.col -= 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + } + else + { + this_mv.col = (startmv.col - 8) | 6; + diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + } + + break; + case 3: + this_mv.col += 2; + this_mv.row += 2; + diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +// } + + return bestmse; +} + +int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + int bestmse = INT_MAX; + MV startmv; + //MV this_mv; + MV this_mv; + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + + // Trap uncodable vectors + if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL)) + { + bestmv->row <<= 3; + bestmv->col <<= 3; + return INT_MAX; + } + + // central mv + bestmv->row <<= 3; + bestmv->col <<= 3; + startmv = *bestmv; + + // calculate central point error + bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + // go left then right and check error + this_mv.row = startmv.row; + this_mv.col = ((startmv.col - 8) | 4); + left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse); + left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) + { + *bestmv = this_mv; + bestmse = left; + } + + this_mv.col += 8; + right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse); + right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) + { + *bestmv = this_mv; + bestmse = right; + } + + // go up then down and check error + this_mv.col = startmv.col; + this_mv.row = ((startmv.row - 8) | 4); + up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse); + up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) + { + *bestmv = this_mv; + bestmse = up; + } + + 
this_mv.row += 8; + down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse); + down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) + { + *bestmv = this_mv; + bestmse = down; + } + + // somewhat strangely not doing all the diagonals for half pel is slower than doing them. +#if 0 + // now check 1 more diagonal - + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + this_mv = startmv; + + switch (whichdir) + { + case 0: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 1: + this_mv.col += 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 2: + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row += 4; + diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + case 3: + this_mv.col += 4; + this_mv.row += 4; + diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse); + break; + } + + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +#else + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = (this_mv.row - 8) | 4; + diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col += 8; + diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col = (this_mv.col - 8) | 4; + this_mv.row = startmv.row + 4; + diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + + this_mv.col += 8; + diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) + { + *bestmv = this_mv; + bestmse = diag; + } + +#endif + return bestmse; +} + + +#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) +#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector +#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score. 
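+// Integer-pel equivalents of the earlier macros, used by the hex search below:
+// the motion vector rate comes from the mvsadcost tables and the distortion
+// term is a SAD rather than a sub-pixel variance.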
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost +#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best + +int vp8_hex_search +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_t vf, + vp8_sad_fn_t sf, + int *mvsadcost[2], + int *mvcost[2] +) +{ + MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ; + MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ; + int i, j; + unsigned char *src = (*(b->base_src) + b->src); + int src_stride = b->src_stride; + int rr = ref_mv->row, rc = ref_mv->col, br = rr, bc = rc, tr, tc; + unsigned int besterr, thiserr = 0x7fffffff; + + if (rc < x->mv_col_min) bc = x->mv_col_min; + + if (rc > x->mv_col_max) bc = x->mv_col_max; + + if (rr < x->mv_row_min) br = x->mv_row_min; + + if (rr > x->mv_row_max) br = x->mv_row_max; + + rr >>= 1; + rc >>= 1; + br >>= 3; + bc >>= 3; + + besterr = ERR(br, bc, thiserr); + + // hex search jbb changed to 127 to avoid max 256 problem steping by 2. + for (j = 0; j < 127; j++) + { + tr = br; + tc = bc; + + for (i = 0; i < 6; i++) + { + int nr = tr + hex[i].row, nc = tc + hex[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + if (tr == br && tc == bc) + break; + } + + // check 8 1 away neighbors + tr = br; + tc = bc; + + for (i = 0; i < 8; i++) + { + int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col; + + if (nc < x->mv_col_min) continue; + + if (nc > x->mv_col_max) continue; + + if (nr < x->mv_row_min) continue; + + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + best_mv->row = br; + best_mv->col = bc; + + return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef ERR +#undef CHECK_BETTER +int vp8_diamond_search_sad +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvsadcost[2], + int *mvcost[2] +) +{ + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + unsigned char *best_address; + + int tot_steps; + MV this_mv; + + int bestsad = INT_MAX; + int best_site = 0; + int last_site = 0; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + int thissad; + + // Work out the start point for the search + in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col); + best_address = in_what; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // search_param determines the length of the initial step and hence the number of iterations + // 0 = initial 
step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + best_mv->row = ref_row; + best_mv->col = ref_col; + + *num00 = 0; + + for (step = 0; step < tot_steps ; step++) + { + for (j = 0 ; j < x->searches_per_step ; j++) + { + // Trap illegal vectors + this_row_offset = best_mv->row + ss[i].mv.row; + this_col_offset = best_mv->col + ss[i].mv.col; + + if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) + + { + check_here = ss[i].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_mv.row = this_row_offset << 3; + this_mv.col = this_col_offset << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + + i++; + } + + if (best_site != last_site) + { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } + else if (best_address == in_what) + (*num00)++; + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad == INT_MAX) + return INT_MAX; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); +} + +int vp8_diamond_search_sadx4 +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvsadcost[2], + int *mvcost[2] +) +{ + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + unsigned char *best_address; + + int tot_steps; + MV this_mv; + + unsigned int bestsad = UINT_MAX; + int best_site = 0; + int last_site = 0; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + unsigned int thissad; + + // Work out the start point for the search + in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col); + best_address = in_what; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // search_param determines the length of the initial step and hence the number of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. 
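+    // This variant matches vp8_diamond_search_sad above, except that when all four
+    // candidate sites in a group fall inside the legal range their SADs are computed
+    // with a single sdx4df call instead of four separate sdf calls.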
+ ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + best_mv->row = ref_row; + best_mv->col = ref_col; + + *num00 = 0; + + for (step = 0; step < tot_steps ; step++) + { + int check_row_min, check_col_min, check_row_max, check_col_max; + + check_row_min = x->mv_row_min - best_mv->row; + check_row_max = x->mv_row_max - best_mv->row; + check_col_min = x->mv_col_min - best_mv->col; + check_col_max = x->mv_col_max - best_mv->col; + + for (j = 0 ; j < x->searches_per_step ; j += 4) + { + unsigned char *block_offset[4]; + unsigned int valid_block[4]; + int all_in = 1, t; + + for (t = 0; t < 4; t++) + { + valid_block [t] = (ss[t+i].mv.col > check_col_min); + valid_block [t] &= (ss[t+i].mv.col < check_col_max); + valid_block [t] &= (ss[t+i].mv.row > check_row_min); + valid_block [t] &= (ss[t+i].mv.row < check_row_max); + + all_in &= valid_block[t]; + block_offset[t] = ss[i+t].offset + best_address; + } + + if (all_in) + { + unsigned int sad_array[4]; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array); + + for (t = 0; t < 4; t++, i++) + { + thissad = sad_array[t]; + + if (thissad < bestsad) + { + this_mv.row = (best_mv->row + ss[i].mv.row) << 3; + this_mv.col = (best_mv->col + ss[i].mv.col) << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + } + else + { + int t; + + for (t = 0; t < 4; i++, t++) + { + // Trap illegal vectors + if (valid_block[t]) + + { + check_here = block_offset[t]; + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_row_offset = best_mv->row + ss[i].mv.row; + this_col_offset = best_mv->col + ss[i].mv.col; + + this_mv.row = this_row_offset << 3; + this_mv.col = this_col_offset << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_site = i; + } + } + } + } + } + } + + if (best_site != last_site) + { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } + else if (best_address == in_what) + (*num00)++; + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad == INT_MAX) + return INT_MAX; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); +} + + +#if !(CONFIG_REALTIME_ONLY) +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2]) +{ + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + int mv_stride = d->pre_stride; + unsigned char *bestaddress; + MV *best_mv = &d->bmi.mv.as_mv; + MV this_mv; + int bestsad = INT_MAX; + int r, c; + + unsigned char *check_here; + int thissad; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + // Work out the mid point for the search + in_what = *(d->base_pre) + d->pre; + bestaddress = in_what + (ref_row * d->pre_stride) + ref_col; + + best_mv->row = ref_row; + best_mv->col = 
ref_col; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Baseline value at the centre + + //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14)); + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border + if (col_min < x->mv_col_min) + col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) + col_max = x->mv_col_max; + + if (row_min < x->mv_row_min) + row_min = x->mv_row_min; + + if (row_max > x->mv_row_max) + row_max = x->mv_row_max; + + for (r = row_min; r < row_max ; r++) + { + this_mv.row = r << 3; + check_here = r * mv_stride + in_what + col_min; + + for (c = col_min; c < col_max; c++) + { + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + this_mv.col = c << 3; + //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14)); + //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)]; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + + check_here++; + } + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad < INT_MAX) + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + else + return INT_MAX; +} + +int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2]) +{ + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + int mv_stride = d->pre_stride; + unsigned char *bestaddress; + MV *best_mv = &d->bmi.mv.as_mv; + MV this_mv; + unsigned int bestsad = UINT_MAX; + int r, c; + + unsigned char *check_here; + unsigned int thissad; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + unsigned int sad_array[3]; + + // Work out the mid point for the search + in_what = *(d->base_pre) + d->pre; + bestaddress = in_what + (ref_row * d->pre_stride) + ref_col; + + best_mv->row = ref_row; + best_mv->col = ref_col; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Baseline value at the centre + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border + if (col_min < x->mv_col_min) + col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) + col_max = x->mv_col_max; + + 
if (row_min < x->mv_row_min) + row_min = x->mv_row_min; + + if (row_max > x->mv_row_max) + row_max = x->mv_row_max; + + for (r = row_min; r < row_max ; r++) + { + this_mv.row = r << 3; + check_here = r * mv_stride + in_what + col_min; + c = col_min; + + while ((c + 3) < col_max) + { + int i; + + fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array); + + for (i = 0; i < 3; i++) + { + thissad = sad_array[i]; + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here++; + c++; + } + } + + while (c < col_max) + { + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here ++; + c ++; + } + + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad < INT_MAX) + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + else + return INT_MAX; +} +#endif + +#ifdef ENTROPY_STATS +void print_mode_context(void) +{ + FILE *f = fopen("modecont.c", "w"); + int i, j; + + fprintf(f, "#include \"entropy.h\"\n"); + fprintf(f, "const int vp8_mode_contexts[6][4] =\n"); + fprintf(f, "{\n"); + + for (j = 0; j < 6; j++) + { + fprintf(f, " { // %d \n", j); + fprintf(f, " "); + + for (i = 0; i < 4; i++) + { + int overal_prob; + int this_prob; + int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1]; + + // Overall probs + count = mv_mode_cts[i][0] + mv_mode_cts[i][1]; + + if (count) + overal_prob = 256 * mv_mode_cts[i][0] / count; + else + overal_prob = 128; + + if (overal_prob == 0) + overal_prob = 1; + + // context probs + count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; + + if (count) + this_prob = 256 * mv_ref_ct[j][i][0] / count; + else + this_prob = 128; + + if (this_prob == 0) + this_prob = 1; + + fprintf(f, "%5d, ", this_prob); + //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob); + //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob); + } + + fprintf(f, " },\n"); + } + + fprintf(f, "};\n"); + fclose(f); +} + +/* MV ref count ENTROPY_STATS stats code */ +#ifdef ENTROPY_STATS +void init_mv_ref_counts() +{ + vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); + vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); +} + +void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) +{ + if (m == ZEROMV) + { + ++mv_ref_ct [ct[0]] [0] [0]; + ++mv_mode_cts[0][0]; + } + else + { + ++mv_ref_ct [ct[0]] [0] [1]; + ++mv_mode_cts[0][1]; + + if (m == NEARESTMV) + { + ++mv_ref_ct [ct[1]] [1] [0]; + ++mv_mode_cts[1][0]; + } + else + { + ++mv_ref_ct [ct[1]] [1] [1]; + ++mv_mode_cts[1][1]; + + if (m == NEARMV) + { + ++mv_ref_ct [ct[2]] [2] [0]; + ++mv_mode_cts[2][0]; + } + else + { + ++mv_ref_ct [ct[2]] [2] [1]; + ++mv_mode_cts[2][1]; + + if (m == NEWMV) + { + ++mv_ref_ct [ct[3]] [3] [0]; + ++mv_mode_cts[3][0]; + } + else + { + ++mv_ref_ct [ct[3]] [3] [1]; + ++mv_mode_cts[3][1]; + } + } + } + } +} + +#endif/* END MV ref count ENTROPY_STATS stats code */ + +#endif diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h new file mode 100644 index 
000000000..921206fec --- /dev/null +++ b/vp8/encoder/mcomp.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_MCOMP_H +#define __INC_MCOMP_H + +#include "block.h" +#include "variance.h" + +#ifdef ENTROPY_STATS +extern void init_mv_ref_counts(); +extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); +#endif + + +#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8) // Max full pel mv specified in 1/8 pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units + + +extern void print_mode_context(void); +extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight); +extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); + + +extern int vp8_hex_search +( + MACROBLOCK *x, + BLOCK *b, + BLOCKD *d, + MV *ref_mv, + MV *best_mv, + int search_param, + int error_per_bit, + int *num00, + vp8_variance_fn_t vf, + vp8_sad_fn_t sf, + int *mvsadcost[2], + int *mvcost[2] + +); + +typedef int (fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]); +extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; +extern fractional_mv_step_fp vp8_find_best_half_pixel_step; +extern fractional_mv_step_fp vp8_skip_fractional_mv_step; + +#define prototype_full_search_sad(sym)\ + int (sym)\ + (\ + MACROBLOCK *x, \ + BLOCK *b, \ + BLOCKD *d, \ + MV *ref_mv, \ + int error_per_bit, \ + int distance, \ + vp8_variance_fn_ptr_t *fn_ptr, \ + int *mvcost[2], \ + int *mvsadcost[2] \ + ) + +#define prototype_diamond_search_sad(sym)\ + int (sym)\ + (\ + MACROBLOCK *x, \ + BLOCK *b, \ + BLOCKD *d, \ + MV *ref_mv, \ + MV *best_mv, \ + int search_param, \ + int error_per_bit, \ + int *num00, \ + vp8_variance_fn_ptr_t *fn_ptr, \ + int *mvsadcost[2], \ + int *mvcost[2] \ + ) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/mcomp_x86.h" +#endif + +typedef prototype_full_search_sad(*vp8_full_search_fn_t); +extern prototype_full_search_sad(vp8_full_search_sad); +extern prototype_full_search_sad(vp8_full_search_sadx3); + +typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t); +extern prototype_diamond_search_sad(vp8_diamond_search_sad); +extern prototype_diamond_search_sad(vp8_diamond_search_sadx4); + +#ifndef vp8_search_full_search +#define vp8_search_full_search vp8_full_search_sad +#endif +extern prototype_full_search_sad(vp8_search_full_search); + +#ifndef vp8_search_diamond_search +#define vp8_search_diamond_search vp8_diamond_search_sad +#endif +extern prototype_diamond_search_sad(vp8_search_diamond_search); + +typedef struct +{ + prototype_full_search_sad(*full_search); + prototype_diamond_search_sad(*diamond_search); +} vp8_search_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define SEARCH_INVOKE(ctx,fn) (ctx)->fn +#else +#define SEARCH_INVOKE(ctx,fn) vp8_search_##fn +#endif + +#endif diff --git 
a/vp8/encoder/modecosts.c b/vp8/encoder/modecosts.c new file mode 100644 index 000000000..73170cf52 --- /dev/null +++ b/vp8/encoder/modecosts.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "blockd.h" +#include "onyx_int.h" +#include "treewriter.h" +#include "entropymode.h" + + +void vp8_init_mode_costs(VP8_COMP *c) +{ + VP8_COMMON *x = &c->common; + { + const vp8_tree_p T = vp8_bmode_tree; + + int i = 0; + + do + { + int j = 0; + + do + { + vp8_cost_tokens((int *)c->mb.bmode_costs[i][j], x->kf_bmode_prob[i][j], T); + } + while (++j < VP8_BINTRAMODES); + } + while (++i < VP8_BINTRAMODES); + + vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T); + } + vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.sub_mv_ref_prob, vp8_sub_mv_ref_tree); + + vp8_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree); + vp8_cost_tokens(c->mb.mbmode_cost[0], x->kf_ymode_prob, vp8_kf_ymode_tree); + + vp8_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob, vp8_uv_mode_tree); + vp8_cost_tokens(c->mb.intra_uv_mode_cost[0], x->kf_uv_mode_prob, vp8_uv_mode_tree); +} diff --git a/vp8/encoder/modecosts.h b/vp8/encoder/modecosts.h new file mode 100644 index 000000000..5ade26566 --- /dev/null +++ b/vp8/encoder/modecosts.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_MODECOSTS_H +#define __INC_MODECOSTS_H + +void vp8_init_mode_costs(VP8_COMP *x); + +#endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c new file mode 100644 index 000000000..7662720c3 --- /dev/null +++ b/vp8/encoder/onyx_if.c @@ -0,0 +1,5428 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "onyxc_int.h" +#include "onyx_int.h" +#include "systemdependent.h" +#include "quantize.h" +#include "alloccommon.h" +#include "mcomp.h" +#include "firstpass.h" +#include "psnr.h" +#include "vpx_scale/vpxscale.h" +#include "extend.h" +#include "ratectrl.h" +#include "quant_common.h" +#include "segmentation_common.h" +#include "g_common.h" +#include "vpx_scale/yv12extend.h" +#include "postproc.h" +#include "vpx_mem/vpx_mem.h" +#include "swapyv12buffer.h" +#include "threading.h" +#include "vpx_ports/vpx_timer.h" +#include <math.h> +#include <stdio.h> +#include <limits.h> + +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#define RTCD(x) &cpi->common.rtcd.x +#else +#define IF_RTCD(x) NULL +#define RTCD(x) NULL +#endif + +extern void vp8cx_init_mv_bits_sadcost(); +extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); +extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); +extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); + +extern void vp8_init_loop_filter(VP8_COMMON *cm); +extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); +extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl); +extern void vp8_dmachine_specific_config(VP8_COMP *cpi); +extern void vp8_cmachine_specific_config(VP8_COMP *cpi); +extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi); +extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); +extern void print_parms(VP8_CONFIG *ocf, char *filenam); +extern unsigned int vp8_get_processor_freq(); +extern void print_tree_update_probs(); +extern void vp8cx_create_encoder_threads(VP8_COMP *cpi); +extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); +#if HAVE_ARMV7 +extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +#endif + +int vp8_estimate_entropy_savings(VP8_COMP *cpi); +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); +int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); + + +static void mode_ref_lf_test_function(VP8_COMP *cpi); + +extern const int vp8_gf_interval_table[101]; + +#if CONFIG_PSNR +#include "math.h" + +extern double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight +); + +extern double vp8_calc_ssimg +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + double *ssim_y, + double *ssim_u, + double *ssim_v +); + + +#endif + + +#ifdef OUTPUT_YUV_SRC +FILE *yuv_file; +#endif + +#if 0 +FILE *framepsnr; +FILE *kf_list; +FILE *keyfile; +#endif + +#if 0 +extern int skip_true_count; +extern int skip_false_count; +#endif + + +#ifdef ENTROPY_STATS +extern int intra_mode_stats[10][10][10]; +#endif + +#ifdef SPEEDSTATS +unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +unsigned int tot_pm = 0; +unsigned int cnt_pm = 0; +unsigned int tot_ef = 0; +unsigned int cnt_ef = 0; +#endif + +#ifdef MODE_STATS +extern unsigned __int64 Sectionbits[50]; +extern int y_modes[5] ; +extern int uv_modes[4] ; +extern int b_modes[10] ; + +extern int inter_y_modes[10] ; +extern int inter_uv_modes[4] ; +extern unsigned int inter_b_modes[15]; +#endif + +extern void 
(*vp8_short_fdct4x4)(short *input, short *output, int pitch); +extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); +extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); + +extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; + +extern const int qrounding_factors[129]; +extern const int qzbin_factors[129]; +extern void vp8cx_init_quantizer(VP8_COMP *cpi); +extern const int vp8cx_base_skip_false_prob[128]; + + +void vp8_initialize() +{ + static int init_done = 0; + + if (!init_done) + { + vp8_scale_machine_specific_config(); + vp8_initialize_common(); + //vp8_dmachine_specific_config(); + vp8_tokenize_initialize(); + + vp8cx_init_mv_bits_sadcost(); + init_done = 1; + } +} +#ifdef PACKET_TESTING +extern FILE *vpxlogc; +#endif + +static void setup_features(VP8_COMP *cpi) +{ + // Set up default state for MB feature flags + cpi->mb.e_mbd.segmentation_enabled = 0; + cpi->mb.e_mbd.update_mb_segmentation_map = 0; + cpi->mb.e_mbd.update_mb_segmentation_data = 0; + vpx_memset(cpi->mb.e_mbd.mb_segment_tree_probs, 255, sizeof(cpi->mb.e_mbd.mb_segment_tree_probs)); + vpx_memset(cpi->mb.e_mbd.segment_feature_data, 0, sizeof(cpi->mb.e_mbd.segment_feature_data)); + + cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; + vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); + vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + + // jbb trial ! + mode_ref_lf_test_function(cpi); + +} + + +void vp8_dealloc_compressor_data(VP8_COMP *cpi) +{ + + // Delete sementation map + if (cpi->segmentation_map != 0) + vpx_free(cpi->segmentation_map); + + cpi->segmentation_map = 0; + + if (cpi->active_map != 0) + vpx_free(cpi->active_map); + + cpi->active_map = 0; + + // Delete first pass motion map + if (cpi->fp_motion_map != 0) + vpx_free(cpi->fp_motion_map); + + cpi->fp_motion_map = 0; + + vp8_de_alloc_frame_buffers(&cpi->common); + + vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); + vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); +#if VP8_TEMPORAL_ALT_REF + vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer); +#endif + { + int i; + + for (i = 0; i < MAX_LAG_BUFFERS; i++) + vp8_yv12_de_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer); + + cpi->source_buffer_count = 0; + } + + vpx_free(cpi->tok); + cpi->tok = 0; + +} + +static void enable_segmentation(VP8_PTR ptr) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + // Set the appropriate feature bit + cpi->mb.e_mbd.segmentation_enabled = 1; + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; +} +static void disable_segmentation(VP8_PTR ptr) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + // Clear the appropriate feature bit + cpi->mb.e_mbd.segmentation_enabled = 0; +} + +// Valid values for a segment are 0 to 3 +// Segmentation map is arrange as [Rows][Columns] +static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + // Copy in the new segmentation map + vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols)); + + // Signal that the map should be updated. + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; +} + +// The values given for each segment can be either deltas (from the default value chosen for the frame) or absolute values. 
+// +// Valid range for abs values is (0-127 for MB_LVL_ALT_Q) , (0-63 for SEGMENT_ALT_LF) +// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q) , (+/-63 for SEGMENT_ALT_LF) +// +// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use the absolute values given). +// +// +static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); +} + + +static void segmentation_test_function(VP8_PTR ptr) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + unsigned char *seg_map; + signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + int i, j; + + // Create a temporary map for segmentation data. + CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + + // MB loop to set local segmentation map + /*for ( i = 0; i < cpi->common.mb_rows; i++ ) + { + for ( j = 0; j < cpi->common.mb_cols; j++ ) + { + //seg_map[(i*cpi->common.mb_cols) + j] = (j % 2) + ((i%2)* 2); + //if ( j < cpi->common.mb_cols/2 ) + + // Segment 1 around the edge else 0 + if ( (i == 0) || (j == 0) || (i == (cpi->common.mb_rows-1)) || (j == (cpi->common.mb_cols-1)) ) + seg_map[(i*cpi->common.mb_cols) + j] = 1; + //else if ( (i < 2) || (j < 2) || (i > (cpi->common.mb_rows-3)) || (j > (cpi->common.mb_cols-3)) ) + // seg_map[(i*cpi->common.mb_cols) + j] = 2; + //else if ( (i < 5) || (j < 5) || (i > (cpi->common.mb_rows-6)) || (j > (cpi->common.mb_cols-6)) ) + // seg_map[(i*cpi->common.mb_cols) + j] = 3; + else + seg_map[(i*cpi->common.mb_cols) + j] = 0; + } + }*/ + + // Set the segmentation Map + set_segmentation_map(ptr, seg_map); + + // Activate segmentation. + enable_segmentation(ptr); + + // Set up the quant segment data + feature_data[MB_LVL_ALT_Q][0] = 0; + feature_data[MB_LVL_ALT_Q][1] = 4; + feature_data[MB_LVL_ALT_Q][2] = 0; + feature_data[MB_LVL_ALT_Q][3] = 0; + // Set up the loop segment data + feature_data[MB_LVL_ALT_LF][0] = 0; + feature_data[MB_LVL_ALT_LF][1] = 0; + feature_data[MB_LVL_ALT_LF][2] = 0; + feature_data[MB_LVL_ALT_LF][3] = 0; + + // Initialise the feature data structure + // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 + set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA); + + // Delete sementation map + if (seg_map != 0) + vpx_free(seg_map); + + seg_map = 0; + +} + +// A simple function to cyclically refresh the background at a lower Q +static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) +{ + unsigned char *seg_map; + signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + int i; + int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe; + int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols; + + // Create a temporary map for segmentation data. 
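+    // (one byte per macroblock, zero initialised, so every MB starts in segment 0
+    // and only refresh candidates are marked below)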
+ CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + + cpi->cyclic_refresh_q = Q; + + for (i = Q; i > 0; i--) + { + if (vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*(Q + 128)) / 64)) + //if ( vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*((2*Q)+96))/64) ) + { + break; + } + } + + cpi->cyclic_refresh_q = i; + + // Only update for inter frames + if (cpi->common.frame_type != KEY_FRAME) + { + // Cycle through the macro_block rows + // MB loop to set local segmentation map + for (i = cpi->cyclic_refresh_mode_index; i < mbs_in_frame; i++) + { + // If the MB is as a candidate for clean up then mark it for possible boost/refresh (segment 1) + // The segment id may get reset to 0 later if the MB gets coded anything other than last frame 0,0 + // as only (last frame 0,0) MBs are eligable for refresh : that is to say Mbs likely to be background blocks. + if (cpi->cyclic_refresh_map[i] == 0) + { + seg_map[i] = 1; + } + else + { + seg_map[i] = 0; + + // Skip blocks that have been refreshed recently anyway. + if (cpi->cyclic_refresh_map[i] < 0) + //cpi->cyclic_refresh_map[i] = cpi->cyclic_refresh_map[i] / 16; + cpi->cyclic_refresh_map[i]++; + } + + + if (block_count > 0) + block_count--; + else + break; + + } + + // If we have gone through the frame reset to the start + cpi->cyclic_refresh_mode_index = i; + + if (cpi->cyclic_refresh_mode_index >= mbs_in_frame) + cpi->cyclic_refresh_mode_index = 0; + } + + // Set the segmentation Map + set_segmentation_map((VP8_PTR)cpi, seg_map); + + // Activate segmentation. + enable_segmentation((VP8_PTR)cpi); + + // Set up the quant segment data + feature_data[MB_LVL_ALT_Q][0] = 0; + feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q); + feature_data[MB_LVL_ALT_Q][2] = 0; + feature_data[MB_LVL_ALT_Q][3] = 0; + + // Set up the loop segment data + feature_data[MB_LVL_ALT_LF][0] = 0; + feature_data[MB_LVL_ALT_LF][1] = lf_adjustment; + feature_data[MB_LVL_ALT_LF][2] = 0; + feature_data[MB_LVL_ALT_LF][3] = 0; + + // Initialise the feature data structure + // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 + set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + + // Delete sementation map + if (seg_map != 0) + vpx_free(seg_map); + + seg_map = 0; + +} + +static void mode_ref_lf_test_function(VP8_COMP *cpi) +{ + cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + + vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); + vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + + // Test of ref frame deltas + cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2; + cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0; + cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2; + cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2; + + cpi->mb.e_mbd.mode_lf_deltas[0] = 4; // BPRED + cpi->mb.e_mbd.mode_lf_deltas[1] = -2; // Zero + cpi->mb.e_mbd.mode_lf_deltas[2] = 2; // New mv + cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv +} + +void vp8_set_speed_features(VP8_COMP *cpi) +{ + SPEED_FEATURES *sf = &cpi->sf; + int Mode = cpi->compressor_speed; + int Speed = cpi->Speed; + int i; + VP8_COMMON *cm = &cpi->common; + + // Initialise default mode frequency sampling variables + for (i = 0; i < MAX_MODES; i ++) + { + cpi->mode_check_freq[i] = 0; + cpi->mode_test_hit_counts[i] = 0; + cpi->mode_chosen_counts[i] = 0; + } + + cpi->mbs_tested_so_far = 0; + + // best quality + sf->RD = 
1; + sf->search_method = NSTEP; + sf->improved_quant = 1; + sf->improved_dct = 1; + sf->auto_filter = 1; + sf->recode_loop = 1; + sf->quarter_pixel_search = 1; + sf->half_pixel_search = 1; + sf->full_freq[0] = 7; + sf->full_freq[1] = 7; + sf->min_fs_radius = 8; + sf->max_fs_radius = 32; + sf->iterative_sub_pixel = 1; + sf->optimize_coefficients = 1; + + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + + cpi->do_full[0] = 0; + cpi->do_full[1] = 0; + + // default thresholds to 0 + for (i = 0; i < MAX_MODES; i++) + sf->thresh_mult[i] = 0; + + switch (Mode) + { +#if !(CONFIG_REALTIME_ONLY) + case 0: // best quality mode + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_ZEROG ] = 0; + sf->thresh_mult[THR_ZEROA ] = 0; + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_NEARESTG ] = 0; + sf->thresh_mult[THR_NEARESTA ] = 0; + sf->thresh_mult[THR_NEARMV ] = 0; + sf->thresh_mult[THR_NEARG ] = 0; + sf->thresh_mult[THR_NEARA ] = 0; + + sf->thresh_mult[THR_DC ] = 0; + + sf->thresh_mult[THR_V_PRED ] = 1000; + sf->thresh_mult[THR_H_PRED ] = 1000; + sf->thresh_mult[THR_B_PRED ] = 2000; + sf->thresh_mult[THR_TM ] = 1000; + + sf->thresh_mult[THR_NEWMV ] = 1000; + sf->thresh_mult[THR_NEWG ] = 1000; + sf->thresh_mult[THR_NEWA ] = 1000; + + sf->thresh_mult[THR_SPLITMV ] = 2500; + sf->thresh_mult[THR_SPLITG ] = 5000; + sf->thresh_mult[THR_SPLITA ] = 5000; + + sf->full_freq[0] = 7; + sf->full_freq[1] = 15; + + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + + if (!(cpi->ref_frame_flags & VP8_LAST_FLAG)) + { + sf->thresh_mult[THR_NEWMV ] = INT_MAX; + sf->thresh_mult[THR_NEARESTMV] = INT_MAX; + sf->thresh_mult[THR_ZEROMV ] = INT_MAX; + sf->thresh_mult[THR_NEARMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + } + + if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG)) + { + sf->thresh_mult[THR_NEARESTG ] = INT_MAX; + sf->thresh_mult[THR_ZEROG ] = INT_MAX; + sf->thresh_mult[THR_NEARG ] = INT_MAX; + sf->thresh_mult[THR_NEWG ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + } + else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG)) + { + sf->thresh_mult[THR_NEARESTA ] = INT_MAX; + sf->thresh_mult[THR_ZEROA ] = INT_MAX; + sf->thresh_mult[THR_NEARA ] = INT_MAX; + sf->thresh_mult[THR_NEWA ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + } + + break; + case 1: + case 3: + sf->optimize_coefficients = 0; + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_DC ] = 0; + sf->thresh_mult[THR_NEARMV ] = 0; + sf->thresh_mult[THR_V_PRED ] = 1000; + sf->thresh_mult[THR_H_PRED ] = 1000; + sf->thresh_mult[THR_B_PRED ] = 2500; + sf->thresh_mult[THR_TM ] = 1000; + + sf->thresh_mult[THR_NEARESTG ] = 1000; + sf->thresh_mult[THR_NEARESTA ] = 1000; + + sf->thresh_mult[THR_ZEROG ] = 1000; + sf->thresh_mult[THR_ZEROA ] = 1000; + sf->thresh_mult[THR_NEARG ] = 1000; + sf->thresh_mult[THR_NEARA ] = 1000; + + sf->thresh_mult[THR_NEWMV ] = 1500; + sf->thresh_mult[THR_NEWG ] = 1500; + sf->thresh_mult[THR_NEWA ] = 1500; + + sf->thresh_mult[THR_SPLITMV ] = 5000; + sf->thresh_mult[THR_SPLITG ] = 10000; + sf->thresh_mult[THR_SPLITA ] = 10000; + + sf->full_freq[0] = 15; + sf->full_freq[1] = 31; + + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + + if (!(cpi->ref_frame_flags & VP8_LAST_FLAG)) + { + sf->thresh_mult[THR_NEWMV ] = INT_MAX; + sf->thresh_mult[THR_NEARESTMV] = INT_MAX; + sf->thresh_mult[THR_ZEROMV ] = INT_MAX; + sf->thresh_mult[THR_NEARMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + } + else 
if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG)) + { + sf->thresh_mult[THR_NEARESTG ] = INT_MAX; + sf->thresh_mult[THR_ZEROG ] = INT_MAX; + sf->thresh_mult[THR_NEARG ] = INT_MAX; + sf->thresh_mult[THR_NEWG ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + } + else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG)) + { + sf->thresh_mult[THR_NEARESTA ] = INT_MAX; + sf->thresh_mult[THR_ZEROA ] = INT_MAX; + sf->thresh_mult[THR_NEARA ] = INT_MAX; + sf->thresh_mult[THR_NEWA ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + } + + if (Speed > 0) + { + cpi->mode_check_freq[THR_SPLITG] = 4; + cpi->mode_check_freq[THR_SPLITA] = 4; + cpi->mode_check_freq[THR_SPLITMV] = 2; + + sf->thresh_mult[THR_TM ] = 1500; + sf->thresh_mult[THR_V_PRED ] = 1500; + sf->thresh_mult[THR_H_PRED ] = 1500; + sf->thresh_mult[THR_B_PRED ] = 5000; + + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEWMV ] = 2000; + sf->thresh_mult[THR_SPLITMV ] = 10000; + } + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 1500; + sf->thresh_mult[THR_ZEROG ] = 1500; + sf->thresh_mult[THR_NEARG ] = 1500; + sf->thresh_mult[THR_NEWG ] = 2000; + sf->thresh_mult[THR_SPLITG ] = 20000; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 1500; + sf->thresh_mult[THR_ZEROA ] = 1500; + sf->thresh_mult[THR_NEARA ] = 1500; + sf->thresh_mult[THR_NEWA ] = 2000; + sf->thresh_mult[THR_SPLITA ] = 20000; + } + + sf->improved_quant = 0; + sf->improved_dct = 0; + + sf->first_step = 1; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + } + + if (Speed > 1) + { + cpi->mode_check_freq[THR_SPLITG] = 15; + cpi->mode_check_freq[THR_SPLITA] = 15; + cpi->mode_check_freq[THR_SPLITMV] = 7; + + sf->thresh_mult[THR_TM ] = 2000; + sf->thresh_mult[THR_V_PRED ] = 2000; + sf->thresh_mult[THR_H_PRED ] = 2000; + sf->thresh_mult[THR_B_PRED ] = 7500; + + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEWMV ] = 2000; + sf->thresh_mult[THR_SPLITMV ] = 25000; + } + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 2000; + sf->thresh_mult[THR_ZEROG ] = 2000; + sf->thresh_mult[THR_NEARG ] = 2000; + sf->thresh_mult[THR_NEWG ] = 2500; + sf->thresh_mult[THR_SPLITG ] = 50000; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 2000; + sf->thresh_mult[THR_ZEROA ] = 2000; + sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEWA ] = 2500; + sf->thresh_mult[THR_SPLITA ] = 50000; + } + + // Only do recode loop on key frames and golden frames + sf->recode_loop = 2; + + sf->full_freq[0] = 31; + sf->full_freq[1] = 63; + + } + + if (Speed > 2) + { + sf->auto_filter = 0; // Faster selection of loop filter + cpi->mode_check_freq[THR_V_PRED] = 2; + cpi->mode_check_freq[THR_H_PRED] = 2; + cpi->mode_check_freq[THR_B_PRED] = 2; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + cpi->mode_check_freq[THR_NEARG] = 2; + cpi->mode_check_freq[THR_NEWG] = 4; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + cpi->mode_check_freq[THR_NEARA] = 2; + cpi->mode_check_freq[THR_NEWA] = 4; + } + + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + + sf->full_freq[0] = 63; + sf->full_freq[1] = 127; + } + + if (Speed > 3) + { + cpi->mode_check_freq[THR_V_PRED] = 0; + cpi->mode_check_freq[THR_H_PRED] = 0; + cpi->mode_check_freq[THR_B_PRED] = 0; + cpi->mode_check_freq[THR_NEARG] = 0; + cpi->mode_check_freq[THR_NEWG] = 0; + cpi->mode_check_freq[THR_NEARA] 
= 0; + cpi->mode_check_freq[THR_NEWA] = 0; + + sf->auto_filter = 1; + sf->recode_loop = 0; // recode loop off + sf->RD = 0; // Turn rd off + sf->full_freq[0] = INT_MAX; + sf->full_freq[1] = INT_MAX; + } + + if (Speed > 4) + { + sf->auto_filter = 0; // Faster selection of loop filter + + cpi->mode_check_freq[THR_V_PRED] = 2; + cpi->mode_check_freq[THR_H_PRED] = 2; + cpi->mode_check_freq[THR_B_PRED] = 2; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + cpi->mode_check_freq[THR_NEARG] = 2; + cpi->mode_check_freq[THR_NEWG] = 4; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + cpi->mode_check_freq[THR_NEARA] = 2; + cpi->mode_check_freq[THR_NEWA] = 4; + } + + if (cpi->ref_frame_flags & VP8_LAST_FLAG & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 2000; + sf->thresh_mult[THR_ZEROG ] = 2000; + sf->thresh_mult[THR_NEARG ] = 2000; + sf->thresh_mult[THR_NEWG ] = 4000; + } + + if (cpi->ref_frame_flags & VP8_LAST_FLAG & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 2000; + sf->thresh_mult[THR_ZEROA ] = 2000; + sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEWA ] = 4000; + } + } + + break; +#endif + case 2: + sf->optimize_coefficients = 0; + sf->recode_loop = 0; + sf->auto_filter = 1; + sf->iterative_sub_pixel = 1; + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_DC ] = 0; + sf->thresh_mult[THR_TM ] = 0; + sf->thresh_mult[THR_NEARMV ] = 0; + sf->thresh_mult[THR_V_PRED ] = 1000; + sf->thresh_mult[THR_H_PRED ] = 1000; + sf->thresh_mult[THR_B_PRED ] = 2500; + sf->thresh_mult[THR_NEARESTG ] = 1000; + sf->thresh_mult[THR_ZEROG ] = 1000; + sf->thresh_mult[THR_NEARG ] = 1000; + sf->thresh_mult[THR_NEARESTA ] = 1000; + sf->thresh_mult[THR_ZEROA ] = 1000; + sf->thresh_mult[THR_NEARA ] = 1000; + sf->thresh_mult[THR_NEWMV ] = 2000; + sf->thresh_mult[THR_NEWG ] = 2000; + sf->thresh_mult[THR_NEWA ] = 2000; + sf->thresh_mult[THR_SPLITMV ] = 5000; + sf->thresh_mult[THR_SPLITG ] = 10000; + sf->thresh_mult[THR_SPLITA ] = 10000; + sf->full_freq[0] = 15; + sf->full_freq[1] = 31; + sf->search_method = NSTEP; + + if (!cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEWMV ] = INT_MAX; + sf->thresh_mult[THR_NEARESTMV] = INT_MAX; + sf->thresh_mult[THR_ZEROMV ] = INT_MAX; + sf->thresh_mult[THR_NEARMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + } + + if (!cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = INT_MAX; + sf->thresh_mult[THR_ZEROG ] = INT_MAX; + sf->thresh_mult[THR_NEARG ] = INT_MAX; + sf->thresh_mult[THR_NEWG ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + } + + if (!cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = INT_MAX; + sf->thresh_mult[THR_ZEROA ] = INT_MAX; + sf->thresh_mult[THR_NEARA ] = INT_MAX; + sf->thresh_mult[THR_NEWA ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + } + + if (Speed > 0) + { + cpi->mode_check_freq[THR_SPLITG] = 4; + cpi->mode_check_freq[THR_SPLITA] = 4; + cpi->mode_check_freq[THR_SPLITMV] = 2; + + sf->thresh_mult[THR_DC ] = 0; + sf->thresh_mult[THR_TM ] = 1000; + sf->thresh_mult[THR_V_PRED ] = 2000; + sf->thresh_mult[THR_H_PRED ] = 2000; + sf->thresh_mult[THR_B_PRED ] = 5000; + + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_NEARMV ] = 0; + sf->thresh_mult[THR_NEWMV ] = 2000; + sf->thresh_mult[THR_SPLITMV ] = 10000; + } + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 1000; + 
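// Illustrative sketch, not part of the change above: in C, '!' binds tighter
// than '&', so a guard written as `!cpi->ref_frame_flags & VP8_GOLD_FLAG`
// evaluates as (!flags) & FLAG and never tests the individual bit; the earlier
// branches spell the same check as !(cpi->ref_frame_flags & VP8_GOLD_FLAG).
// A minimal standalone demonstration, with flag values assumed for the demo:
#include <stdio.h>

#define DEMO_LAST_FLAG 1   /* assumed bit layout, for illustration only */
#define DEMO_GOLD_FLAG 2
#define DEMO_ALT_FLAG  4

int main(void)
{
    int flags = DEMO_LAST_FLAG | DEMO_ALT_FLAG;   /* golden reference absent */

    printf("!flags & GOLD   -> %d\n", !flags & DEMO_GOLD_FLAG);   /* 0: (!flags) is 0 */
    printf("!(flags & GOLD) -> %d\n", !(flags & DEMO_GOLD_FLAG)); /* 1: intended test */
    return 0;
}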
sf->thresh_mult[THR_ZEROG ] = 1000; + sf->thresh_mult[THR_NEARG ] = 1000; + sf->thresh_mult[THR_NEWG ] = 2000; + sf->thresh_mult[THR_SPLITG ] = 20000; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 1000; + sf->thresh_mult[THR_ZEROA ] = 1000; + sf->thresh_mult[THR_NEARA ] = 1000; + sf->thresh_mult[THR_NEWA ] = 2000; + sf->thresh_mult[THR_SPLITA ] = 20000; + } + + sf->improved_quant = 0; + sf->improved_dct = 0; + } + + if (Speed > 1) + { + cpi->mode_check_freq[THR_SPLITMV] = 7; + cpi->mode_check_freq[THR_SPLITG] = 15; + cpi->mode_check_freq[THR_SPLITA] = 15; + + sf->thresh_mult[THR_TM ] = 2000; + sf->thresh_mult[THR_V_PRED ] = 2000; + sf->thresh_mult[THR_H_PRED ] = 2000; + sf->thresh_mult[THR_B_PRED ] = 5000; + + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEWMV ] = 2000; + sf->thresh_mult[THR_SPLITMV ] = 25000; + } + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 2000; + sf->thresh_mult[THR_ZEROG ] = 2000; + sf->thresh_mult[THR_NEARG ] = 2000; + sf->thresh_mult[THR_NEWG ] = 2500; + sf->thresh_mult[THR_SPLITG ] = 50000; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 2000; + sf->thresh_mult[THR_ZEROA ] = 2000; + sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEWA ] = 2500; + sf->thresh_mult[THR_SPLITA ] = 50000; + } + + sf->full_freq[0] = 31; + sf->full_freq[1] = 63; + } + + if (Speed > 2) + { + sf->auto_filter = 0; // Faster selection of loop filter + + cpi->mode_check_freq[THR_V_PRED] = 2; + cpi->mode_check_freq[THR_H_PRED] = 2; + cpi->mode_check_freq[THR_B_PRED] = 2; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + cpi->mode_check_freq[THR_NEARG] = 2; + cpi->mode_check_freq[THR_NEWG] = 4; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + cpi->mode_check_freq[THR_NEARA] = 2; + cpi->mode_check_freq[THR_NEWA] = 4; + } + + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + + sf->full_freq[0] = 63; + sf->full_freq[1] = 127; + } + + if (Speed > 3) + { + sf->RD = 0; + sf->full_freq[0] = INT_MAX; + sf->full_freq[1] = INT_MAX; + + sf->auto_filter = 1; + } + + if (Speed > 4) + { + sf->auto_filter = 0; // Faster selection of loop filter + +#if CONFIG_REALTIME_ONLY + sf->search_method = HEX; +#else + sf->search_method = DIAMOND; +#endif + + cpi->mode_check_freq[THR_V_PRED] = 4; + cpi->mode_check_freq[THR_H_PRED] = 4; + cpi->mode_check_freq[THR_B_PRED] = 4; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + cpi->mode_check_freq[THR_NEARG] = 2; + cpi->mode_check_freq[THR_NEWG] = 4; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + cpi->mode_check_freq[THR_NEARA] = 2; + cpi->mode_check_freq[THR_NEWA] = 4; + } + + sf->thresh_mult[THR_TM ] = 2000; + sf->thresh_mult[THR_B_PRED ] = 5000; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEARESTG ] = 2000; + sf->thresh_mult[THR_ZEROG ] = 2000; + sf->thresh_mult[THR_NEARG ] = 2000; + sf->thresh_mult[THR_NEWG ] = 4000; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEARESTA ] = 2000; + sf->thresh_mult[THR_ZEROA ] = 2000; + sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEWA ] = 4000; + } + } + + if (Speed > 5) + { + // Disable split MB intra prediction mode + sf->thresh_mult[THR_B_PRED] = INT_MAX; + } + + if (Speed > 6) + { + unsigned int i, sum = 0; + unsigned int total_mbs = cm->MBs; + int thresh; + int total_skip; + + int min = 2000; + sf->iterative_sub_pixel 
= 0; + + if (cpi->oxcf.encode_breakout > 2000) + min = cpi->oxcf.encode_breakout; + + min >>= 7; + + for (i = 0; i < min; i++) + { + sum += cpi->error_bins[i]; + } + + total_skip = sum; + sum = 0; + + // i starts from 2 to make sure thresh started from 2048 + for (; i < 1024; i++) + { + sum += cpi->error_bins[i]; + + if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip)) + break; + } + + i--; + thresh = (i << 7); + + if (thresh < 2000) + thresh = 2000; + + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + sf->thresh_mult[THR_NEWMV] = thresh; + sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1; + sf->thresh_mult[THR_NEARMV ] = thresh >> 1; + } + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + sf->thresh_mult[THR_NEWG] = thresh << 1; + sf->thresh_mult[THR_NEARESTG ] = thresh; + sf->thresh_mult[THR_NEARG ] = thresh; + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + sf->thresh_mult[THR_NEWA] = thresh << 1; + sf->thresh_mult[THR_NEARESTA ] = thresh; + sf->thresh_mult[THR_NEARA ] = thresh; + } + + // Disable other intra prediction modes + sf->thresh_mult[THR_TM] = INT_MAX; + sf->thresh_mult[THR_V_PRED] = INT_MAX; + sf->thresh_mult[THR_H_PRED] = INT_MAX; + + } + + if (Speed > 8) + { + sf->quarter_pixel_search = 0; + } + + if (Speed > 9) + { + int Tmp = cpi->Speed - 8; + + if (Tmp > 4) + Tmp = 4; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_NEARG] = 1 << Tmp; + cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1); + } + + if (cpi->ref_frame_flags & VP8_ALT_FLAG) + { + cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_NEARA] = 1 << Tmp; + cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1); + } + + cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1); + } + + cm->filter_type = NORMAL_LOOPFILTER; + + if (Speed >= 14) + cm->filter_type = SIMPLE_LOOPFILTER; + + if (Speed >= 15) + { + sf->half_pixel_search = 0; // This has a big hit on quality. 
Last resort + } + + vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins)); + + }; + + if (cpi->sf.search_method == NSTEP) + { + vp8_init3smotion_compensation(&cpi->mb, cm->last_frame.y_stride); + } + else if (cpi->sf.search_method == DIAMOND) + { + vp8_init_dsmotion_compensation(&cpi->mb, cm->last_frame.y_stride); + } + + if (cpi->sf.improved_dct) + { + cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4); + cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4); + cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4); + cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4); + } + else + { + cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4); + cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4); + cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4); + cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4); + } + + cpi->mb.vp8_short_fdct4x4_ptr = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4); + cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4); + + if (cpi->sf.improved_quant) + { + cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb); + cpi->mb.quantize_brd = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb); + } + else + { + cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb); + cpi->mb.quantize_brd = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb); + } + +#if CONFIG_RUNTIME_CPU_DETECT + cpi->mb.e_mbd.rtcd = &cpi->common.rtcd; +#endif + + if (cpi->sf.iterative_sub_pixel == 1) + { + cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively; + } + else if (cpi->sf.quarter_pixel_search) + { + cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step; + } + else if (cpi->sf.half_pixel_search) + { + cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step; + } + else + { + cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step; + } + + if (cpi->sf.optimize_coefficients == 1) + cpi->mb.optimize = 1; + else + cpi->mb.optimize = 0; + + if (cpi->common.full_pixel) + cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step; + +#ifdef SPEEDSTATS + frames_at_speed[cpi->Speed]++; +#endif +} +static void alloc_raw_frame_buffers(VP8_COMP *cpi) +{ + int i, buffers; + + buffers = cpi->oxcf.lag_in_frames; + + if (buffers > MAX_LAG_BUFFERS) + buffers = MAX_LAG_BUFFERS; + + if (buffers < 1) + buffers = 1; + + for (i = 0; i < buffers; i++) + if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer, + cpi->oxcf.Width, cpi->oxcf.Height, + 16)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate lag buffer"); + +#if VP8_TEMPORAL_ALT_REF + + if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer, + cpi->oxcf.Width, cpi->oxcf.Height, 16)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); + +#endif + + cpi->source_buffer_count = 0; +} +void vp8_alloc_compressor_data(VP8_COMP *cpi) +{ + VP8_COMMON *cm = & cpi->common; + + int width = cm->Width; + int height = cm->Height; + + if (vp8_alloc_frame_buffers(cm, width, height)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf, + width, height, VP8BORDERINPIXELS)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + if 
(vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + + if (cpi->tok != 0) + vpx_free(cpi->tok); + + { + unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; + + CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + } + + // Data used for real time vc mode to see if gf needs refreshing + cpi->inter_zz_count = 0; + cpi->gf_bad_count = 0; + cpi->gf_update_recommended = 0; +} + + +// Quant MOD +static const int q_trans[] = +{ + 0, 1, 2, 3, 4, 5, 7, 8, + 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, + 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, + 106, 109, 112, 115, 118, 121, 124, 127, +}; + +int vp8_reverse_trans(int x) +{ + int i; + + for (i = 0; i < 64; i++) + if (q_trans[i] >= x) + return i; + + return 63; +}; +void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) +{ + cpi->oxcf.frame_rate = framerate; + cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); + cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); + cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; + + cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; + cpi->max_gf_interval = (int)(cpi->output_frame_rate / 2) + 2; + + //cpi->max_gf_interval = (int)(cpi->output_frame_rate * 2 / 3) + 1; + //cpi->max_gf_interval = 24; + + if (cpi->max_gf_interval < 12) + cpi->max_gf_interval = 12; + + + // Special conditions when altr ref frame enabled + if (cpi->oxcf.play_alternate) + { + if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1) + cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1; + } +} + +void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + VP8_COMMON *cm = &cpi->common; + + if (!cpi) + return; + + cpi->auto_gold = 1; + cpi->auto_adjust_gold_quantizer = 1; + cpi->goldquantizer = 1; + cpi->goldfreq = 7; + cpi->auto_adjust_key_quantizer = 1; + cpi->keyquantizer = 1; + + cm->version = oxcf->Version; + vp8_setup_version(cm); + + if (oxcf == 0) + { + cpi->pass = 0; + + cpi->auto_worst_q = 0; + cpi->oxcf.best_allowed_q = MINQ; + cpi->oxcf.worst_allowed_q = MAXQ; + + cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER; + cpi->oxcf.starting_buffer_level = 4; + cpi->oxcf.optimal_buffer_level = 5; + cpi->oxcf.maximum_buffer_size = 6; + cpi->oxcf.under_shoot_pct = 90; + cpi->oxcf.allow_df = 0; + cpi->oxcf.drop_frames_water_mark = 20; + + cpi->oxcf.allow_spatial_resampling = 0; + cpi->oxcf.resample_down_water_mark = 40; + cpi->oxcf.resample_up_water_mark = 60; + + cpi->oxcf.fixed_q = cpi->interquantizer; + + cpi->filter_type = NORMAL_LOOPFILTER; + + if (cm->simpler_lpf) + cpi->filter_type = SIMPLE_LOOPFILTER; + + cpi->compressor_speed = 1; + cpi->horiz_scale = 0; + cpi->vert_scale = 0; + cpi->oxcf.two_pass_vbrbias = 50; + cpi->oxcf.two_pass_vbrmax_section = 400; + cpi->oxcf.two_pass_vbrmin_section = 0; + + cpi->oxcf.Sharpness = 0; + cpi->oxcf.noise_sensitivity = 0; + } + else + cpi->oxcf = *oxcf; + + + switch (cpi->oxcf.Mode) + { + + case MODE_REALTIME: + cpi->pass = 0; + 
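// Illustrative sketch, not part of the change above: q_trans[] maps the 0-63
// quantizer exposed through the configuration onto the 0-127 internal Q index
// (see its use on worst_allowed_q / best_allowed_q below), and
// vp8_reverse_trans() inverts it by returning the first index whose entry is
// >= x. A standalone round-trip check, with the table copied from this hunk:
#include <stdio.h>

static const int demo_q_trans[64] =
{
    0,   1,   2,   3,   4,   5,   7,   8,
    9,   10,  12,  13,  15,  17,  18,  19,
    20,  21,  23,  24,  25,  26,  27,  28,
    29,  30,  31,  33,  35,  37,  39,  41,
    43,  45,  47,  49,  51,  53,  55,  57,
    59,  61,  64,  67,  70,  73,  76,  79,
    82,  85,  88,  91,  94,  97,  100, 103,
    106, 109, 112, 115, 118, 121, 124, 127,
};

static int demo_reverse_trans(int x)
{
    int i;

    for (i = 0; i < 64; i++)
        if (demo_q_trans[i] >= x)
            return i;

    return 63;
}

int main(void)
{
    int i;

    for (i = 0; i < 64; i++)
        if (demo_reverse_trans(demo_q_trans[i]) != i)
            printf("round trip fails at %d\n", i);  /* never hit: table is strictly increasing */

    printf("reverse_trans(63) = %d\n", demo_reverse_trans(63)); /* 42: first entry >= 63 is 64 */
    return 0;
}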
cpi->compressor_speed = 2; + + if (cpi->oxcf.cpu_used < -16) + { + cpi->oxcf.cpu_used = -16; + } + + if (cpi->oxcf.cpu_used > 16) + cpi->oxcf.cpu_used = 16; + + break; + +#if !(CONFIG_REALTIME_ONLY) + case MODE_GOODQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) + { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) + cpi->oxcf.cpu_used = 5; + + break; + + case MODE_BESTQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 0; + break; + + case MODE_FIRSTPASS: + cpi->pass = 1; + cpi->compressor_speed = 1; + break; + case MODE_SECONDPASS: + cpi->pass = 2; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) + { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) + cpi->oxcf.cpu_used = 5; + + break; + case MODE_SECONDPASS_BEST: + cpi->pass = 2; + cpi->compressor_speed = 0; + break; +#endif + } + + if (cpi->pass == 0) + cpi->auto_worst_q = 1; + + cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; + cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; + + if (oxcf->fixed_q >= 0) + { + if (oxcf->worst_allowed_q < 0) + cpi->oxcf.fixed_q = q_trans[0]; + else + cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q]; + + if (oxcf->alt_q < 0) + cpi->oxcf.alt_q = q_trans[0]; + else + cpi->oxcf.alt_q = q_trans[oxcf->alt_q]; + + if (oxcf->key_q < 0) + cpi->oxcf.key_q = q_trans[0]; + else + cpi->oxcf.key_q = q_trans[oxcf->key_q]; + + if (oxcf->gold_q < 0) + cpi->oxcf.gold_q = q_trans[0]; + else + cpi->oxcf.gold_q = q_trans[oxcf->gold_q]; + + } + + cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; + + //cpi->use_golden_frame_only = 0; + //cpi->use_last_frame_only = 0; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + cm->refresh_entropy_probs = 1; + + if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) + cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; + + setup_features(cpi); + + { + int i; + + for (i = 0; i < MAX_MB_SEGMENTS; i++) + cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; + } + + // At the moment the first order values may not be > MAXQ + if (cpi->oxcf.fixed_q > MAXQ) + cpi->oxcf.fixed_q = MAXQ; + + // local file playback mode == really big buffer + if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) + { + cpi->oxcf.starting_buffer_level = 60; + cpi->oxcf.optimal_buffer_level = 60; + cpi->oxcf.maximum_buffer_size = 240; + + } + + + // Convert target bandwidth from Kbit/s to Bit/s + cpi->oxcf.target_bandwidth *= 1000; + cpi->oxcf.starting_buffer_level *= cpi->oxcf.target_bandwidth; + + if (cpi->oxcf.optimal_buffer_level == 0) + cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; + else + cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth; + + if (cpi->oxcf.maximum_buffer_size == 0) + cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; + else + cpi->oxcf.maximum_buffer_size *= cpi->oxcf.target_bandwidth; + + cpi->buffer_level = cpi->oxcf.starting_buffer_level; + cpi->bits_off_target = cpi->oxcf.starting_buffer_level; + + vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + cpi->worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + cpi->best_quality = cpi->oxcf.best_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? 
TRUE : FALSE; + + + cpi->total_actual_bits = 0; + cpi->total_target_vs_actual = 0; + + // Only allow dropped frames in buffered mode + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + + cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; + + if (!cm->use_bilinear_mc_filter) + cm->mcomp_filter_type = SIXTAP; + else + cm->mcomp_filter_type = BILINEAR; + + cpi->target_bandwidth = cpi->oxcf.target_bandwidth; + + cm->Width = cpi->oxcf.Width ; + cm->Height = cpi->oxcf.Height ; + + cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 + + cm->horiz_scale = cpi->horiz_scale; + cm->vert_scale = cpi->vert_scale ; + + // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) + if (cpi->oxcf.Sharpness > 7) + cpi->oxcf.Sharpness = 7; + + cm->sharpness_level = cpi->oxcf.Sharpness; + + if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) + { + int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); + int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + // always go to the next whole number + cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; + cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; + } + + if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width || + ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height || + cm->last_frame.y_width == 0) + { + alloc_raw_frame_buffers(cpi); + vp8_alloc_compressor_data(cpi); + } + + // Clamp KF frame size to quarter of data rate + if (cpi->intra_frame_target > cpi->target_bandwidth >> 2) + cpi->intra_frame_target = cpi->target_bandwidth >> 2; + + if (cpi->oxcf.fixed_q >= 0) + { + cpi->last_q[0] = cpi->oxcf.fixed_q; + cpi->last_q[1] = cpi->oxcf.fixed_q; + } + + cpi->Speed = cpi->oxcf.cpu_used; + + // force to allowlag to 0 if lag_in_frames is 0; + if (cpi->oxcf.lag_in_frames == 0) + { + cpi->oxcf.allow_lag = 0; + } + // Limit on lag buffers as these are not currently dynamically allocated + else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) + cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; + + // force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, Mode is real time or one pass compress enabled. + if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS)) + { + cpi->oxcf.play_alternate = 0; + cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG; + } + + // YX Temp + cpi->last_alt_ref_sei = -1; + cpi->is_src_frame_alt_ref = 0; + +#if 0 + // Experimental RD Code + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif + +#if VP8_TEMPORAL_ALT_REF + { + int i; + + cpi->fixed_divide[0] = 0; + + for (i = 1; i < 255; i++) + cpi->fixed_divide[i] = 0x10000 / i; + } +#endif +} + +/* + * This function needs more clean up, i.e. be more tuned torwards + * change_config rather than init_config !!!!!!!!!!!!!!!! 
+ * YX - 5/28/2009 + * + */ + +void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + VP8_COMMON *cm = &cpi->common; + + if (!cpi) + return; + + if (!oxcf) + return; + + if (cm->version != oxcf->Version) + { + cm->version = oxcf->Version; + vp8_setup_version(cm); + } + + cpi->oxcf = *oxcf; + + switch (cpi->oxcf.Mode) + { + + case MODE_REALTIME: + cpi->pass = 0; + cpi->compressor_speed = 2; + + if (cpi->oxcf.cpu_used < -16) + { + cpi->oxcf.cpu_used = -16; + } + + if (cpi->oxcf.cpu_used > 16) + cpi->oxcf.cpu_used = 16; + + break; + +#if !(CONFIG_REALTIME_ONLY) + case MODE_GOODQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) + { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) + cpi->oxcf.cpu_used = 5; + + break; + + case MODE_BESTQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 0; + break; + + case MODE_FIRSTPASS: + cpi->pass = 1; + cpi->compressor_speed = 1; + break; + case MODE_SECONDPASS: + cpi->pass = 2; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) + { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) + cpi->oxcf.cpu_used = 5; + + break; + case MODE_SECONDPASS_BEST: + cpi->pass = 2; + cpi->compressor_speed = 0; + break; +#endif + } + + if (cpi->pass == 0) + cpi->auto_worst_q = 1; + + cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; + cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; + + if (oxcf->fixed_q >= 0) + { + if (oxcf->worst_allowed_q < 0) + cpi->oxcf.fixed_q = q_trans[0]; + else + cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q]; + + if (oxcf->alt_q < 0) + cpi->oxcf.alt_q = q_trans[0]; + else + cpi->oxcf.alt_q = q_trans[oxcf->alt_q]; + + if (oxcf->key_q < 0) + cpi->oxcf.key_q = q_trans[0]; + else + cpi->oxcf.key_q = q_trans[oxcf->key_q]; + + if (oxcf->gold_q < 0) + cpi->oxcf.gold_q = q_trans[0]; + else + cpi->oxcf.gold_q = q_trans[oxcf->gold_q]; + + } + + cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? 
cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + + cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; + + //cpi->use_golden_frame_only = 0; + //cpi->use_last_frame_only = 0; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + cm->refresh_entropy_probs = 1; + + if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) + cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; + + setup_features(cpi); + + { + int i; + + for (i = 0; i < MAX_MB_SEGMENTS; i++) + cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; + } + + // At the moment the first order values may not be > MAXQ + if (cpi->oxcf.fixed_q > MAXQ) + cpi->oxcf.fixed_q = MAXQ; + + // local file playback mode == really big buffer + if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) + { + cpi->oxcf.starting_buffer_level = 60; + cpi->oxcf.optimal_buffer_level = 60; + cpi->oxcf.maximum_buffer_size = 240; + + } + + // Convert target bandwidth from Kbit/s to Bit/s + cpi->oxcf.target_bandwidth *= 1000; + + cpi->oxcf.starting_buffer_level *= cpi->oxcf.target_bandwidth; + + if (cpi->oxcf.optimal_buffer_level == 0) + cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; + else + cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth; + + if (cpi->oxcf.maximum_buffer_size == 0) + cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; + else + cpi->oxcf.maximum_buffer_size *= cpi->oxcf.target_bandwidth; + + cpi->buffer_level = cpi->oxcf.starting_buffer_level; + cpi->bits_off_target = cpi->oxcf.starting_buffer_level; + + vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + cpi->worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + cpi->best_quality = cpi->oxcf.best_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? 
TRUE : FALSE; + + + cpi->total_actual_bits = 0; + cpi->total_target_vs_actual = 0; + + // Only allow dropped frames in buffered mode + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + + cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; + + if (!cm->use_bilinear_mc_filter) + cm->mcomp_filter_type = SIXTAP; + else + cm->mcomp_filter_type = BILINEAR; + + cpi->target_bandwidth = cpi->oxcf.target_bandwidth; + + cm->Width = cpi->oxcf.Width ; + cm->Height = cpi->oxcf.Height ; + + cm->horiz_scale = cpi->horiz_scale; + cm->vert_scale = cpi->vert_scale ; + + cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 + + // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) + if (cpi->oxcf.Sharpness > 7) + cpi->oxcf.Sharpness = 7; + + cm->sharpness_level = cpi->oxcf.Sharpness; + + if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) + { + int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); + int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + // always go to the next whole number + cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; + cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; + } + + if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width || + ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height || + cm->last_frame.y_width == 0) + { + alloc_raw_frame_buffers(cpi); + vp8_alloc_compressor_data(cpi); + } + + // Clamp KF frame size to quarter of data rate + if (cpi->intra_frame_target > cpi->target_bandwidth >> 2) + cpi->intra_frame_target = cpi->target_bandwidth >> 2; + + if (cpi->oxcf.fixed_q >= 0) + { + cpi->last_q[0] = cpi->oxcf.fixed_q; + cpi->last_q[1] = cpi->oxcf.fixed_q; + } + + cpi->Speed = cpi->oxcf.cpu_used; + + // force to allowlag to 0 if lag_in_frames is 0; + if (cpi->oxcf.lag_in_frames == 0) + { + cpi->oxcf.allow_lag = 0; + } + // Limit on lag buffers as these are not currently dynamically allocated + else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) + cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; + + // force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, Mode is real time or one pass compress enabled. 
+ if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS)) + { + cpi->oxcf.play_alternate = 0; + cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG; + } + + // YX Temp + cpi->last_alt_ref_sei = -1; + cpi->is_src_frame_alt_ref = 0; + +#if 0 + // Experimental RD Code + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif + +} + +#define M_LOG2_E 0.693147180559945309417 +#define log2f(x) (log (x) / (float) M_LOG2_E) +static void cal_mvsadcosts(int *mvsadcost[2]) +{ + int i = 1; + + mvsadcost [0] [0] = 300; + mvsadcost [1] [0] = 300; + + do + { + double z = 256 * (2 * (log2f(2 * i) + .6)); + mvsadcost [0][i] = (int) z; + mvsadcost [1][i] = (int) z; + mvsadcost [0][-i] = (int) z; + mvsadcost [1][-i] = (int) z; + } + while (++i <= mv_max); +} + +VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) +{ + int i; + volatile union + { + VP8_COMP *cpi; + VP8_PTR ptr; + } ctx; + + VP8_COMP *cpi; + VP8_COMMON *cm; + + cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP)); + // Check that the CPI instance is valid + if (!cpi) + return 0; + + cm = &cpi->common; + + vpx_memset(cpi, 0, sizeof(VP8_COMP)); + + if (setjmp(cm->error.jmp)) + { + VP8_PTR ptr = ctx.ptr; + + ctx.cpi->common.error.setjmp = 0; + vp8_remove_compressor(&ptr); + return 0; + } + + cpi->common.error.setjmp = 1; + + CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA))); + CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); + + vp8_cmachine_specific_config(cpi); + vp8_create_common(&cpi->common); + + vp8_init_config((VP8_PTR)cpi, oxcf); + + memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob)); + cpi->common.current_video_frame = 0; + cpi->kf_overspend_bits = 0; + cpi->kf_bitrate_adjustment = 0; + cpi->frames_till_gf_update_due = 0; + cpi->gf_overspend_bits = 0; + cpi->non_gf_bitrate_adjustment = 0; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + cpi->prob_intra_coded = 63; + + // Prime the recent reference frame useage counters. 
+ // Hereafter they will be maintained as a sort of moving average + cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; + cpi->recent_ref_frame_usage[LAST_FRAME] = 1; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1; + cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1; + + // Set reference frame sign bias for ALTREF frame to 1 (for now) + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + + cpi->gf_decay_rate = 0; + cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; + + cpi->gold_is_last = 0 ; + cpi->alt_is_last = 0 ; + cpi->gold_is_alt = 0 ; + + + + // Create the encoder segmentation map and set all entries to 0 + CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols)); + cpi->active_map_enabled = 0; + + // Create the first pass motion map structure and set to 0 + CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(cpi->common.MBs, 1)); + +#if 0 + // Experimental code for lagged and one pass + // Initialise one_pass GF frames stats + // Update stats used for GF selection + if (cpi->pass == 0) + { + cpi->one_pass_frame_index = 0; + + for (i = 0; i < MAX_LAG_BUFFERS; i++) + { + cpi->one_pass_frame_stats[i].frames_so_far = 0; + cpi->one_pass_frame_stats[i].frame_intra_error = 0.0; + cpi->one_pass_frame_stats[i].frame_coded_error = 0.0; + cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0; + cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0; + cpi->one_pass_frame_stats[i].frame_mvr = 0.0; + cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0; + cpi->one_pass_frame_stats[i].frame_mvc = 0.0; + cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0; + } + } +#endif + + // Should we use the cyclic refresh method. + // Currently this is tied to error resilliant mode + cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode; + cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 40; + cpi->cyclic_refresh_mode_index = 0; + cpi->cyclic_refresh_q = 32; + + if (cpi->cyclic_refresh_mode_enabled) + { + CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); + } + else + cpi->cyclic_refresh_map = (signed char *) NULL; + + // Test function for segmentation + //segmentation_test_function((VP8_PTR) cpi); + + // Loop filter mode / ref deltas test function + //mode_ref_lf_test_function(cpi); + +#ifdef ENTROPY_STATS + init_context_counters(); +#endif + + +#ifdef INTRARDOPT + cpi->intra_rd_opt = 1; + +#endif + + cpi->frames_since_key = 8; // Give a sensible default for the first frame. 
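// Illustrative sketch, not part of the change above: the cyclic refresh budget
// is set to (mb_rows * mb_cols) / 40 macroblocks per frame, so one complete
// pass of the refresh map over the frame takes on the order of 40 inter
// frames. A standalone back-of-the-envelope calculation (16x16 macroblocks,
// as in VP8; the example resolution is arbitrary):
#include <stdio.h>

int main(void)
{
    int width = 640, height = 480;              /* example stream size */
    int mb_cols = (width + 15) / 16;
    int mb_rows = (height + 15) / 16;
    int mbs_in_frame = mb_rows * mb_cols;
    int budget = mbs_in_frame / 40;             /* cyclic_refresh_mode_max_mbs_perframe */

    printf("%dx%d -> %d MBs, ~%d refreshed per frame, ~%d frames per full cycle\n",
           width, height, mbs_in_frame, budget,
           budget ? (mbs_in_frame + budget - 1) / budget : 0 /* degenerate tiny-frame case */);
    return 0;
}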
+ cpi->key_frame_frequency = cpi->oxcf.key_freq; + + cpi->source_alt_ref_pending = FALSE; + cpi->source_alt_ref_active = FALSE; + cpi->common.refresh_alt_ref_frame = 0; + + cpi->b_calculate_psnr = CONFIG_PSNR; +#if CONFIG_PSNR + cpi->b_calculate_ssimg = 0; + + cpi->count = 0; + cpi->bytes = 0; + + if (cpi->b_calculate_psnr) + { + cpi->total_sq_error = 0.0; + cpi->total_sq_error2 = 0.0; + cpi->total_y = 0.0; + cpi->total_u = 0.0; + cpi->total_v = 0.0; + cpi->total = 0.0; + cpi->totalp_y = 0.0; + cpi->totalp_u = 0.0; + cpi->totalp_v = 0.0; + cpi->totalp = 0.0; + cpi->tot_recode_hits = 0; + cpi->summed_quality = 0; + cpi->summed_weights = 0; + } + + if (cpi->b_calculate_ssimg) + { + cpi->total_ssimg_y = 0; + cpi->total_ssimg_u = 0; + cpi->total_ssimg_v = 0; + cpi->total_ssimg_all = 0; + } + +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + cpi->first_time_stamp_ever = LLONG_MAX; + +#endif + + cpi->frames_till_gf_update_due = 0; + cpi->key_frame_count = 1; + cpi->tot_key_frame_bits = 0; + + cpi->ni_av_qi = cpi->oxcf.worst_allowed_q; + cpi->ni_tot_qi = 0; + cpi->ni_frames = 0; + cpi->total_byte_count = 0; + + cpi->drop_frame = 0; + cpi->drop_count = 0; + cpi->max_drop_count = 0; + cpi->max_consec_dropped_frames = 4; + + cpi->rate_correction_factor = 1.0; + cpi->key_frame_rate_correction_factor = 1.0; + cpi->gf_rate_correction_factor = 1.0; + cpi->est_max_qcorrection_factor = 1.0; + + cpi->mb.mvcost[0] = &cpi->mb.mvcosts[0][mv_max+1]; + cpi->mb.mvcost[1] = &cpi->mb.mvcosts[1][mv_max+1]; + cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mv_max+1]; + cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mv_max+1]; + + cal_mvsadcosts(cpi->mb.mvsadcost); + + for (i = 0; i < KEY_FRAME_CONTEXT; i++) + { + cpi->prior_key_frame_size[i] = cpi->intra_frame_target; + cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; + } + + cpi->check_freq[0] = 15; + cpi->check_freq[1] = 15; + +#ifdef OUTPUT_YUV_SRC + yuv_file = fopen("bd.yuv", "ab"); +#endif + +#if 0 + framepsnr = fopen("framepsnr.stt", "a"); + kf_list = fopen("kf_list.stt", "w"); +#endif + + cpi->output_pkt_list = oxcf->output_pkt_list; + +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->pass == 1) + { + vp8_init_first_pass(cpi); + } + else if (cpi->pass == 2) + { + cpi->stats_in = oxcf->two_pass_stats_in.buf; + cpi->stats_in_end = cpi->stats_in + + oxcf->two_pass_stats_in.sz / sizeof(FIRSTPASS_STATS) + - 1; + vp8_init_second_pass(cpi); + } + +#endif + + if (cpi->compressor_speed == 2) + { + cpi->cpu_freq = 0; //vp8_get_processor_freq(); + cpi->avg_encode_time = 0; + cpi->avg_pick_mode_time = 0; + } + + vp8_set_speed_features(cpi); + + // Set starting values of RD threshold multipliers (128 = *1) + for (i = 0; i < MAX_MODES; i++) + { + cpi->rd_thresh_mult[i] = 128; + } + +#ifdef ENTROPY_STATS + init_mv_ref_counts(); +#endif + + vp8cx_create_encoder_threads(cpi); + + cpi->fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16); + cpi->fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16); + cpi->fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16); + cpi->fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3); + cpi->fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d); + +#if !(CONFIG_REALTIME_ONLY) + cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search); +#endif + cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search); + + cpi->ready_for_new_frame = 1; + + cpi->source_encode_index = 0; + + // make sure frame 1 is okay + cpi->error_bins[0] = cpi->common.MBs; + 
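// Illustrative sketch, not part of the change above: cal_mvsadcosts() fills a
// table, centred on zero, where the cost charged to a motion-vector component
// of magnitude i grows as 256 * 2 * (log2(2*i) + 0.6), so small displacements
// stay cheap during the SAD-based search. A standalone version printing a few
// entries (the +/-1023 component range is assumed for the demo):
#include <math.h>
#include <stdio.h>

#define DEMO_MV_MAX 1023

static int demo_sadcosts[2 * DEMO_MV_MAX + 1];

int main(void)
{
    int *cost = demo_sadcosts + DEMO_MV_MAX;    /* index with -mv_max .. mv_max */
    int i;

    cost[0] = 300;                              /* fixed cost at zero, as above */

    for (i = 1; i <= DEMO_MV_MAX; i++)
    {
        double z = 256 * (2 * (log(2.0 * i) / log(2.0) + .6));
        cost[i] = cost[-i] = (int)z;            /* symmetric in the sign of the component */
    }

    for (i = 0; i <= 64; i += 16)
        printf("cost[%3d] = %d\n", i, cost[i]);

    return 0;
}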
+ //vp8cx_init_quantizer() is first called here. Add check in vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only called later + //when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame. + vp8cx_init_quantizer(cpi); + { + vp8_init_loop_filter(cm); + cm->last_frame_type = KEY_FRAME; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + } + cpi->common.error.setjmp = 0; + return (VP8_PTR) cpi; + +} + + +void vp8_remove_compressor(VP8_PTR *ptr) +{ + VP8_COMP *cpi = (VP8_COMP *)(*ptr); + + if (!cpi) + return; + + if (cpi && (cpi->common.current_video_frame > 0)) + { +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->pass == 2) + { + vp8_end_second_pass(cpi); + } + +#endif + +#ifdef ENTROPY_STATS + print_context_counters(); + print_tree_update_probs(); + print_mode_context(); +#endif + +#if CONFIG_PSNR + + if (cpi->pass != 1) + { + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = (cpi->source_end_time_stamp - cpi->first_time_stamp_ever) / 10000000.000; + double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; + double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded; + + if (cpi->b_calculate_psnr) + { + double samples = 3.0 / 2 * cpi->count * cpi->common.last_frame.y_width * cpi->common.last_frame.y_height; + double total_psnr = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error); + double total_psnr2 = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error2); + double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); + + fprintf(f, "Bitrate\AVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(us)\n"); + fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f %8.0f\n", + dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim, + total_encode_time); + } + + if (cpi->b_calculate_ssimg) + { + fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(us)\n"); + fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, + cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count, + cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time); + } + + fclose(f); +#if 0 + f = fopen("qskip.stt", "a"); + fprintf(f, "minq:%d -maxq:%d skipture:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount); + fclose(f); +#endif + + } + +#endif + + +#ifdef SPEEDSTATS + + if (cpi->compressor_speed == 2) + { + int i; + FILE *f = fopen("cxspeed.stt", "a"); + cnt_pm /= cpi->common.MBs; + + for (i = 0; i < 16; i++) + fprintf(f, "%5d", frames_at_speed[i]); + + fprintf(f, "\n"); + //fprintf(f, "%10d PM %10d %10d %10d EF %10d %10d %10d\n", cpi->Speed, cpi->avg_pick_mode_time, (tot_pm/cnt_pm), cnt_pm, cpi->avg_encode_time, 0, 0); + fclose(f); + } + +#endif + + +#ifdef MODE_STATS + { + extern int count_mb_seg[4]; + FILE *f = fopen("modes.stt", "a"); + double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; + fprintf(f, "intra_mode in Intra Frames:\n"); + fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); + fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]); + fprintf(f, "B: "); + { + int i; + + for (i = 0; i < 10; i++) + fprintf(f, "%8d, ", b_modes[i]); + + fprintf(f, "\n"); + + } + + fprintf(f, "Modes in Inter Frames:\n"); + fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n", + inter_y_modes[0], 
inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4], + inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]); + fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]); + fprintf(f, "B: "); + { + int i; + + for (i = 0; i < 15; i++) + fprintf(f, "%8d, ", inter_b_modes[i]); + + fprintf(f, "\n"); + + } + fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]); + fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]); + + + + fclose(f); + } +#endif + +#ifdef ENTROPY_STATS + { + int i, j, k; + FILE *fmode = fopen("modecontext.c", "w"); + + fprintf(fmode, "\n#include \"entropymode.h\"\n\n"); + fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts "); + fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n"); + + for (i = 0; i < 10; i++) + { + + fprintf(fmode, " { //Above Mode : %d\n", i); + + for (j = 0; j < 10; j++) + { + + fprintf(fmode, " {"); + + for (k = 0; k < 10; k++) + { + if (!intra_mode_stats[i][j][k]) + fprintf(fmode, " %5d, ", 1); + else + fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]); + } + + fprintf(fmode, "}, // left_mode %d\n", j); + + } + + fprintf(fmode, " },\n"); + + } + + fprintf(fmode, "};\n"); + } +#endif + + +#if defined(SECTIONBITS_OUTPUT) + + if (0) + { + int i; + FILE *f = fopen("tokenbits.stt", "a"); + + for (i = 0; i < 28; i++) + fprintf(f, "%8d", (int)(Sectionbits[i] / 256)); + + fprintf(f, "\n"); + fclose(f); + } + +#endif + +#if 0 + { + printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); + printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); + } +#endif + + } + + vp8cx_remove_encoder_threads(cpi); + + vp8_dealloc_compressor_data(cpi); + vpx_free(cpi->mb.ss); + vpx_free(cpi->tok); + vpx_free(cpi->rdtok); + vpx_free(cpi->cyclic_refresh_map); + + vp8_remove_common(&cpi->common); + vpx_free(cpi); + *ptr = 0; + +#ifdef OUTPUT_YUV_SRC + fclose(yuv_file); +#endif + +#if 0 + + if (keyfile) + fclose(keyfile); + + if (framepsnr) + fclose(framepsnr); + + if (kf_list) + fclose(kf_list); + +#endif + +} + + +static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, + unsigned char *recon, int recon_stride, + unsigned int cols, unsigned int rows, + vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row + 16 <= rows; row += 16) + { + for (col = 0; col + 16 <= cols; col += 16) + { + unsigned int sse; + + VARIANCE_INVOKE(rtcd, mse16x16)(orig + col, orig_stride, + recon + col, recon_stride, + &sse); + total_sse += sse; + } + + /* Handle odd-sized width */ + if (col < cols) + { + unsigned int border_row, border_col; + unsigned char *border_orig = orig; + unsigned char *border_recon = recon; + + for (border_row = 0; border_row < 16; border_row++) + { + for (border_col = col; border_col < cols; border_col++) + { + diff = border_orig[border_col] - border_recon[border_col]; + total_sse += diff * diff; + } + + border_orig += orig_stride; + border_recon += recon_stride; + } + } + + orig += orig_stride * 16; + recon += recon_stride * 16; + } + + /* Handle odd-sized height */ + for 
(; row < rows; row++) + { + for (col = 0; col < cols; col++) + { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + + return total_sse; +} + + +static void generate_psnr_packet(VP8_COMP *cpi) +{ + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + struct vpx_codec_cx_pkt pkt; + uint64_t sse; + int i; + unsigned int width = cpi->common.Width; + unsigned int height = cpi->common.Height; + + pkt.kind = VPX_CODEC_PSNR_PKT; + sse = calc_plane_error(orig->y_buffer, orig->y_stride, + recon->y_buffer, recon->y_stride, + width, height, + IF_RTCD(&cpi->rtcd.variance)); + pkt.data.psnr.sse[0] = sse; + pkt.data.psnr.sse[1] = sse; + pkt.data.psnr.samples[0] = width * height; + pkt.data.psnr.samples[1] = width * height; + + width = (width + 1) / 2; + height = (height + 1) / 2; + + sse = calc_plane_error(orig->u_buffer, orig->uv_stride, + recon->u_buffer, recon->uv_stride, + width, height, + IF_RTCD(&cpi->rtcd.variance)); + pkt.data.psnr.sse[0] += sse; + pkt.data.psnr.sse[2] = sse; + pkt.data.psnr.samples[0] += width * height; + pkt.data.psnr.samples[2] = width * height; + + sse = calc_plane_error(orig->v_buffer, orig->uv_stride, + recon->v_buffer, recon->uv_stride, + width, height, + IF_RTCD(&cpi->rtcd.variance)); + pkt.data.psnr.sse[0] += sse; + pkt.data.psnr.sse[3] = sse; + pkt.data.psnr.samples[0] += width * height; + pkt.data.psnr.samples[3] = width * height; + + for (i = 0; i < 4; i++) + pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0, + pkt.data.psnr.sse[i]); + + vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); +} + + +int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + if (ref_frame_flags > 7) + return -1 ; + + cpi->ref_frame_flags = ref_frame_flags; + return 0; +} +int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + + if (ref_frame_flags > 7) + return -1 ; + + cpi->common.refresh_golden_frame = 0; + cpi->common.refresh_alt_ref_frame = 0; + cpi->common.refresh_last_frame = 0; + + if (ref_frame_flags & VP8_LAST_FLAG) + cpi->common.refresh_last_frame = 1; + + if (ref_frame_flags & VP8_GOLD_FLAG) + cpi->common.refresh_golden_frame = 1; + + if (ref_frame_flags & VP8_ALT_FLAG) + cpi->common.refresh_alt_ref_frame = 1; + + return 0; +} + +int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + VP8_COMMON *cm = &cpi->common; + + if (ref_frame_flag == VP8_LAST_FLAG) + vp8_yv12_copy_frame_ptr(&cm->last_frame, sd); + + else if (ref_frame_flag == VP8_GOLD_FLAG) + vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd); + + else if (ref_frame_flag == VP8_ALT_FLAG) + vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd); + + else + return -1; + + return 0; +} +int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) +{ + VP8_COMP *cpi = (VP8_COMP *)(ptr); + VP8_COMMON *cm = &cpi->common; + + if (ref_frame_flag == VP8_LAST_FLAG) + vp8_yv12_copy_frame_ptr(sd, &cm->last_frame); + + else if (ref_frame_flag == VP8_GOLD_FLAG) + vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame); + + else if (ref_frame_flag == VP8_ALT_FLAG) + vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame); + + else + return -1; + + return 0; +} +int vp8_update_entropy(VP8_PTR comp, int update) +{ + VP8_COMP *cpi = (VP8_COMP *) comp; + VP8_COMMON *cm = &cpi->common; + cm->refresh_entropy_probs = update; + + return 0; +} + +void 
vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s) +{ + FILE *yuv_file = fopen(name, "ab"); + unsigned char *src = s->y_buffer; + int h = s->y_height; + + do + { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } + while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do + { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } + while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do + { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } + while (--h); + + fclose(yuv_file); +} + +static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + // are we resizing the image + if (cm->horiz_scale != 0 || cm->vert_scale != 0) + { +#if CONFIG_SPATIAL_RESAMPLING + int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); + int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int tmp_height; + + if (cm->vert_scale == 3) + tmp_height = 9; + else + tmp_height = 11; + + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer, + tmp_height, hs, hr, vs, vr, 0); + + cpi->Source = &cpi->scaled_source; +#endif + } + // we may need to copy to a buffer so we can extend the image... + else if (cm->Width != cm->last_frame.y_width || + cm->Height != cm->last_frame.y_height) + { + //vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source); +#if HAVE_ARMV7 + vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source); +#else + vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source); +#endif + + cpi->Source = &cpi->scaled_source; + } + + vp8_extend_to_multiple_of16(cpi->Source, cm->Width, cm->Height); + +} +static void resize_key_frame(VP8_COMP *cpi) +{ +#if CONFIG_SPATIAL_RESAMPLING + VP8_COMMON *cm = &cpi->common; + + // Do we need to apply resampling for one pass cbr. + // In one pass this is more limited than in two pass cbr + // The test and any change is only made one per key frame sequence + if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) + { + int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); + int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int new_width, new_height; + + // If we are below the resample DOWN watermark then scale down a notch. + if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) + { + cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; + cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + } + // Should we now start scaling back up + else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) + { + cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; + cm->vert_scale = (cm->vert_scale > NORMAL) ? 
cm->vert_scale - 1 : NORMAL; + } + + // Get the new hieght and width + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; + new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs; + + // If the image size has changed we need to reallocate the buffers + // and resample the source image + if ((cm->Width != new_width) || (cm->Height != new_height)) + { + cm->Width = new_width; + cm->Height = new_height; + vp8_alloc_compressor_data(cpi); + scale_and_extend_source(cpi->un_scaled_source, cpi); + } + } + +#endif +} +// return of 0 means drop frame +static int pick_frame_size(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + // First Frame is a special case + if (cm->current_video_frame == 0) + { +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->pass == 2) + vp8_calc_auto_iframe_target_size(cpi); + + // 1 Pass there is no information on which to base size so use bandwidth per second * fixed fraction + else +#endif + cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2; + + // in error resilient mode the first frame is bigger since it likely contains + // all the static background + if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2)) + { + cpi->this_frame_target *= 3; // 5; + } + + // Key frame from VFW/auto-keyframe/first frame + cm->frame_type = KEY_FRAME; + + } + // Auto key frames (Only two pass will enter here) + else if (cm->frame_type == KEY_FRAME) + { + vp8_calc_auto_iframe_target_size(cpi); + } + // Forced key frames (by interval or an external signal) + else if ((cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) + { + // Key frame from VFW/auto-keyframe/first frame + cm->frame_type = KEY_FRAME; + + resize_key_frame(cpi); + + // Compute target frame size + if (cpi->pass != 2) + vp8_calc_iframe_target_size(cpi); + } + else + { + // INTER frame: compute target frame size + cm->frame_type = INTER_FRAME; + vp8_calc_pframe_target_size(cpi); + + // Check if we're dropping the frame: + if (cpi->drop_frame) + { + cpi->drop_frame = FALSE; + cpi->drop_count++; + return 0; + } + } + + // Note target_size in bits * 256 per MB + cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs; + + return 1; +} +static void set_quantizer(VP8_COMP *cpi, int Q) +{ + VP8_COMMON *cm = &cpi->common; + MACROBLOCKD *mbd = &cpi->mb.e_mbd; + + cm->base_qindex = Q; + + cm->y1dc_delta_q = 0; + cm->y2dc_delta_q = 0; + cm->y2ac_delta_q = 0; + cm->uvdc_delta_q = 0; + cm->uvac_delta_q = 0; + + // Set Segment specific quatizers + mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0]; + mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1]; + mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2]; + mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3]; +} + +static void update_alt_ref_frame_and_stats(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + // Update the golden frame buffer + vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame); + + // Select an interval before next GF or altref + if (!cpi->auto_gold) + cpi->frames_till_gf_update_due = cpi->goldfreq; + + if ((cpi->pass != 2) && cpi->frames_till_gf_update_due) + { + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + + // Set the bits per frame that we should try and recover in subsequent inter frames + // to account for 
the extra GF spend... note that his does not apply for GF updates + // that occur coincident with a key frame as the extra cost of key frames is dealt + // with elsewhere. + + cpi->gf_overspend_bits += cpi->projected_frame_size; + cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due; + } + + // Update data structure that monitors level of reference to last GF + vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cm->gf_active_count = cm->mb_rows * cm->mb_cols; + // this frame refreshes means next frames don't unless specified by user + + cpi->common.frames_since_golden = 0; + + // Clear the alternate reference update pending flag. + cpi->source_alt_ref_pending = FALSE; + + // Set the alternate refernce frame active flag + cpi->source_alt_ref_active = TRUE; + + +} +static void update_golden_frame_and_stats(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + // Update the Golden frame reconstruction buffer if signalled and the GF usage counts. + if (cm->refresh_golden_frame) + { + // Update the golden frame buffer + vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame); + + // Select an interval before next GF + if (!cpi->auto_gold) + cpi->frames_till_gf_update_due = cpi->goldfreq; + + if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0)) + { + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + + // Set the bits per frame that we should try and recover in subsequent inter frames + // to account for the extra GF spend... note that his does not apply for GF updates + // that occur coincident with a key frame as the extra cost of key frames is dealt + // with elsewhere. + if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active) + { + // Calcluate GF bits to be recovered + // Projected size - av frame bits available for inter frames for clip as a whole + cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target); + } + + cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due; + + } + + // Update data structure that monitors level of reference to last GF + vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cm->gf_active_count = cm->mb_rows * cm->mb_cols; + + // this frame refreshes means next frames don't unless specified by user + cm->refresh_golden_frame = 0; + cpi->common.frames_since_golden = 0; + + //if ( cm->frame_type == KEY_FRAME ) + //{ + cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; + cpi->recent_ref_frame_usage[LAST_FRAME] = 1; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1; + cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1; + //} + //else + //{ + // // Carry a potrtion of count over to begining of next gf sequence + // cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5; + // cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5; + // cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5; + // cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5; + //} + + // ******** Fixed Q test code only ************ + // If we are going to use the ALT reference for the next group of frames set a flag to say so. 
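/*
 * Illustrative sketch of the overspend bookkeeping earlier in this function:
 * the extra bits spent on a golden / alt-ref update are amortised over the
 * frames left until the next update, and that amount is later recovered from
 * the inter-frame targets via non_gf_bitrate_adjustment.  The helper below is
 * hypothetical; only the arithmetic mirrors the code above.
 */
static int per_frame_gf_recovery(int gf_overspend_bits,
                                 int frames_till_gf_update_due)
{
    if (frames_till_gf_update_due <= 0)   /* same guard as the code above */
        return 0;

    return gf_overspend_bits / frames_till_gf_update_due;
}
/*
 * Example: a golden frame that comes in 40000 bits over its inter-frame
 * target, with 10 frames until the next update, gives back 4000 bits from
 * each following inter frame.
 */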
+ if (cpi->oxcf.fixed_q >= 0 && + cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) + { + cpi->source_alt_ref_pending = TRUE; + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } + + if (!cpi->source_alt_ref_pending) + cpi->source_alt_ref_active = FALSE; + + // Decrement count down till next gf + if (cpi->frames_till_gf_update_due > 0) + cpi->frames_till_gf_update_due--; + + } + else if (!cpi->common.refresh_alt_ref_frame) + { + // Decrement count down till next gf + if (cpi->frames_till_gf_update_due > 0) + cpi->frames_till_gf_update_due--; + + if (cpi->common.frames_till_alt_ref_frame) + cpi->common.frames_till_alt_ref_frame --; + + cpi->common.frames_since_golden ++; + + if (cpi->common.frames_since_golden > 1) + { + cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME]; + cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME]; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]; + cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME]; + } + } +} + +// This function updates the reference frame probability estimates that +// will be used during mode selection +static void update_rd_ref_frame_probs(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + +#if 0 + const int *const rfct = cpi->recent_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + if (cm->frame_type == KEY_FRAME) + { + cpi->prob_intra_coded = 255; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + else if (!(rf_intra + rf_inter)) + { + // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank. + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + else + { + cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); + + if (cpi->prob_intra_coded < 1) + cpi->prob_intra_coded = 1; + + if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active) + { + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + if (cpi->prob_last_coded < 1) + cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (cpi->prob_gf_coded < 1) + cpi->prob_gf_coded = 1; + } + } + +#else + const int *const rfct = cpi->count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + if (cm->frame_type == KEY_FRAME) + { + cpi->prob_intra_coded = 255; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + else if (!(rf_intra + rf_inter)) + { + // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank. + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + else + { + cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter); + + if (cpi->prob_intra_coded < 1) + cpi->prob_intra_coded = 1; + + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + if (cpi->prob_last_coded < 1) + cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? 
(rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; + + if (cpi->prob_gf_coded < 1) + cpi->prob_gf_coded = 1; + } + + // update reference frame costs since we can do better than what we got last frame. + + if (cpi->common.refresh_alt_ref_frame) + { + cpi->prob_intra_coded += 40; + cpi->prob_last_coded = 200; + cpi->prob_gf_coded = 1; + } + else if (cpi->common.frames_since_golden == 0) + { + cpi->prob_last_coded = 214; + cpi->prob_gf_coded = 1; + } + else if (cpi->common.frames_since_golden == 1) + { + cpi->prob_last_coded = 192; + cpi->prob_gf_coded = 220; + } + else if (cpi->source_alt_ref_active) + { + //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden; + cpi->prob_gf_coded -= 20; + + if (cpi->prob_gf_coded < 10) + cpi->prob_gf_coded = 10; + } + +#endif +} + + +// 1 = key, 0 = inter +static int decide_key_frame(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + int code_key_frame = FALSE; + + cpi->kf_boost = 0; + + if (cpi->Speed > 11) + return FALSE; + + // Clear down mmx registers + vp8_clear_system_state(); //__asm emms; + + if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0)) + { + double change = 1.0 * abs((int)(cpi->intra_error - cpi->last_intra_error)) / (1 + cpi->last_intra_error); + double change2 = 1.0 * abs((int)(cpi->prediction_error - cpi->last_prediction_error)) / (1 + cpi->last_prediction_error); + double minerror = cm->MBs * 256; + +#if 0 + + if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15 + && cpi->prediction_error > minerror + && (change > .25 || change2 > .25)) + { + FILE *f = fopen("intra_inter.stt", "a"); + + if (cpi->prediction_error <= 0) + cpi->prediction_error = 1; + + fprintf(f, "%d %d %d %d %14.4f\n", + cm->current_video_frame, + (int) cpi->prediction_error, + (int) cpi->intra_error, + (int)((10 * cpi->intra_error) / cpi->prediction_error), + change); + + fclose(f); + } + +#endif + + cpi->last_intra_error = cpi->intra_error; + cpi->last_prediction_error = cpi->prediction_error; + + if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15 + && cpi->prediction_error > minerror + && (change > .25 || change2 > .25)) + { + /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/ + return TRUE; + } + + return FALSE; + + } + + // If the following are true we might as well code a key frame + if (((cpi->this_frame_percent_intra == 100) && + (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) || + ((cpi->this_frame_percent_intra > 95) && + (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5)))) + { + code_key_frame = TRUE; + } + // in addition if the following are true and this is not a golden frame then code a key frame + // Note that on golden frames there often seems to be a pop in intra useage anyway hence this + // restriction is designed to prevent spurious key frames. The Intra pop needs to be investigated. 
+ else if (((cpi->this_frame_percent_intra > 60) && + (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) || + ((cpi->this_frame_percent_intra > 75) && + (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) || + ((cpi->this_frame_percent_intra > 90) && + (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10)))) + { + if (!cm->refresh_golden_frame) + code_key_frame = TRUE; + } + + return code_key_frame; + +} + +#if !(CONFIG_REALTIME_ONLY) +static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) +{ + (void) size; + (void) dest; + (void) frame_flags; + set_quantizer(cpi, 26); + + scale_and_extend_source(cpi->un_scaled_source, cpi); + vp8_first_pass(cpi); +} +#endif + +#if 0 +void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) +{ + + // write the frame + FILE *yframe; + int i; + char filename[255]; + + sprintf(filename, "cx\\y%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->y_height; i++) + fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe); + + fclose(yframe); + sprintf(filename, "cx\\u%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->uv_height; i++) + fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe); + + fclose(yframe); + sprintf(filename, "cx\\v%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->uv_height; i++) + fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe); + + fclose(yframe); +} +#endif +// return of 0 means drop frame + +#if VP8_TEMPORAL_ALT_REF +static void vp8cx_temp_blur1_c +( + unsigned char **frames, + int frame_count, + unsigned char *src, + unsigned char *dst, + int width, + int stride, + int height, + int strength, + int *fixed_divide, + unsigned char *motion_map_ptr, + unsigned char block_size +) +{ + int byte = 0; // Buffer offset for the current pixel value being filtered + int frame = 0; + int modifier = 0; + int i, j, k; + int block_ofset; + int Cols, Rows; + unsigned char Shift = (block_size == 16) ? 
4 : 3; + + Cols = width / block_size; + Rows = height / block_size; + + for (i = 0; i < height; i++) + { + block_ofset = (i >> Shift) * Cols; + + for (j = 0; j < Cols; j ++) + { + if (motion_map_ptr[block_ofset] > 2) + { + vpx_memcpy(&dst[byte], &src[byte], block_size); + byte += block_size; + } + else + { + for (k = 0; k < block_size; k++) + { + int accumulator = 0; + int count = 0; + int src_byte = src[byte]; + + for (frame = 0; frame < frame_count; frame++) + { + // get current frame pixel value + int pixel_value = frames[frame][byte]; // int pixel_value = *frameptr; + + modifier = src_byte; // modifier = s[byte]; + modifier -= pixel_value; + modifier *= modifier; + modifier >>= strength; + modifier *= 3; + + if (modifier > 16) + modifier = 16; + + modifier = 16 - modifier; + + accumulator += modifier * pixel_value; + + count += modifier; + } + + accumulator += (count >> 1); + accumulator *= fixed_divide[count]; // accumulator *= ppi->fixed_divide[count]; + accumulator >>= 16; + + dst[byte] = accumulator; // d[byte] = accumulator; + + // move to next pixel + byte++; + } + } + + block_ofset++; + } + + // Step byte on over the UMV border to the start of the next line + byte += stride - width; + } +} + +static void vp8cx_temp_filter_c +( + VP8_COMP *cpi +) +{ + YV12_BUFFER_CONFIG *temp_source_buffer; + int *fixed_divide = cpi->fixed_divide; + + int frame = 0; + int max_frames = 11; + + int num_frames_backward = 0; + int num_frames_forward = 0; + int frames_to_blur_backward = 0; + int frames_to_blur_forward = 0; + int frames_to_blur = 0; + int start_frame = 0; + + int strength = cpi->oxcf.arnr_strength; + + int blur_type = cpi->oxcf.arnr_type; + + int new_max_frames = cpi->oxcf.arnr_max_frames; + + if (new_max_frames > 0) + max_frames = new_max_frames; + + num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index; + + if (num_frames_backward < 0) + num_frames_backward += cpi->oxcf.lag_in_frames; + + num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1); + + switch (blur_type) + { + case 1: + ///////////////////////////////////////// + // Backward Blur + + frames_to_blur_backward = num_frames_backward; + + if (frames_to_blur_backward >= max_frames) + frames_to_blur_backward = max_frames - 1; + + frames_to_blur = frames_to_blur_backward + 1; + break; + + case 2: + ///////////////////////////////////////// + // Forward Blur + + frames_to_blur_forward = num_frames_forward; + + if (frames_to_blur_forward >= max_frames) + frames_to_blur_forward = max_frames - 1; + + frames_to_blur = frames_to_blur_forward + 1; + break; + + case 3: + ///////////////////////////////////////// + // Center Blur + frames_to_blur_forward = num_frames_forward; + frames_to_blur_backward = num_frames_backward; + + if (frames_to_blur_forward > frames_to_blur_backward) + frames_to_blur_forward = frames_to_blur_backward; + + if (frames_to_blur_backward > frames_to_blur_forward) + frames_to_blur_backward = frames_to_blur_forward; + + if (frames_to_blur_forward > (max_frames / 2)) + frames_to_blur_forward = (max_frames / 2); + + if (frames_to_blur_backward > (max_frames / 2)) + frames_to_blur_backward = (max_frames / 2); + + frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; + break; + + default: + ///////////////////////////////////////// + // At most 4 frames forward Blur + frames_to_blur_forward = 4; + frames_to_blur_backward = num_frames_backward; + + if (max_frames > 5) + { + if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames) + { + 
frames_to_blur_backward = max_frames - frames_to_blur_forward - 1; + } + } + else + { + frames_to_blur_forward = max_frames - 1; + frames_to_blur_backward = 0; + } + + frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; + break; + } + + start_frame = (cpi->last_alt_ref_sei + frames_to_blur_forward) % cpi->oxcf.lag_in_frames; + +#ifdef DEBUGFWG + // DEBUG FWG + printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d" + , max_frames + , num_frames_backward + , num_frames_forward + , frames_to_blur + , frames_to_blur_backward + , frames_to_blur_forward + , cpi->source_encode_index + , cpi->last_alt_ref_sei + , start_frame); +#endif + + for (frame = 0; frame < frames_to_blur; frame++) + { + int which_buffer = start_frame - frame; + + if (which_buffer < 0) + which_buffer += cpi->oxcf.lag_in_frames; + + cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.y_buffer; + } + + temp_source_buffer = &cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer; + + // Blur Y + vp8cx_temp_blur1_c( + cpi->frames, + frames_to_blur, + temp_source_buffer->y_buffer, // cpi->Source->y_buffer, + cpi->alt_ref_buffer.source_buffer.y_buffer, // cpi->Source->y_buffer, + temp_source_buffer->y_width, + temp_source_buffer->y_stride, + temp_source_buffer->y_height, + //temp_source_buffer->y_height * temp_source_buffer->y_stride, + strength, + fixed_divide, + cpi->fp_motion_map, 16); + + for (frame = 0; frame < frames_to_blur; frame++) + { + int which_buffer = cpi->last_alt_ref_sei - frame; + + if (which_buffer < 0) + which_buffer += cpi->oxcf.lag_in_frames; + + cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.u_buffer; + } + + // Blur U + vp8cx_temp_blur1_c( + cpi->frames, + frames_to_blur, + temp_source_buffer->u_buffer, + cpi->alt_ref_buffer.source_buffer.u_buffer, // cpi->Source->u_buffer, + temp_source_buffer->uv_width, + temp_source_buffer->uv_stride, + temp_source_buffer->uv_height, + //temp_source_buffer->uv_height * temp_source_buffer->uv_stride, + strength, + fixed_divide, + cpi->fp_motion_map, 8); + + for (frame = 0; frame < frames_to_blur; frame++) + { + int which_buffer = cpi->last_alt_ref_sei - frame; + + if (which_buffer < 0) + which_buffer += cpi->oxcf.lag_in_frames; + + cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.v_buffer; + } + + // Blur V + vp8cx_temp_blur1_c( + cpi->frames, + frames_to_blur, + temp_source_buffer->v_buffer, + cpi->alt_ref_buffer.source_buffer.v_buffer, // cpi->Source->v_buffer, + temp_source_buffer->uv_width, + temp_source_buffer->uv_stride, + //temp_source_buffer->uv_height * temp_source_buffer->uv_stride, + temp_source_buffer->uv_height, + strength, + fixed_divide, + cpi->fp_motion_map, 8); +} +#endif + + +static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) +{ + int Q; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + int Loop = FALSE; + int loop_count; + int this_q; + int last_zbin_oq; + + int q_low; + int q_high; + int zbin_oq_high; + int zbin_oq_low = 0; + int top_index; + int bottom_index; + VP8_COMMON *cm = &cpi->common; + int active_worst_qchanged = FALSE; + + int overshoot_seen = FALSE; + int undershoot_seen = FALSE; + int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100; + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + + // Clear down mmx registers to allow floating point in what follows + 
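/*
 * Illustrative sketch of the per-pixel weighting used by vp8cx_temp_blur1_c()
 * above: each candidate frame's pixel is weighted by how close it is to the
 * source pixel, with the squared difference scaled down by 'strength' and
 * clamped so the weight lies in 0..16.  The real code divides by the total
 * weight with a reciprocal table (fixed_divide); plain integer division
 * stands in for it here, so this is a model of the arithmetic, not the exact
 * routine.
 */
static int temporal_blur_pixel(const unsigned char *candidates, int frame_count,
                               unsigned char src_pixel, int strength)
{
    int accumulator = 0;
    int count = 0;
    int frame;

    for (frame = 0; frame < frame_count; frame++)
    {
        int diff = (int)src_pixel - (int)candidates[frame];
        int modifier = (diff * diff) >> strength;

        modifier *= 3;
        if (modifier > 16)
            modifier = 16;
        modifier = 16 - modifier;            /* identical pixels get full weight */

        accumulator += modifier * candidates[frame];
        count += modifier;
    }

    if (count == 0)                          /* all candidates too different */
        return src_pixel;

    return (accumulator + (count >> 1)) / count;   /* rounded weighted average */
}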
vp8_clear_system_state(); + + // Test code for segmentation of gf/arf (0,0) + //segmentation_test_function((VP8_PTR) cpi); + + // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->pass == 2) + { + if (cpi->common.refresh_alt_ref_frame) + { + cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for the alt ref frame + cpi->target_bandwidth = cpi->gf_bits * cpi->output_frame_rate; // per second target bitrate + } + } + else +#endif + cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate); + + // Default turn off buffer to buffer copying + cm->copy_buffer_to_gf = 0; + cm->copy_buffer_to_arf = 0; + + // Clear zbin over-quant value and mode boost values. + cpi->zbin_over_quant = 0; + cpi->zbin_mode_boost = 0; + + // Enable mode based tweaking of the zbin + cpi->zbin_mode_boost_enabled = TRUE; + + // Current default encoder behaviour for the altref sign bias + if (cpi->source_alt_ref_active) + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + else + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0; + + // Check to see if a key frame is signalled + // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass. + if ((cm->current_video_frame == 0) || + (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) + { + // Key frame from VFW/auto-keyframe/first frame + cm->frame_type = KEY_FRAME; + } + + // Set default state for segment and mode based loop filter update flags + cpi->mb.e_mbd.update_mb_segmentation_map = 0; + cpi->mb.e_mbd.update_mb_segmentation_data = 0; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; + + // Set various flags etc to special state if it is a key frame + if (cm->frame_type == KEY_FRAME) + { + int i; + + // If segmentation is enabled force a map update for key frames + if (cpi->mb.e_mbd.segmentation_enabled) + { + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; + } + + // If mode or reference frame based loop filter deltas are enabled then force an update for key frames. 
+ if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled) + { + cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + } + + // The alternate reference frame cannot be active for a key frame + cpi->source_alt_ref_active = FALSE; + + // Reset the RD threshold multipliers to default of * 1 (128) + for (i = 0; i < MAX_MODES; i++) + { + cpi->rd_thresh_mult[i] = 128; + } + } + + // Test code for segmentation + //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0)) + //if ( (cm->current_video_frame % 2) == 0 ) + // enable_segmentation((VP8_PTR)cpi); + //else + // disable_segmentation((VP8_PTR)cpi); + +#if 0 + // Experimental code for lagged compress and one pass + // Initialise one_pass GF frames stats + // Update stats used for GF selection + //if ( cpi->pass == 0 ) + { + cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS; + + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0; + } +#endif + + update_rd_ref_frame_probs(cpi); + + if (cpi->drop_frames_allowed) + { + // The reset to decimation 0 is only done here for one pass. + // Once it is set two pass leaves decimation on till the next kf. + if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) + cpi->decimation_factor --; + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) + cpi->decimation_factor = 1; + + else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) + { + cpi->decimation_factor = 3; + } + else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) + { + cpi->decimation_factor = 2; + } + else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) + { + cpi->decimation_factor = 1; + } + + //vpx_log("Encoder: Decimation Factor: %d \n",cpi->decimation_factor); + } + + // The following decimates the frame rate according to a regular pattern (i.e. to 1/2 or 2/3 frame rate) + // This can be used to help prevent buffer under-run in CBR mode. Alternatively it might be desirable in + // some situations to drop frame rate but throw more bits at each frame. + // + // Note that dropping a key frame can be problematic if spatial resampling is also active + if (cpi->decimation_factor > 0) + { + switch (cpi->decimation_factor) + { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + // Note that we should not throw out a key frame (especially when spatial resampling is enabled). 
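/*
 * Illustrative sketch of the decimation pattern implemented around this point
 * (the per-frame boost just above, the drop/credit logic just below): with
 * decimation factor f, one frame is coded and the next f are dropped, so
 * roughly 1/(f+1) of the input frames are encoded.  The coded frames get only
 * a modest boost (3/2 for f==1, 5/4 otherwise) rather than the full (f+1)x
 * saving; the budget of each dropped frame is credited back to
 * bits_off_target, refilling the buffer.  Helper name is hypothetical.
 */
static int decimated_frame_target(int per_frame_bandwidth, int decimation_factor)
{
    switch (decimation_factor)
    {
    case 1:  return per_frame_bandwidth * 3 / 2;
    case 2:
    case 3:  return per_frame_bandwidth * 5 / 4;
    default: return per_frame_bandwidth;
    }
}
/*
 * Example: at 20000 bits per frame, factor 1 codes every other frame with a
 * 30000-bit target and, if that target is hit, banks the remaining 10000 bits
 * of the pair in the buffer.
 */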
+ if ((cm->frame_type == KEY_FRAME)) // && cpi->oxcf.allow_spatial_resampling ) + { + cpi->decimation_count = cpi->decimation_factor; + } + else if (cpi->decimation_count > 0) + { + cpi->decimation_count --; + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + cm->current_video_frame++; + cpi->frames_since_key++; + +#if CONFIG_PSNR + cpi->count ++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + return; + } + else + cpi->decimation_count = cpi->decimation_factor; + } + + // Decide how big to make the frame + if (!pick_frame_size(cpi)) + { + cm->current_video_frame++; + cpi->frames_since_key++; + return; + } + + // Reduce active_worst_allowed_q for CBR if our buffer is getting too full. + // This has a knock on effect on active best quality as well. + // For CBR if the buffer reaches its maximum level then we can no longer + // save up bits for later frames so we might as well use them up + // on the current frame. + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode) + { + int Adjustment = cpi->active_worst_quality / 4; // Max adjustment is 1/4 + + if (Adjustment) + { + int buff_lvl_step; + int tmp_lvl = cpi->buffer_level; + + if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size) + { + buff_lvl_step = (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level) / Adjustment; + + if (buff_lvl_step) + { + Adjustment = (cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / buff_lvl_step; + cpi->active_worst_quality -= Adjustment; + } + } + else + { + cpi->active_worst_quality -= Adjustment; + } + } + } + + // Set an active best quality and if necessary active worst quality + if (cpi->pass == 2 || (cm->current_video_frame > 150)) + { + //if ( (cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame ) + int Q; + int i; + int bpm_target; + + Q = cpi->active_worst_quality; + + if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) + { + vp8_clear_system_state(); + + if (cm->frame_type != KEY_FRAME) + { + // Where a gf overlays an existing arf then allow active max Q to drift to highest allowed value. 
+ //if ( cpi->common.refresh_golden_frame && cpi->source_alt_ref_active ) + //cpi->active_worst_quality = cpi->worst_quality; + + if (cpi->avg_frame_qindex < cpi->active_worst_quality) + Q = cpi->avg_frame_qindex; + + if (cpi->section_is_low_motion) + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 3 / 2) + 128)) / 64; + else if (cpi->section_is_fast_motion) + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 128)) / 64; + else + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 5 / 4) + 128)) / 64; + } + // KEY FRAMES + else + { + if (cpi->section_is_low_motion) + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 240)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127 + else + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64; + } + + for (i = Q; i > 0; i--) + { + if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i]) + break; + } + + cpi->active_best_quality = i; + + // this entire section could be replaced by a look up table +#if 0 + { + int Q, best_q[128]; + + for (Q = 0; Q < 128; Q++) + { + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127 + + for (i = Q; i > 0; i--) + { + if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i]) + break; + } + + best_q[Q] = i; + } + + Q += 0; + } +#endif + + } + else + { + vp8_clear_system_state(); + + //bpm_target = (vp8_bits_per_mb[cm->frame_type][Q]*(Q+128))/64; // Approx 2 to 4 where Q has the range 0-127 + bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 192)) / 128; // Approx * 1.5 to 2.5 where Q has range 0-127 + + for (i = Q; i > 0; i--) + { + if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i]) + break; + } + + cpi->active_best_quality = i; + } + + // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames + // to prevent bits just going to waste. 
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause + if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) + cpi->active_best_quality = cpi->best_quality; + + else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level) + { + int Fraction = ((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128) / (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level); + int min_qadjustment = ((cpi->active_best_quality - cpi->best_quality) * Fraction) / 128; + + cpi->active_best_quality -= min_qadjustment; + } + + } + } + + // Clip the active best and worst quality values to limits + if (cpi->active_worst_quality > cpi->worst_quality) + cpi->active_worst_quality = cpi->worst_quality; + + if (cpi->active_best_quality < cpi->best_quality) + cpi->active_best_quality = cpi->best_quality; + else if (cpi->active_best_quality > cpi->active_worst_quality) + cpi->active_best_quality = cpi->active_worst_quality; + + // Determine initial Q to try + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + last_zbin_oq = cpi->zbin_over_quant; + + // Set highest allowed value for Zbin over quant + if (cm->frame_type == KEY_FRAME) + zbin_oq_high = 0; //ZBIN_OQ_MAX/16 + else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active)) + zbin_oq_high = 16; + else + zbin_oq_high = ZBIN_OQ_MAX; + + // Setup background Q adjustment for error resilliant mode + if (cpi->cyclic_refresh_mode_enabled) + cyclic_background_refresh(cpi, Q, 0); + + vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); + + // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8). + bottom_index = cpi->active_best_quality; + top_index = cpi->active_worst_quality; + + vp8_save_coding_context(cpi); + + loop_count = 0; + + q_low = cpi->best_quality; + q_high = cpi->worst_quality; + + + scale_and_extend_source(cpi->un_scaled_source, cpi); +#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC + + if (cpi->oxcf.noise_sensitivity > 0) + { + unsigned char *src; + int l = 0; + + switch (cpi->oxcf.noise_sensitivity) + { + case 1: + l = 20; + break; + case 2: + l = 40; + break; + case 3: + l = 60; + break; + case 4: + l = 80; + break; + case 5: + l = 100; + break; + case 6: + l = 150; + break; + } + + + if (cm->frame_type == KEY_FRAME) + { + vp8_de_noise(cpi->Source, cpi->Source, l , 1, 0, RTCD(postproc)); + cpi->ppi.frame = 0; + } + else + { + vp8_de_noise(cpi->Source, cpi->Source, l , 1, 0, RTCD(postproc)); + + src = cpi->Source->y_buffer; + + if (cpi->Source->y_stride < 0) + { + src += cpi->Source->y_stride * (cpi->Source->y_height - 1); + } + + //temp_filter(&cpi->ppi,src,src, + // cm->last_frame.y_width * cm->last_frame.y_height, + // cpi->oxcf.noise_sensitivity); + } + } + +#endif + +#ifdef OUTPUT_YUV_SRC + vp8_write_yuv_frame(cpi->Source); +#endif + + do + { + vp8_clear_system_state(); //__asm emms; + + /* + if(cpi->is_src_frame_alt_ref) + Q = 127; + */ + + set_quantizer(cpi, Q); + this_q = Q; + + // setup skip prob for costing in mode/mv decision + if (cpi->common.mb_no_coeff_skip) + { + cpi->prob_skip_false = cpi->base_skip_false_prob[Q]; + + if (cm->frame_type != KEY_FRAME) + { + if (cpi->common.refresh_alt_ref_frame) + { + if (cpi->last_skip_false_probs[2] != 0) + cpi->prob_skip_false = cpi->last_skip_false_probs[2]; + + /* + if(cpi->last_skip_false_probs[2]!=0 && abs(Q- cpi->last_skip_probs_q[2])<=16 ) + cpi->prob_skip_false = 
cpi->last_skip_false_probs[2]; + else if (cpi->last_skip_false_probs[2]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + cpi->prob_skip_false ) / 2; + */ + } + else if (cpi->common.refresh_golden_frame) + { + if (cpi->last_skip_false_probs[1] != 0) + cpi->prob_skip_false = cpi->last_skip_false_probs[1]; + + /* + if(cpi->last_skip_false_probs[1]!=0 && abs(Q- cpi->last_skip_probs_q[1])<=16 ) + cpi->prob_skip_false = cpi->last_skip_false_probs[1]; + else if (cpi->last_skip_false_probs[1]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + cpi->prob_skip_false ) / 2; + */ + } + else + { + if (cpi->last_skip_false_probs[0] != 0) + cpi->prob_skip_false = cpi->last_skip_false_probs[0]; + + /* + if(cpi->last_skip_false_probs[0]!=0 && abs(Q- cpi->last_skip_probs_q[0])<=16 ) + cpi->prob_skip_false = cpi->last_skip_false_probs[0]; + else if(cpi->last_skip_false_probs[0]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + cpi->prob_skip_false ) / 2; + */ + } + + //as this is for cost estimate, let's make sure it does not go extreme eitehr way + if (cpi->prob_skip_false < 5) + cpi->prob_skip_false = 5; + + if (cpi->prob_skip_false > 250) + cpi->prob_skip_false = 250; + + if (cpi->is_src_frame_alt_ref) + cpi->prob_skip_false = 1; + + + } + +#if 0 + + if (cpi->pass != 1) + { + FILE *f = fopen("skip.stt", "a"); + fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false); + fclose(f); + } + +#endif + + } + + if (cm->frame_type == KEY_FRAME) + vp8_setup_key_frame(cpi); + + // transform / motion compensation build reconstruction frame + + vp8_encode_frame(cpi); + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); + cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; + + vp8_clear_system_state(); //__asm emms; + + // Test to see if the stats generated for this frame indicate that we should have coded a key frame + // (assuming that we didn't)! + if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME) + { + if (decide_key_frame(cpi)) + { + vp8_calc_auto_iframe_target_size(cpi); + + // Reset all our sizing numbers and recode + cm->frame_type = KEY_FRAME; + + // Clear the Alt reference frame active flag when we have a key frame + cpi->source_alt_ref_active = FALSE; + + // If segmentation is enabled force a map update for key frames + if (cpi->mb.e_mbd.segmentation_enabled) + { + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; + } + + // If mode or reference frame based loop filter deltas are enabled then force an update for key frames. + if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled) + { + cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + } + + vp8_restore_coding_context(cpi); + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + q_low = cpi->best_quality; + q_high = cpi->worst_quality; + + vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); + + // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8). + bottom_index = cpi->active_best_quality; + top_index = cpi->active_worst_quality; + + + loop_count++; + Loop = TRUE; + + resize_key_frame(cpi); + continue; + } + } + + vp8_clear_system_state(); + + if (frame_over_shoot_limit == 0) + frame_over_shoot_limit = 1; + + // Are we are overshooting and up against the limit of active max Q. 
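/*
 * Illustrative sketch of the relaxation loop just below: when the first
 * encode overshoots while already at the current active max Q, each extra Q
 * step is assumed to shave about 4% off the frame size, so the active max Q
 * is walked up until the projected overshoot is nominally absorbed or the
 * absolute worst_quality cap is reached.  Hypothetical helper.
 */
static int relax_active_worst_q(int active_worst_quality, int worst_quality,
                                int projected_frame_size, int frame_over_shoot_limit)
{
    int over_size_percent =
        ((projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;

    while (active_worst_quality < worst_quality && over_size_percent > 0)
    {
        active_worst_quality++;
        over_size_percent = (int)(over_size_percent * 0.96);  /* ~4% per Q step */
    }

    return active_worst_quality;
}
/*
 * Example: a 10% projected overshoot raises the active max Q by 10 steps
 * before the truncated percentage reaches zero (fewer if worst_quality is
 * hit first).
 */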
+ if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) && + (Q == cpi->active_worst_quality) && + (cpi->active_worst_quality < cpi->worst_quality) && + (cpi->projected_frame_size > frame_over_shoot_limit)) + { + int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit; + + // If so is there any scope for relaxing it + while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0)) + { + cpi->active_worst_quality++; + top_index = cpi->active_worst_quality; + over_size_percent = (int)(over_size_percent * 0.96); // Assume 1 qstep = about 4% on frame size. + } + + // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop. + active_worst_qchanged = TRUE; + } + else + active_worst_qchanged = FALSE; + +#if !(CONFIG_REALTIME_ONLY) + + // Is the projected frame size out of range and are we allowed to attempt to recode. + if (((cpi->sf.recode_loop == 1) || + ((cpi->sf.recode_loop == 2) && (cm->refresh_golden_frame || (cm->frame_type == KEY_FRAME)))) && + (((cpi->projected_frame_size > frame_over_shoot_limit) && (Q < top_index)) || + //((cpi->projected_frame_size > frame_over_shoot_limit ) && (Q == top_index) && (cpi->zbin_over_quant < ZBIN_OQ_MAX)) || + ((cpi->projected_frame_size < frame_under_shoot_limit) && (Q > bottom_index))) + ) + { + int last_q = Q; + int Retries = 0; + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + if (cpi->projected_frame_size > frame_over_shoot_limit) + { + //if ( cpi->zbin_over_quant == 0 ) + q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value + + if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low + zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; + + //if ( undershoot_seen || (Q == MAXQ) ) + if (undershoot_seen) + { + // Update rate_correction_factor unless cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 1); + + Q = (q_high + q_low + 1) / 2; + + // Adjust cpi->zbin_over_quant (only allowed when Q is max) + if (Q < MAXQ) + cpi->zbin_over_quant = 0; + else + { + zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; + cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; + } + } + else + { + // Update rate_correction_factor unless cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 0); + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) + { + vp8_update_rate_correction_factors(cpi, 0); + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + Retries ++; + } + } + + overshoot_seen = TRUE; + } + else + { + if (cpi->zbin_over_quant == 0) + q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant + else // else lower zbin_oq_high + zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low; + + if (overshoot_seen) + { + // Update rate_correction_factor unless cpi->active_worst_quality has changed. 
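/*
 * Illustrative sketch of the Q bracketing used by this recode loop: q_low and
 * q_high bracket the quantizer, and once both an overshoot and an undershoot
 * have been observed the next Q is simply the midpoint of the bracket,
 * rounded up when correcting an overshoot and down when correcting an
 * undershoot.  Hypothetical helper; until both directions have been seen the
 * real code re-queries vp8_regulate_q() instead.
 */
static int bisect_recode_q(int q_low, int q_high, int correcting_overshoot)
{
    if (correcting_overshoot)
        return (q_high + q_low + 1) / 2;   /* bias towards the coarser Q */

    return (q_high + q_low) / 2;           /* bias towards the finer Q */
}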
+ if (!active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 1); + + Q = (q_high + q_low) / 2; + + // Adjust cpi->zbin_over_quant (only allowed when Q is max) + if (Q < MAXQ) + cpi->zbin_over_quant = 0; + else + cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; + } + else + { + // Update rate_correction_factor unless cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 0); + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) + { + vp8_update_rate_correction_factors(cpi, 0); + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + Retries ++; + } + } + + undershoot_seen = TRUE; + } + + // Clamp Q to upper and lower limits: + if (Q > q_high) + Q = q_high; + else if (Q < q_low) + Q = q_low; + + // Clamp cpi->zbin_over_quant + cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant; + + //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE; + Loop = ((Q != last_q)) ? TRUE : FALSE; + last_zbin_oq = cpi->zbin_over_quant; + } + else +#endif + Loop = FALSE; + + if (cpi->is_src_frame_alt_ref) + Loop = FALSE; + + if (Loop == TRUE) + { + vp8_restore_coding_context(cpi); + loop_count++; +#if CONFIG_PSNR + cpi->tot_recode_hits++; +#endif + } + } + while (Loop == TRUE); + +#if 0 + // Experimental code for lagged and one pass + // Update stats used for one pass GF selection + { + /* + int frames_so_far; + double frame_intra_error; + double frame_coded_error; + double frame_pcnt_inter; + double frame_pcnt_motion; + double frame_mvr; + double frame_mvr_abs; + double frame_mvc; + double frame_mvc_abs; + */ + + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0; + } +#endif + + // Update the GF useage maps. + // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter + vp8_update_gf_useage_maps(cm, &cpi->mb.e_mbd); + + if (cm->frame_type == KEY_FRAME) + cm->refresh_last_frame = 1; + + if (0) + { + FILE *f = fopen("gfactive.stt", "a"); + fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame); + fclose(f); + } + + // For inter frames the current default behaviour is that when cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer + // This is purely an encoder descision at present. 
+ if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame) + cm->copy_buffer_to_arf = 2; + else + cm->copy_buffer_to_arf = 0; + + if (cm->refresh_last_frame) + { + vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame); + cm->frame_to_show = &cm->last_frame; + } + else + cm->frame_to_show = &cm->new_frame; + + + + //#pragma omp parallel sections + { + + //#pragma omp section + { + + struct vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + + if (cpi->sf.auto_filter == 0) + vp8cx_pick_filter_level_fast(cpi->Source, cpi); + else + vp8cx_pick_filter_level(cpi->Source, cpi); + + vpx_usec_timer_mark(&timer); + + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + + if (cm->no_lpf) + cm->filter_level = 0; + + if (cm->filter_level > 0) + { + vp8cx_set_alt_lf_level(cpi, cm->filter_level); + vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + } + + vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); + + if (cpi->oxcf.error_resilient_mode == 1) + { + cm->refresh_entropy_probs = 0; + } + + } +//#pragma omp section + { + // build the bitstream + vp8_pack_bitstream(cpi, dest, size); + } + } + + + // At this point the new frame has been encoded coded. + // If any buffer copy / swaping is signalled it should be done here. + if (cm->frame_type == KEY_FRAME) + { + vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame); + vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame); + } + else // For non key frames + { + // Code to copy between reference buffers + if (cm->copy_buffer_to_arf) + { + if (cm->copy_buffer_to_arf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame); + else + vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame); + } + else if (cm->copy_buffer_to_arf == 2) + vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame); + } + + if (cm->copy_buffer_to_gf) + { + if (cm->copy_buffer_to_gf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame); + else + vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame); + } + else if (cm->copy_buffer_to_gf == 2) + vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame); + } + } + + // Update rate control heuristics + cpi->total_byte_count += (*size); + cpi->projected_frame_size = (*size) << 3; + + if (!active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 2); + + cpi->last_q[cm->frame_type] = cm->base_qindex; + + if (cm->frame_type == KEY_FRAME) + { + vp8_adjust_key_frame_context(cpi); + } + + // Keep a record of ambient average Q. + if (cm->frame_type == KEY_FRAME) + cpi->avg_frame_qindex = cm->base_qindex; + else + cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; + + // Keep a record from which we can calculate the average Q excluding GF updates and key frames + if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) + { + cpi->ni_frames++; + + // Calculate the average Q for normal inter frames (not key or GFU frames) + // This is used as a basis for setting active worst quality. 
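/*
 * Illustrative sketch of the "normal inter frame" average Q tracking done
 * just below: early in the clip the running mean is damped towards the
 * user's worst_quality setting, and the average is never allowed to fall
 * more than one step below the Q actually used on this frame.  Hypothetical
 * helper mirroring that arithmetic (ni_frames already includes this frame).
 */
static int update_ni_average_q(int ni_tot_qi, int ni_frames, int frame_q,
                               int worst_quality)
{
    int av;

    ni_tot_qi += frame_q;

    if (ni_frames > 150)
        av = ni_tot_qi / ni_frames;
    else
        av = ((ni_tot_qi / ni_frames) + worst_quality + 1) / 2;  /* early damping */

    if (frame_q > av)
        av = frame_q - 1;   /* stop the average, and hence the rate, drifting down */

    return av;
}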
+ if (cpi->ni_frames > 150) + { + cpi->ni_tot_qi += Q; + cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames); + } + // Early in the clip ... average the current frame Q value with the default + // entered by the user as a dampening measure + else + { + cpi->ni_tot_qi += Q; + cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2; + } + + // If the average Q is higher than what was used in the last frame + // (after going through the recode loop to keep the frame size within range) + // then use the last frame value - 1. + // The -1 is designed to stop Q and hence the data rate, from progressively + // falling away during difficult sections, but at the same time reduce the number of + // itterations around the recode loop. + if (Q > cpi->ni_av_qi) + cpi->ni_av_qi = Q - 1; + + } + +#if 0 + + // If the frame was massively oversize and we are below optimal buffer level drop next frame + if ((cpi->drop_frames_allowed) && + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && + (cpi->projected_frame_size > (4 * cpi->this_frame_target))) + { + cpi->drop_frame = TRUE; + } + +#endif + + // Set the count for maximum consequative dropped frames based upon the ratio of + // this frame size to the target average per frame bandwidth. + // (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent / 0. + if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0)) + { + cpi->max_drop_count = cpi->projected_frame_size / cpi->av_per_frame_bandwidth; + + if (cpi->max_drop_count > cpi->max_consec_dropped_frames) + cpi->max_drop_count = cpi->max_consec_dropped_frames; + } + + // Update the buffer level variable. + if (cpi->common.refresh_alt_ref_frame) + cpi->bits_off_target -= cpi->projected_frame_size; + else + cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size; + + // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass. 
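/*
 * Illustrative sketch of the rolling over/under-spend monitors updated just
 * below: two leaky integrators track target vs. actual bits, one with a short
 * memory (new sample weighted 1/4) and one with a long memory (new sample
 * weighted 1/32), both with rounding.  Hypothetical helper.
 */
static int rolling_average(int rolling_value, int new_sample, int weight_shift)
{
    /* weight_shift == 2 gives the 3/4 : 1/4 mix, weight_shift == 5 the 31/32 : 1/32 mix */
    int old_weight = (1 << weight_shift) - 1;
    int rounding = 1 << (weight_shift - 1);

    return (rolling_value * old_weight + new_sample + rounding) / (1 << weight_shift);
}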
+ cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; + cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; + cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; + cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32; + + // Actual bits spent + cpi->total_actual_bits += cpi->projected_frame_size; + + // Debug stats + cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size); + + cpi->buffer_level = cpi->bits_off_target; + + // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames + if (cm->frame_type == KEY_FRAME) + { + cpi->kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; + + if (cpi->kf_group_bits < 0) + cpi->kf_group_bits = 0 ; + } + else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) + { + cpi->gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; + + if (cpi->gf_group_bits < 0) + cpi->gf_group_bits = 0 ; + } + + if (cm->frame_type != KEY_FRAME) + { + if (cpi->common.refresh_alt_ref_frame) + { + cpi->last_skip_false_probs[2] = cpi->prob_skip_false; + cpi->last_skip_probs_q[2] = cm->base_qindex; + } + else if (cpi->common.refresh_golden_frame) + { + cpi->last_skip_false_probs[1] = cpi->prob_skip_false; + cpi->last_skip_probs_q[1] = cm->base_qindex; + } + else + { + cpi->last_skip_false_probs[0] = cpi->prob_skip_false; + cpi->last_skip_probs_q[0] = cm->base_qindex; + + //update the baseline + cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false; + + } + } + +#if CONFIG_PSNR + + if (0) + { + FILE *f = fopen("tmp.stt", "a"); + + vp8_clear_system_state(); //__asm emms; + + if (cpi->total_coded_error_left != 0.0) + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, (double)cpi->bits_left / cpi->total_coded_error_left, cpi->tot_recode_hits); + else + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, cpi->tot_recode_hits); + + fclose(f); + + { + FILE *fmodes = fopen("Modes.stt", "a"); + int i; + + fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, cm->frame_type, cm->refresh_golden_frame, cm->refresh_alt_ref_frame); + + for (i = 0; i < MAX_MODES; i++) 
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+ }
+
+#endif
+
+ // If this was a kf or Gf note the Q
+ if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ cm->last_kf_gf_q = cm->base_qindex;
+
+ if (cm->refresh_golden_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+ if (cm->refresh_alt_ref_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+ if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_last = 0;
+
+ if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+ cpi->alt_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+ cpi->alt_is_last = 0;
+
+ if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_alt = 1;
+ else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_alt = 0;
+
+ cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+ if (cpi->gold_is_last)
+ cpi->ref_frame_flags &= ~VP8_GOLD_FLAG;
+
+ if (cpi->alt_is_last)
+ cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+
+ if (cpi->gold_is_alt)
+ cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+
+
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ // Is this an alternate reference update
+ if (cpi->common.refresh_alt_ref_frame)
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+ if (cpi->common.refresh_golden_frame)
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+ }
+ else
+ {
+ if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame)
+ // Update the alternate reference frame and stats as appropriate.
+ update_alt_ref_frame_and_stats(cpi);
+ else
+ // Update the golden frame and stats as appropriate.
+ update_golden_frame_and_stats(cpi);
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+ // As this frame is a key frame the next defaults to an inter frame.
+ cm->frame_type = INTER_FRAME;
+
+ cpi->last_frame_percent_intra = 100;
+ }
+ else
+ {
+ *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+ cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+ // Don't increment frame counters if this was an altref buffer update, not a real frame
+ if (cm->show_frame)
+ {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ }
+
+ // reset to normal state now that we are done.
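The reference-flag bookkeeping above drops redundant motion searches: when two reference buffers were refreshed from the same coded frame they hold identical pixels, so only one of them needs to be searched. A standalone sketch of that bit manipulation, with assumed flag values and an invented helper name (in the encoder, gold_is_last, alt_is_last and gold_is_alt persist across frames rather than being recomputed from the refresh flags alone):

#include <stdio.h>

/* Assumed values; the real definitions live in onyx.h. */
#define VP8_LAST_FLAG 1
#define VP8_GOLD_FLAG 2
#define VP8_ALT_FLAG  4

static int usable_ref_flags(int refresh_last, int refresh_golden, int refresh_alt)
{
    int flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;

    /* Two buffers refreshed from the same frame hold the same pixels,
     * so searching both is redundant. */
    if (refresh_last && refresh_golden)      /* gold_is_last */
        flags &= ~VP8_GOLD_FLAG;

    if (refresh_last && refresh_alt)         /* alt_is_last */
        flags &= ~VP8_ALT_FLAG;

    if (refresh_alt && refresh_golden)       /* gold_is_alt */
        flags &= ~VP8_ALT_FLAG;

    return flags;
}

int main(void)
{
    /* Golden and last refreshed together: only LAST and ALT remain. */
    printf("%d\n", usable_ref_flags(1, 1, 0));   /* prints 5 (LAST | ALT) */
    return 0;
}

Note that clearing a single bit needs the bitwise complement: !VP8_GOLD_FLAG evaluates to 0 for any non-zero flag and would wipe every bit at once, which is why the masks use ~.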
+ + + + if (0) + { + char filename[512]; + FILE *recon_file; + sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); + recon_file = fopen(filename, "wb"); + fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file); + fclose(recon_file); + } + + // DEBUG + //vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); + + +} + +int vp8_is_gf_update_needed(VP8_PTR ptr) +{ + VP8_COMP *cpi = (VP8_COMP *) ptr; + int ret_val; + + ret_val = cpi->gf_update_recommended; + cpi->gf_update_recommended = 0; + + return ret_val; +} + +void vp8_check_gf_quality(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + int gf_active_pct = (100 * cm->gf_active_count) / (cm->mb_rows * cm->mb_cols); + int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols); + int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols); + + // Gf refresh is not currently being signalled + if (cpi->gf_update_recommended == 0) + { + if (cpi->common.frames_since_golden > 7) + { + // Low use of gf + if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15)) + { + // ...but last frame zero zero usage is reasonbable so a new gf might be appropriate + if (last_ref_zz_useage >= 25) + { + cpi->gf_bad_count ++; + + if (cpi->gf_bad_count >= 8) // Check that the condition is stable + { + cpi->gf_update_recommended = 1; + cpi->gf_bad_count = 0; + } + } + else + cpi->gf_bad_count = 0; // Restart count as the background is not stable enough + } + else + cpi->gf_bad_count = 0; // Gf useage has picked up so reset count + } + } + // If the signal is set but has not been read should we cancel it. + else if (last_ref_zz_useage < 15) + { + cpi->gf_update_recommended = 0; + cpi->gf_bad_count = 0; + } + +#if 0 + + if (0) + { + FILE *f = fopen("gfneeded.stt", "a"); + fprintf(f, "%10d %10d %10d %10d %10ld \n", + cm->current_video_frame, + cpi->common.frames_since_golden, + gf_active_pct, gf_ref_usage_pct, + cpi->gf_update_recommended); + fclose(f); + } + +#endif +} + +#if !(CONFIG_REALTIME_ONLY) +static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) +{ + double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + + if (!cpi->common.refresh_alt_ref_frame) + vp8_second_pass(cpi); + + encode_frame_to_data_rate(cpi, size, dest, frame_flags); + cpi->bits_left -= 8 * *size; + + if (!cpi->common.refresh_alt_ref_frame) + cpi->bits_left += (long long)(two_pass_min_rate / cpi->oxcf.frame_rate); +} +#endif + +//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us. +#if HAVE_ARMV7 +extern void vp8_push_neon(INT64 *store); +extern void vp8_pop_neon(INT64 *store); +static INT64 store_reg[8]; +#endif +int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time) +{ + VP8_COMP *cpi = (VP8_COMP *) ptr; + VP8_COMMON *cm = &cpi->common; + struct vpx_usec_timer timer; + + if (!cpi) + return -1; + +#if HAVE_ARMV7 + vp8_push_neon(store_reg); +#endif + + vpx_usec_timer_start(&timer); + + // no more room for frames; + if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames) + { +#if HAVE_ARMV7 + vp8_pop_neon(store_reg); +#endif + return -1; + } + + //printf("in-cpi->source_buffer_count: %d\n", cpi->source_buffer_count); + + cm->clr_type = sd->clrtype; + + // make a copy of the frame for use later... 
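vp8_check_gf_quality() above derives the real-time golden-frame refresh hint from three per-frame percentages. A simplified restatement as a pure function, assuming the caller supplies the percentages directly; the struct and function names are invented for this sketch, while the thresholds mirror the ones in the code above:

#include <stdio.h>

/* Hypothetical container for the two pieces of state the heuristic keeps. */
struct gf_state
{
    int bad_count;     /* consecutive frames with poor GF usage              */
    int recommended;   /* refresh hint, as read by vp8_is_gf_update_needed() */
};

static void check_gf_quality(struct gf_state *s, int frames_since_golden,
                             int gf_active_pct,    /* % of MBs still tracked by the GF   */
                             int gf_ref_usage_pct, /* % of MBs coded against the GF      */
                             int last_zz_pct)      /* % of zero-motion MBs vs last frame */
{
    if (s->recommended)
    {
        /* Cancel an unread recommendation if the background stops being static. */
        if (last_zz_pct < 15)
        {
            s->recommended = 0;
            s->bad_count = 0;
        }
    }
    else if (frames_since_golden > 7)
    {
        /* Low GF usage over a largely static background suggests a stale GF. */
        if ((gf_active_pct < 10 || gf_active_pct + gf_ref_usage_pct < 15)
            && last_zz_pct >= 25)
        {
            if (++s->bad_count >= 8)   /* require the condition to be stable */
            {
                s->recommended = 1;
                s->bad_count = 0;
            }
        }
        else
            s->bad_count = 0;
    }
}

int main(void)
{
    struct gf_state s = { 0, 0 };
    int frame;

    /* Eight consecutive frames of low GF usage over a static background. */
    for (frame = 0; frame < 8; frame++)
        check_gf_quality(&s, 8 + frame, 5, 5, 40);

    printf("gf update recommended: %d\n", s.recommended);   /* prints 1 */
    return 0;
}

The eight-frame stability count keeps a single noisy frame from triggering a refresh request.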
+#if !(CONFIG_REALTIME_ONLY) + + if (cpi->oxcf.allow_lag) + { + int which_buffer = cpi->source_encode_index - 1; + SOURCE_SAMPLE *s; + + if (which_buffer == -1) + which_buffer = cpi->oxcf.lag_in_frames - 1; + + if (cpi->source_buffer_count < cpi->oxcf.lag_in_frames - 1) + which_buffer = cpi->source_buffer_count; + + s = &cpi->src_buffer[which_buffer]; + + s->source_time_stamp = time_stamp; + s->source_end_time_stamp = end_time; + s->source_frame_flags = frame_flags; + vp8_yv12_copy_frame_ptr(sd, &s->source_buffer); + + cpi->source_buffer_count ++; + } + else +#endif + { + SOURCE_SAMPLE *s; + s = &cpi->src_buffer[0]; + s->source_end_time_stamp = end_time; + s->source_time_stamp = time_stamp; + s->source_frame_flags = frame_flags; +#if HAVE_ARMV7 + vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer); +#else + vp8_yv12_copy_frame_ptr(sd, &s->source_buffer); +#endif + cpi->source_buffer_count = 1; + } + + vpx_usec_timer_mark(&timer); + cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); + +#if HAVE_ARMV7 + vp8_pop_neon(store_reg); +#endif + + return 0; +} +int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush) +{ + + VP8_COMP *cpi = (VP8_COMP *) ptr; + VP8_COMMON *cm = &cpi->common; + struct vpx_usec_timer tsctimer; + struct vpx_usec_timer ticktimer; + struct vpx_usec_timer cmptimer; + + if (!cpi) + return -1; + +#if HAVE_ARMV7 + vp8_push_neon(store_reg); +#endif + + vpx_usec_timer_start(&cmptimer); + + + // flush variable tells us that even though we have less than 10 frames + // in our buffer we need to start producing compressed frames. + // Probably because we are at the end of a file.... + if ((cpi->source_buffer_count == cpi->oxcf.lag_in_frames && cpi->oxcf.lag_in_frames > 0) + || (!cpi->oxcf.allow_lag && cpi->source_buffer_count > 0) + || (flush && cpi->source_buffer_count > 0)) + { + + SOURCE_SAMPLE *s; + + s = &cpi->src_buffer[cpi->source_encode_index]; + cpi->source_time_stamp = s->source_time_stamp; + cpi->source_end_time_stamp = s->source_end_time_stamp; + +#if !(CONFIG_REALTIME_ONLY) + + // Should we code an alternate reference frame + if (cpi->oxcf.error_resilient_mode == 0 && + cpi->oxcf.play_alternate && + cpi->source_alt_ref_pending && + (cpi->frames_till_gf_update_due < cpi->source_buffer_count) && + cpi->oxcf.lag_in_frames != 0) + { + cpi->last_alt_ref_sei = (cpi->source_encode_index + cpi->frames_till_gf_update_due) % cpi->oxcf.lag_in_frames; + +#if VP8_TEMPORAL_ALT_REF + + if (cpi->oxcf.arnr_max_frames > 0) + { +#if 0 + // my attempt at a loop that tests the results of strength filter. 
+ int start_frame = cpi->last_alt_ref_sei - 3; + + int i, besti = -1, pastin = cpi->oxcf.arnr_strength; + + int besterr; + + if (start_frame < 0) + start_frame += cpi->oxcf.lag_in_frames; + + besterr = vp8_calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer, + &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance)); + + for (i = 0; i < 7; i++) + { + int thiserr; + cpi->oxcf.arnr_strength = i; + vp8cx_temp_filter_c(cpi); + + thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer, + &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance)); + + if (10 * thiserr < besterr * 8) + { + besterr = thiserr; + besti = i; + } + } + + if (besti != -1) + { + cpi->oxcf.arnr_strength = besti; + vp8cx_temp_filter_c(cpi); + s = &cpi->alt_ref_buffer; + + // FWG not sure if I need to copy this data for the Alt Ref frame + s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp; + s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp; + s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags; + } + else + s = &cpi->src_buffer[cpi->last_alt_ref_sei]; + +#else + vp8cx_temp_filter_c(cpi); + s = &cpi->alt_ref_buffer; + + // FWG not sure if I need to copy this data for the Alt Ref frame + s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp; + s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp; + s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags; + +#endif + } + else +#endif + s = &cpi->src_buffer[cpi->last_alt_ref_sei]; + + cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; + cm->refresh_alt_ref_frame = 1; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 0; + cm->show_frame = 0; + cpi->source_alt_ref_pending = FALSE; // Clear Pending altf Ref flag. 
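The alt-ref scheduling above treats cpi->src_buffer as a ring: the source frame the alt-ref corresponds to sits frames_till_gf_update_due slots ahead of the current encode position, modulo lag_in_frames. A minimal sketch of that circular-buffer arithmetic, with made-up values:

#include <stdio.h>

int main(void)
{
    /* Illustrative values only; in the encoder these come from cpi and oxcf. */
    const int lag_in_frames = 25;          /* size of the source ring buffer   */
    int source_encode_index = 20;          /* slot the encoder will read next  */
    int frames_till_gf_update_due = 7;     /* distance to the next GF/ARF slot */

    /* Slot of the source frame the alt-ref is built around. */
    int last_alt_ref_sei =
        (source_encode_index + frames_till_gf_update_due) % lag_in_frames;

    /* Once the buffer has filled, an incoming frame is stored one slot
     * behind the encode position, wrapping at the start of the buffer. */
    int store_slot = source_encode_index - 1;

    if (store_slot == -1)
        store_slot = lag_in_frames - 1;

    printf("alt-ref slot %d, store slot %d\n",
           last_alt_ref_sei, store_slot);  /* alt-ref slot 2, store slot 19 */
    return 0;
}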
+ cpi->is_src_frame_alt_ref = 0; + } + else +#endif + { + cm->show_frame = 1; +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->oxcf.allow_lag) + { + if (cpi->source_encode_index == cpi->last_alt_ref_sei) + { +#if VP8_TEMPORAL_ALT_REF + + if (cpi->oxcf.arnr_max_frames == 0) + { + cpi->is_src_frame_alt_ref = 1; // copy alt ref + } + else + { + cpi->is_src_frame_alt_ref = 0; + } + +#else + cpi->is_src_frame_alt_ref = 1; +#endif + cpi->last_alt_ref_sei = -1; + } + else + cpi->is_src_frame_alt_ref = 0; + + cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames; + } + +#endif + cpi->source_buffer_count--; + } + + cpi->un_scaled_source = &s->source_buffer; + cpi->Source = &s->source_buffer; + cpi->source_frame_flags = s->source_frame_flags; + + *time_stamp = cpi->source_time_stamp; + *time_end = cpi->source_end_time_stamp; + } + else + { + *size = 0; +#if !(CONFIG_REALTIME_ONLY) + + if (flush && cpi->pass == 1 && !cpi->first_pass_done) + { + vp8_end_first_pass(cpi); /* get last stats packet */ + cpi->first_pass_done = 1; + } + +#endif + +#if HAVE_ARMV7 + vp8_pop_neon(store_reg); +#endif + return -1; + } + + *frame_flags = cpi->source_frame_flags; + +#if CONFIG_PSNR + + if (cpi->source_time_stamp < cpi->first_time_stamp_ever) + cpi->first_time_stamp_ever = cpi->source_time_stamp; + +#endif + + // adjust frame rates based on timestamps given + if (!cm->refresh_alt_ref_frame) + { + if (cpi->last_time_stamp_seen == 0) + { + double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp); + + vp8_new_frame_rate(cpi, this_fps); + } + else + { + long long nanosecs = cpi->source_time_stamp - cpi->last_time_stamp_seen; + double this_fps = 10000000.000 / nanosecs; + + vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8); + + } + + cpi->last_time_stamp_seen = cpi->source_time_stamp; + } + + if (cpi->compressor_speed == 2) + { + vp8_check_gf_quality(cpi); + } + + if (!cpi) + { +#if HAVE_ARMV7 + vp8_pop_neon(store_reg); +#endif + return 0; + } + + if (cpi->compressor_speed == 2) + { + vpx_usec_timer_start(&tsctimer); + vpx_usec_timer_start(&ticktimer); + } + + // start with a 0 size frame + *size = 0; + + // Clear down mmx registers + vp8_clear_system_state(); //__asm emms; + + cm->frame_type = INTER_FRAME; + cm->frame_flags = *frame_flags; + +#if 0 + + if (cm->refresh_alt_ref_frame) + { + //cm->refresh_golden_frame = 1; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 0; + } + else + { + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + } + +#endif + +#if !(CONFIG_REALTIME_ONLY) + + if (cpi->pass == 1) + { + Pass1Encode(cpi, size, dest, frame_flags); + } + else if (cpi->pass == 2) + { + Pass2Encode(cpi, size, dest, frame_flags); + } + else +#endif + encode_frame_to_data_rate(cpi, size, dest, frame_flags); + + if (cpi->compressor_speed == 2) + { + unsigned int duration, duration2; + vpx_usec_timer_mark(&tsctimer); + vpx_usec_timer_mark(&ticktimer); + + duration = vpx_usec_timer_elapsed(&ticktimer); + duration2 = (unsigned int)((double)duration / 2); + + if (cm->frame_type != KEY_FRAME) + { + if (cpi->avg_encode_time == 0) + cpi->avg_encode_time = duration; + else + cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3; + } + + if (duration2) + { + //if(*frame_flags!=1) + { + + if (cpi->avg_pick_mode_time == 0) + cpi->avg_pick_mode_time = duration2; + else + cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3; + } + } + + } + + if (cm->refresh_entropy_probs == 0) + { + vpx_memcpy(&cm->fc, &cm->lfc, 
sizeof(cm->fc)); + } + + // if its a dropped frame honor the requests on subsequent frames + if (*size > 0) + { + + // return to normal state + cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; + + cm->refresh_entropy_probs = 1; + cm->refresh_alt_ref_frame = 0; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + cm->frame_type = INTER_FRAME; + + } + + cpi->ready_for_new_frame = 1; + + vpx_usec_timer_mark(&cmptimer); + cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); + + if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + generate_psnr_packet(cpi); + +#if CONFIG_PSNR + + if (cpi->pass != 1) + { + cpi->bytes += *size; + + if (cm->show_frame) + { + + cpi->count ++; + + if (cpi->b_calculate_psnr) + { + double y, u, v; + double sq_error; + double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error); + + cpi->total_y += y; + cpi->total_u += u; + cpi->total_v += v; + cpi->total_sq_error += sq_error; + cpi->total += frame_psnr; + { + double y2, u2, v2, frame_psnr2, frame_ssim2 = 0; + double weight = 0; + + vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); + vp8_clear_system_state(); + frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); + frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight); + + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + + cpi->totalp_y += y2; + cpi->totalp_u += u2; + cpi->totalp_v += v2; + cpi->totalp += frame_psnr2; + cpi->total_sq_error2 += sq_error; + + } + } + + if (cpi->b_calculate_ssimg) + { + double y, u, v, frame_all; + frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v); + cpi->total_ssimg_y += y; + cpi->total_ssimg_u += u; + cpi->total_ssimg_v += v; + cpi->total_ssimg_all += frame_all; + } + + } + } + +#if 0 + + if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q) + { + skiptruecount += cpi->skip_true_count; + skipfalsecount += cpi->skip_false_count; + } + +#endif +#if 0 + + if (cpi->pass != 1) + { + FILE *f = fopen("skip.stt", "a"); + fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size); + + if (cpi->is_src_frame_alt_ref == 1) + fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size); + + fclose(f); + } + +#endif +#endif + +#if HAVE_ARMV7 + vp8_pop_neon(store_reg); +#endif + + return 0; +} + +int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags) +{ + VP8_COMP *cpi = (VP8_COMP *) comp; + + if (cpi->common.refresh_alt_ref_frame) + return -1; + else + { + int ret; +#if CONFIG_POSTPROC + ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags); +#else + + if (cpi->common.frame_to_show) + { + *dest = *cpi->common.frame_to_show; + dest->y_width = cpi->common.Width; + dest->y_height = cpi->common.Height; + dest->uv_height = cpi->common.Height / 2; + ret = 0; + } + else + { + ret = -1; + } + +#endif //!CONFIG_POSTPROC + vp8_clear_system_state(); + return ret; + } +} + +int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]) +{ + VP8_COMP *cpi = (VP8_COMP *) comp; + signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + + if (cpi->common.mb_rows != rows || cpi->common.mb_cols 
!= cols) + return -1; + + if (!map) + { + disable_segmentation((VP8_PTR)cpi); + return 0; + } + + // Set the segmentation Map + set_segmentation_map((VP8_PTR)cpi, map); + + // Activate segmentation. + enable_segmentation((VP8_PTR)cpi); + + // Set up the quant segment data + feature_data[MB_LVL_ALT_Q][0] = delta_q[0]; + feature_data[MB_LVL_ALT_Q][1] = delta_q[1]; + feature_data[MB_LVL_ALT_Q][2] = delta_q[2]; + feature_data[MB_LVL_ALT_Q][3] = delta_q[3]; + + // Set up the loop segment data s + feature_data[MB_LVL_ALT_LF][0] = delta_lf[0]; + feature_data[MB_LVL_ALT_LF][1] = delta_lf[1]; + feature_data[MB_LVL_ALT_LF][2] = delta_lf[2]; + feature_data[MB_LVL_ALT_LF][3] = delta_lf[3]; + + cpi->segment_encode_breakout[0] = threshold[0]; + cpi->segment_encode_breakout[1] = threshold[1]; + cpi->segment_encode_breakout[2] = threshold[2]; + cpi->segment_encode_breakout[3] = threshold[3]; + + // Initialise the feature data structure + // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 + set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); + + return 0; +} + +int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols) +{ + VP8_COMP *cpi = (VP8_COMP *) comp; + + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) + { + if (map) + { + vpx_memcpy(cpi->active_map, map, rows * cols); + cpi->active_map_enabled = 1; + } + else + cpi->active_map_enabled = 0; + + return 0; + } + else + { + //cpi->active_map_enabled = 0; + return -1 ; + } +} + +int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) +{ + VP8_COMP *cpi = (VP8_COMP *) comp; + + if (horiz_mode >= NORMAL && horiz_mode <= ONETWO) + cpi->common.horiz_scale = horiz_mode; + else + return -1; + + if (vert_mode >= NORMAL && vert_mode <= ONETWO) + cpi->common.vert_scale = vert_mode; + else + return -1; + + return 0; +} + + + +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd) +{ + int i, j; + int Total = 0; + + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + (void)rtcd; + + // Loop through the Y plane raw and reconstruction data summing (square differences) + for (i = 0; i < source->y_height; i += 16) + { + for (j = 0; j < source->y_width; j += 16) + { + unsigned int sse; + Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + } + + src += 16 * source->y_stride; + dst += 16 * dest->y_stride; + } + + return Total; +} +int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd) +{ + int i, j; + int Total = 0; + + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + (void)rtcd; + + // Loop through the Y plane raw and reconstruction data summing (square differences) + for (i = 0; i < source->y_height; i += 16) + { + for (j = 0; j < source->y_width; j += 16) + { + unsigned int sse, sse2, sum2; + VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + + if (sse < 8096) + Total += sse; + } + + src += 16 * source->y_stride; + dst += 16 * dest->y_stride; + } + + return Total; +} + +int vp8_get_speed(VP8_PTR c) +{ + VP8_COMP *cpi = (VP8_COMP *) c; + return cpi->Speed; +} +int vp8_get_quantizer(VP8_PTR c) +{ + VP8_COMP *cpi = (VP8_COMP *) c; + return cpi->common.base_qindex; +} diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h new file mode 100644 index 000000000..29b120ed4 --- /dev/null +++ 
b/vp8/encoder/onyx_int.h @@ -0,0 +1,670 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_VP8_INT_H +#define __INC_VP8_INT_H + +#include <stdio.h> +#include "vpx_ports/config.h" +#include "onyx.h" +#include "treewriter.h" +#include "tokenize.h" +#include "onyxc_int.h" +#include "preproc.h" +#include "variance.h" +#include "dct.h" +#include "encodemb.h" +#include "quantize.h" +#include "entropy.h" +#include "threading.h" +#include "vpx_ports/mem.h" +#include "vpx_codec/internal/vpx_codec_internal.h" +#include "mcomp.h" + +#define INTRARDOPT +//#define SPEEDSTATS 1 +#define MIN_GF_INTERVAL 4 +#define DEFAULT_GF_INTERVAL 7 + +#define KEY_FRAME_CONTEXT 5 + +#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25) + +#define AF_THRESH 25 +#define AF_THRESH2 100 +#define ARF_DECAY_THRESH 12 +#define MAX_MODES 20 + +#define MIN_THRESHMULT 32 +#define MAX_THRESHMULT 512 + +#define GF_ZEROMV_ZBIN_BOOST 24 +#define ZBIN_OQ_MAX 192 + +#define VP8_TEMPORAL_ALT_REF 1 + +typedef struct +{ + int kf_indicated; + unsigned int frames_since_key; + unsigned int frames_since_golden; + int filter_level; + int frames_till_gf_update_due; + int recent_ref_frame_usage[MAX_REF_FRAMES]; + + MV_CONTEXT mvc[2]; + int mvcosts[2][MVvals+1]; + +#ifdef MODE_STATS + // Stats + int y_modes[5]; + int uv_modes[4]; + int b_modes[10]; + int inter_y_modes[10]; + int inter_uv_modes[4]; + int inter_b_modes[10]; +#endif + + vp8_prob ymode_prob[4], uv_mode_prob[3]; /* interframe intra mode probs */ + vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3]; /* keyframe "" */ + + int ymode_count[5], uv_mode_count[4]; /* intra MB type cts this frame */ + + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + + int this_frame_percent_intra; + int last_frame_percent_intra; + + +} CODING_CONTEXT; + +typedef struct +{ + double frame; + double intra_error; + double coded_error; + double ssim_weighted_pred_err; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; +} +FIRSTPASS_STATS; + +typedef struct +{ + int frames_so_far; + double frame_intra_error; + double frame_coded_error; + double frame_pcnt_inter; + double frame_pcnt_motion; + double frame_mvr; + double frame_mvr_abs; + double frame_mvc; + double frame_mvc_abs; + +} ONEPASS_FRAMESTATS; + + +typedef enum +{ + THR_ZEROMV = 0, + THR_DC = 1, + + THR_NEARESTMV = 2, + THR_NEARMV = 3, + + THR_ZEROG = 4, + THR_NEARESTG = 5, + + THR_ZEROA = 6, + THR_NEARESTA = 7, + + THR_NEARG = 8, + THR_NEARA = 9, + + THR_V_PRED = 10, + THR_H_PRED = 11, + THR_TM = 12, + + THR_NEWMV = 13, + THR_NEWG = 14, + THR_NEWA = 15, + + THR_SPLITMV = 16, + THR_SPLITG = 17, + THR_SPLITA = 18, + + THR_B_PRED = 19, +} +THR_MODES; + +typedef enum +{ + DIAMOND = 0, + NSTEP = 1, + HEX = 2 +} SEARCH_METHODS; + +typedef struct +{ + int RD; + SEARCH_METHODS search_method; + int improved_quant; + int improved_dct; + int auto_filter; + int recode_loop; + int iterative_sub_pixel; + int half_pixel_search; + int quarter_pixel_search; + int thresh_mult[MAX_MODES]; + int full_freq[2]; + int min_fs_radius; + int max_fs_radius; + int max_step_search_steps; + int first_step; + 
int optimize_coefficients; + +} SPEED_FEATURES; + +typedef struct +{ + MACROBLOCK mb; + int mb_row; + TOKENEXTRA *tp; + int segment_counts[MAX_MB_SEGMENTS]; + int totalrate; + int current_mb_col; +} MB_ROW_COMP; + +typedef struct +{ + TOKENEXTRA *start; + TOKENEXTRA *stop; +} TOKENLIST; + +typedef struct +{ + int ithread; + void *ptr1; + void *ptr2; +} ENCODETHREAD_DATA; +typedef struct +{ + int ithread; + void *ptr1; +} LPFTHREAD_DATA; + +typedef struct +{ + INT64 source_time_stamp; + INT64 source_end_time_stamp; + + DECLARE_ALIGNED(16, YV12_BUFFER_CONFIG, source_buffer); + unsigned int source_frame_flags; +} SOURCE_SAMPLE; + +typedef struct VP8_ENCODER_RTCD +{ + VP8_COMMON_RTCD *common; + vp8_variance_rtcd_vtable_t variance; + vp8_fdct_rtcd_vtable_t fdct; + vp8_encodemb_rtcd_vtable_t encodemb; + vp8_quantize_rtcd_vtable_t quantize; + vp8_search_rtcd_vtable_t search; +} VP8_ENCODER_RTCD; + +typedef struct +{ + + DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][4][4]); + + DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][4][4]); + + DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][4][4]); + DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][4][4]); + + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); + + + MACROBLOCK mb; + VP8_COMMON common; + vp8_writer bc, bc2; + // bool_writer *bc2; + + VP8_CONFIG oxcf; + + YV12_BUFFER_CONFIG *Source; + YV12_BUFFER_CONFIG *un_scaled_source; + INT64 source_time_stamp; + INT64 source_end_time_stamp; + unsigned int source_frame_flags; + YV12_BUFFER_CONFIG scaled_source; + + int source_buffer_count; + int source_encode_index; + int source_alt_ref_pending; + int source_alt_ref_active; + + int last_alt_ref_sei; + int is_src_frame_alt_ref; + + int gold_is_last; // golden frame same as last frame ( short circuit gold searches) + int alt_is_last; // Alt reference frame same as last ( short circuit altref search) + int gold_is_alt; // don't do both alt and gold search ( just do gold). 
+ + //int refresh_alt_ref_frame; + SOURCE_SAMPLE src_buffer[MAX_LAG_BUFFERS]; + + YV12_BUFFER_CONFIG last_frame_uf; + + char *Dest; + + TOKENEXTRA *tok; + unsigned int tok_count; + + + unsigned int frames_since_key; + unsigned int key_frame_frequency; + unsigned int next_key; + + unsigned int mode_check_freq[MAX_MODES]; + unsigned int mode_test_hit_counts[MAX_MODES]; + unsigned int mode_chosen_counts[MAX_MODES]; + unsigned int mbs_tested_so_far; + + unsigned int check_freq[2]; + unsigned int do_full[2]; + + int rd_thresh_mult[MAX_MODES]; + int rd_baseline_thresh[MAX_MODES]; + int rd_threshes[MAX_MODES]; + int mvcostbase; + int mvcostmultiplier; + int subseqblockweight; + int errthresh; + +#ifdef INTRARDOPT + int RDMULT; + int RDDIV ; + + TOKENEXTRA *rdtok; + int intra_rd_opt; + vp8_writer rdbc; + int intra_mode_costs[10]; +#endif + + + CODING_CONTEXT coding_context; + + // Rate targetting variables + long long prediction_error; + long long last_prediction_error; + long long intra_error; + long long last_intra_error; + long long last_auto_filter_prediction_error; + +#if 0 + // Experimental RD code + long long frame_distortion; + long long last_frame_distortion; +#endif + + int last_mb_distortion; + + int frames_since_auto_filter; + + int this_frame_target; + int projected_frame_size; + int last_q[2]; // Separate values for Intra/Inter + int target_bits_per_mb; + + double rate_correction_factor; + double key_frame_rate_correction_factor; + double gf_rate_correction_factor; + double est_max_qcorrection_factor; + + int frames_till_gf_update_due; // Count down till next GF + int current_gf_interval; // GF interval chosen when we coded the last GF + + int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative) + + int gf_group_bits; // Projected Bits available for a group of frames including 1 GF or ARF + int gf_bits; // Bits for the golden frame or ARF - 2 pass only + int mid_gf_extra_bits; // A few extra bits for the frame half way between two gfs. + + int kf_group_bits; // Projected total bits available for a key frame group of frames + int kf_group_error_left; // Error score of frames still to be coded in kf group + int kf_bits; // Bits for the key frame in a key frame group - 2 pass only + + int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF + int initial_gf_use; // percentage use of gf 2 frames after gf + + int gf_group_error_left; // Remaining error from uncoded frames in a gf group. Two pass use only + + int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames + int kf_bitrate_adjustment; // Current number of bit s to try and recover on each inter frame. 
+ int max_gf_interval; + int baseline_gf_interval; + int gf_decay_rate; + + INT64 key_frame_count; + INT64 tot_key_frame_bits; + int prior_key_frame_size[KEY_FRAME_CONTEXT]; + int prior_key_frame_distance[KEY_FRAME_CONTEXT]; + int per_frame_bandwidth; // Current section per frame bandwidth target + int av_per_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation that should be used for any frame + int last_key_frame_size; + int intra_frame_target; + int inter_frame_target; + double output_frame_rate; + long long last_time_stamp_seen; + long long first_time_stamp_ever; + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex; + + int zbin_over_quant; + int zbin_mode_boost; + int zbin_mode_boost_enabled; + + INT64 total_byte_count; + + int buffered_mode; + + int buffer_level; + int bits_off_target; + + int rolling_target_bits; + int rolling_actual_bits; + + int long_rolling_target_bits; + int long_rolling_actual_bits; + + long long total_actual_bits; + int total_target_vs_actual; // debug stats + + int worst_quality; + int active_worst_quality; + int best_quality; + int active_best_quality; + + int drop_frames_allowed; // Are we permitted to drop frames? + int drop_frame; // Drop this frame? + int drop_count; // How many frames have we dropped? + int max_drop_count; // How many frames should we drop? + int max_consec_dropped_frames; // Limit number of consecutive frames that can be dropped. + + + int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */ + int uv_mode_count[VP8_UV_MODES]; /* intra MB type cts this frame */ + + unsigned int MVcount [2] [MVvals]; /* (row,col) MV cts this frame */ + + unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]; /* for this frame */ + //DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]); //not used any more + //save vp8_tree_probs_from_distribution result for each frame to avoid repeat calculation + vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1]; + unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1][2]; + + /* Second compressed data partition contains coefficient data. 
*/ + + unsigned char *output_partition2; + size_t output_partition2size; + + pre_proc_instance ppi; + + int frames_to_key; + int gfu_boost; + int kf_boost; + int last_boost; + double total_error_left; + double total_intra_error_left; + double total_coded_error_left; + double start_tot_err_left; + double min_error; + + double modified_total_error_left; + double avg_iiratio; + + int target_bandwidth; + long long bits_left; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + FIRSTPASS_STATS *stats_in, *stats_in_end; + struct vpx_codec_pkt_list *output_pkt_list; + int first_pass_done; + unsigned char *fp_motion_map; + FILE *fp_motion_mapfile; + int fpmm_pos; + +#if 0 + // Experimental code for lagged and one pass + ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS]; + int one_pass_frame_index; +#endif + + int decimation_factor; + int decimation_count; + + // for real time encoding + int avg_encode_time; //microsecond + int avg_pick_mode_time; //microsecond + int Speed; + unsigned int cpu_freq; //Mhz + int compressor_speed; + + int interquantizer; + int auto_gold; + int auto_adjust_gold_quantizer; + int goldquantizer; + int goldfreq; + int auto_adjust_key_quantizer; + int keyquantizer; + int auto_worst_q; + int filter_type; + int cpu_used; + int chroma_boost; + int horiz_scale; + int vert_scale; + int pass; + + + int prob_intra_coded; + int prob_last_coded; + int prob_gf_coded; + int prob_skip_false; + int last_skip_false_probs[3]; + int last_skip_probs_q[3]; + int recent_ref_frame_usage[MAX_REF_FRAMES]; + + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + int this_frame_percent_intra; + int last_frame_percent_intra; + + int last_key_frame_q; + int last_kffilt_lvl; + + int ref_frame_flags; + + int exp[512]; + + SPEED_FEATURES sf; + int error_bins[1024]; + + int inter_lvl; + int intra_lvl; + int motion_lvl; + int motion_speed; + int motion_var; + int next_iiratio; + int this_iiratio; + int this_frame_modified_error; + + double norm_intra_err_per_mb; + double norm_inter_err_per_mb; + double norm_iidiff_per_mb; + + int last_best_mode_index; // Record of mode index chosen for previous macro block. + int last_auto_filt_val; + int last_auto_filt_q; + + // Data used for real time conferencing mode to help determine if it would be good to update the gf + int inter_zz_count; + int gf_bad_count; + int gf_update_recommended; + int skip_true_count; + int skip_false_count; + + int alt_qcount; + + int ready_for_new_frame; + + unsigned char *segmentation_map; + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment data (can be deltas or absolute values) + int segment_encode_breakout[MAX_MB_SEGMENTS]; // segment threashold for encode breakout + + unsigned char *active_map; + unsigned int active_map_enabled; + // Video conferencing cyclic refresh mode flags etc + // This is a mode designed to clean up the background over time in live encoding scenarious. 
It uses segmentation + int cyclic_refresh_mode_enabled; + int cyclic_refresh_mode_max_mbs_perframe; + int cyclic_refresh_mode_index; + int cyclic_refresh_q; + signed char *cyclic_refresh_map; + + // multithread data + int current_mb_col_main; + int processor_core_count; + int b_multi_threaded; + int encoding_thread_count; + +#if CONFIG_MULTITHREAD + pthread_t *h_encoding_thread; +#endif + MB_ROW_COMP *mb_row_ei; + ENCODETHREAD_DATA *en_thread_data; + +#if CONFIG_MULTITHREAD + //events + sem_t *h_event_mbrencoding; + sem_t h_event_main; +#endif + + TOKENLIST *tplist; + // end of multithread data + + + fractional_mv_step_fp *find_fractional_mv_step; + vp8_full_search_fn_t full_search_sad; + vp8_diamond_search_fn_t diamond_search_sad; + vp8_variance_fn_ptr_t fn_ptr; + unsigned int time_receive_data; + unsigned int time_compress_data; + unsigned int time_pick_lpf; + unsigned int time_encode_mb_row; + + unsigned int tempdata1; + unsigned int tempdata2; + + int base_skip_false_prob[128]; + unsigned int section_is_low_motion; + unsigned int section_benefits_from_aggresive_q; + unsigned int section_is_fast_motion; + unsigned int section_intra_rating; + + double section_max_qfactor; + + +#if CONFIG_RUNTIME_CPU_DETECT + VP8_ENCODER_RTCD rtcd; +#endif +#if VP8_TEMPORAL_ALT_REF + SOURCE_SAMPLE alt_ref_buffer; + unsigned char *frames[MAX_LAG_BUFFERS]; + int fixed_divide[255]; +#endif + +#if CONFIG_PSNR + int count; + double total_y; + double total_u; + double total_v; + double total ; + double total_sq_error; + double totalp_y; + double totalp_u; + double totalp_v; + double totalp; + double total_sq_error2; + int bytes; + double summed_quality; + double summed_weights; + unsigned int tot_recode_hits; + + + double total_ssimg_y; + double total_ssimg_u; + double total_ssimg_v; + double total_ssimg_all; + + int b_calculate_ssimg; +#endif + int b_calculate_psnr; +} VP8_COMP; + +void control_data_rate(VP8_COMP *cpi); + +void vp8_encode_frame(VP8_COMP *cpi); + +void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size); + +int rd_cost_intra_mb(MACROBLOCKD *x); + +void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **); + +void vp8_set_speed_features(VP8_COMP *cpi); + +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(lval,expr) do {\ + lval = (expr); \ + if(!lval) \ + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ + "Failed to allocate "#lval" at %s:%d", \ + __FILE__,__LINE__);\ + } while(0) +#else +#define CHECK_MEM_ERROR(lval,expr) do {\ + lval = (expr); \ + if(!lval) \ + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ + "Failed to allocate "#lval);\ + } while(0) +#endif +#endif diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp new file mode 100644 index 000000000..66fdafb1a --- /dev/null +++ b/vp8/encoder/parms.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#if 0 + +#include <map> +#include <string> +#include <fstream> +extern "C" +{ + #include "onyx.h" +} + + +using namespace std; + +typedef map<string,int> Parms; + +#define ALLPARMS(O,DOTHIS) \ + DOTHIS(O, interquantizer )\ + DOTHIS(O, auto_gold )\ + DOTHIS(O, auto_adjust_gold_quantizer )\ + DOTHIS(O, goldquantizer )\ + DOTHIS(O, goldfreq )\ + DOTHIS(O, auto_key )\ + DOTHIS(O, auto_adjust_key_quantizer )\ + DOTHIS(O, keyquantizer )\ + DOTHIS(O, keyfreq )\ + DOTHIS(O, pass )\ + DOTHIS(O, fixed_q )\ + DOTHIS(O, target_bandwidth )\ + DOTHIS(O, auto_worst_q )\ + DOTHIS(O, worst_quality )\ + DOTHIS(O, best_allowed_q )\ + DOTHIS(O, end_usage )\ + DOTHIS(O, starting_buffer_level )\ + DOTHIS(O, optimal_buffer_level )\ + DOTHIS(O, maximum_buffer_size )\ + DOTHIS(O, under_shoot_pct )\ + DOTHIS(O, allow_df )\ + DOTHIS(O, drop_frames_water_mark )\ + DOTHIS(O, max_allowed_datarate )\ + DOTHIS(O, two_pass_vbrbias )\ + DOTHIS(O, two_pass_vbrmin_section )\ + DOTHIS(O, two_pass_vbrmax_section )\ + DOTHIS(O, filter_type )\ + DOTHIS(O, compressor_speed )\ + DOTHIS(O, mbpitch_feature )\ + DOTHIS(O, allow_spatial_resampling )\ + DOTHIS(O, resample_down_water_mark )\ + DOTHIS(O, resample_up_water_mark )\ + DOTHIS(O, noise_sensitivity )\ + DOTHIS(O, horiz_scale )\ + DOTHIS(O, vert_scale ) + + +#define GET(O,V) O->V = x[#V]; +#define PUT(O,V) x[#V] = O->V; + + +extern "C" void get_parms(VP8_CONFIG *ocf,char *filename) +{ + + Parms x; + int value; + string variable; + string equal; + + ifstream config_file(filename); + + ALLPARMS(ocf, PUT); + + // store all the parms in a map (really simple parsing) + while(!config_file.eof() && config_file.is_open()) + { + config_file >> variable; + config_file >> equal; + + if(equal != "=") + continue; + + config_file >> value; + + x[variable] = value; + } + + ALLPARMS(ocf, GET); + +} + +#define PRINT(O,V) debug_file<<#V <<" = " << O->V <<"\n"; +extern "C" void print_parms(VP8_CONFIG *ocf,char *filename) +{ + ofstream debug_file(filename,ios_base::app); + ALLPARMS(ocf, PRINT); + debug_file << "=============================================="<<"\n"; +} + +#endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c new file mode 100644 index 000000000..d61e2ceda --- /dev/null +++ b/vp8/encoder/pickinter.c @@ -0,0 +1,923 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <limits.h> +#include "vpx_ports/config.h" +#include "onyx_int.h" +#include "modecosts.h" +#include "encodeintra.h" +#include "entropymode.h" +#include "pickinter.h" +#include "findnearmv.h" +#include "encodemb.h" +#include "reconinter.h" +#include "reconintra.h" +#include "reconintra4x4.h" +#include "g_common.h" +#include "variance.h" +#include "mcomp.h" + +#include "vpx_mem/vpx_mem.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif + +extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd); + +#ifdef SPEEDSTATS +extern unsigned int cnt_pm; +#endif + +extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES]; +extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; + + +extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); +extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel); +extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); +extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv); + + +int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]) +{ + (void) b; + (void) d; + (void) ref_mv; + (void) error_per_bit; + (void) svf; + (void) vf; + (void) mvcost; + bestmv->row <<= 3; + bestmv->col <<= 3; + return 0; +} + + +static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, unsigned int *sse) +{ + + BLOCK *b = &mb->block[0]; + BLOCKD *d = &mb->e_mbd.block[0]; + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what = *(d->base_pre) + d->pre ; + int in_what_stride = d->pre_stride; + int xoffset = d->bmi.mv.as_mv.col & 7; + int yoffset = d->bmi.mv.as_mv.row & 7; + + in_what += (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (xoffset | yoffset) + { + return svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse); + } + else + { + return vf(what, what_stride, in_what, in_what_stride, sse); + } + +} + +unsigned int vp8_get16x16pred_error_c +( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad +) +{ + unsigned pred_error = 0; + int i, j; + int sum = 0; + + for (i = 0; i < 16; i++) + { + int diff; + + for (j = 0; j < 16; j++) + { + diff = src_ptr[j] - ref_ptr[j]; + sum += diff; + pred_error += diff * diff; + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + pred_error -= sum * sum / 256; + return pred_error; +} + + +unsigned int vp8_get4x4sse_cs_c +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + int max_sad +) +{ + int distortion = 0; + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int diff = src_ptr[c] - ref_ptr[c]; + distortion += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } + + return distortion; +} + +static int get_prediction_error(BLOCK *be, BLOCKD *b, const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned char *sptr; + unsigned char *dptr; + sptr = (*(be->base_src) + be->src); + dptr = b->predictor; + + 
return VARIANCE_INVOKE(rtcd, get4x4sse_cs)(sptr, be->src_stride, dptr, 16, 0x7fffffff); + +} + +static int pick_intra4x4block( + const VP8_ENCODER_RTCD *rtcd, + MACROBLOCK *x, + BLOCK *be, + BLOCKD *b, + B_PREDICTION_MODE *best_mode, + B_PREDICTION_MODE above, + B_PREDICTION_MODE left, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + + int *bestrate, + int *bestdistortion) +{ + B_PREDICTION_MODE mode; + int best_rd = INT_MAX; // 1<<30 + int rate; + int distortion; + unsigned int *mode_costs; + (void) l; + (void) a; + + if (x->e_mbd.frame_type == KEY_FRAME) + { + mode_costs = x->bmode_costs[above][left]; + } + else + { + mode_costs = x->inter_bmode_costs; + } + + for (mode = B_DC_PRED; mode <= B_HE_PRED /*B_HU_PRED*/; mode++) + { + int this_rd; + + rate = mode_costs[mode]; + vp8_predict_intra4x4(b, mode, b->predictor); + distortion = get_prediction_error(be, b, &rtcd->variance); + this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) + { + *bestrate = rate; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + } + } + + b->bmi.mode = (B_PREDICTION_MODE)(*best_mode); + vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode); + return best_rd; +} + + +int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int *Rate, int *best_dist) +{ + MACROBLOCKD *const xd = &mb->e_mbd; + int i; + TEMP_CONTEXT t; + int cost = mb->mbmode_cost [xd->frame_type] [B_PRED]; + int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode + int distortion = 0; + + vp8_intra_prediction_down_copy(xd); + vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4); + + for (i = 0; i < 16; i++) + { + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode; + const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode; + B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d); + + error += pick_intra4x4block(rtcd, + mb, mb->block + i, xd->block + i, &best_mode, A, L, + t.a + vp8_block2above[i], + t.l + vp8_block2left[i], &r, &d); + + cost += r; + distortion += d; + + mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode; + + // Break out case where we have already exceeded best so far value that was bassed in + if (distortion > *best_dist) + break; + } + + for (i = 0; i < 16; i++) + xd->block[i].bmi.mv.as_int = 0; + + *Rate = cost; + + if (i == 16) + *best_dist = distortion; + else + *best_dist = INT_MAX; + + + return error; +} + +int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb) +{ + + MACROBLOCKD *x = &mb->e_mbd; + unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride; + unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride; + unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src); + unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src); + int uvsrc_stride = mb->block[16].src_stride; + unsigned char uleft_col[8]; + unsigned char vleft_col[8]; + unsigned char utop_left = uabove_row[-1]; + unsigned char vtop_left = vabove_row[-1]; + int i, j; + int expected_udc; + int expected_vdc; + int shift; + int Uaverage = 0; + int Vaverage = 0; + int diff; + int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX; + MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + + + for (i = 0; i < 8; i++) + { + uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1]; + vleft_col[i] = x->dst.v_buffer [i* 
x->dst.uv_stride -1]; + } + + if (!x->up_available && !x->left_available) + { + expected_udc = 128; + expected_vdc = 128; + } + else + { + shift = 2; + + if (x->up_available) + { + + for (i = 0; i < 8; i++) + { + Uaverage += uabove_row[i]; + Vaverage += vabove_row[i]; + } + + shift ++; + + } + + if (x->left_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uleft_col[i]; + Vaverage += vleft_col[i]; + } + + shift ++; + + } + + expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; + expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; + } + + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + + int predu = uleft_col[i] + uabove_row[j] - utop_left; + int predv = vleft_col[i] + vabove_row[j] - vtop_left; + int u_p, v_p; + + u_p = usrc_ptr[j]; + v_p = vsrc_ptr[j]; + + if (predu < 0) + predu = 0; + + if (predu > 255) + predu = 255; + + if (predv < 0) + predv = 0; + + if (predv > 255) + predv = 255; + + + diff = u_p - expected_udc; + pred_error[DC_PRED] += diff * diff; + diff = v_p - expected_vdc; + pred_error[DC_PRED] += diff * diff; + + + diff = u_p - uabove_row[j]; + pred_error[V_PRED] += diff * diff; + diff = v_p - vabove_row[j]; + pred_error[V_PRED] += diff * diff; + + + diff = u_p - uleft_col[i]; + pred_error[H_PRED] += diff * diff; + diff = v_p - vleft_col[i]; + pred_error[H_PRED] += diff * diff; + + + diff = u_p - predu; + pred_error[TM_PRED] += diff * diff; + diff = v_p - predv; + pred_error[TM_PRED] += diff * diff; + + + } + + usrc_ptr += uvsrc_stride; + vsrc_ptr += uvsrc_stride; + + if (i == 3) + { + usrc_ptr = (mb->block[18].src + *mb->block[18].base_src); + vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src); + } + + + + } + + + for (i = DC_PRED; i <= TM_PRED; i++) + { + if (best_error > pred_error[i]) + { + best_error = pred_error[i]; + best_mode = (MB_PREDICTION_MODE)i; + } + } + + + mb->e_mbd.mbmi.uv_mode = best_mode; + return best_error; + +} + + +int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) +{ + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + B_MODE_INFO best_bmodes[16]; + MB_MODE_INFO best_mbmode; + MV best_ref_mv1; + MV mode_mv[MB_MODE_COUNT]; + MB_PREDICTION_MODE this_mode; + int num00; + int i; + int mdcounts[4]; + int best_rd = INT_MAX; // 1 << 30; + int best_intra_rd = INT_MAX; + int mode_index; + int ref_frame_cost[MAX_REF_FRAMES]; + int rate; + int rate2; + int distortion2; + int bestsme; + //int all_rds[MAX_MODES]; // Experimental debug code. + int best_mode_index = 0; + int sse = INT_MAX; + + MV nearest_mv[4]; + MV near_mv[4]; + MV best_ref_mv[4]; + int MDCounts[4][4]; + unsigned char *y_buffer[4]; + unsigned char *u_buffer[4]; + unsigned char *v_buffer[4]; + + int skip_mode[4] = {0, 0, 0, 0}; + + vpx_memset(mode_mv, 0, sizeof(mode_mv)); + vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); + vpx_memset(near_mv, 0, sizeof(near_mv)); + + + // set up all the refframe dependent pointers. 
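vp8_pick_intra_mbuv_mode() above picks the chroma intra mode by accumulating squared prediction error for DC, V, H and TM. Its DC predictor is a rounded average over whichever of the above row and left column of reconstructed samples are available, falling back to 128 when neither is. A small sketch of that averaging; the helper name and sample data are invented:

#include <stdio.h>

static int chroma_dc_predictor(const unsigned char *above, int up_available,
                               const unsigned char *left, int left_available)
{
    int sum = 0;
    int shift = 2;   /* base value; each available edge adds 8 samples and one shift */
    int i;

    if (!up_available && !left_available)
        return 128;

    if (up_available)
    {
        for (i = 0; i < 8; i++)
            sum += above[i];
        shift++;
    }

    if (left_available)
    {
        for (i = 0; i < 8; i++)
            sum += left[i];
        shift++;
    }

    return (sum + (1 << (shift - 1))) >> shift;   /* rounded average */
}

int main(void)
{
    unsigned char above[8] = { 100, 100, 100, 100, 100, 100, 100, 100 };
    unsigned char left[8]  = { 120, 120, 120, 120, 120, 120, 120, 120 };

    printf("%d\n", chroma_dc_predictor(above, 1, left, 1));   /* prints 110 */
    printf("%d\n", chroma_dc_predictor(above, 1, left, 0));   /* prints 100 */
    return 0;
}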
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME], + &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[LAST_FRAME] = cpi->common.last_frame.y_buffer + recon_yoffset; + u_buffer[LAST_FRAME] = cpi->common.last_frame.u_buffer + recon_uvoffset; + v_buffer[LAST_FRAME] = cpi->common.last_frame.v_buffer + recon_uvoffset; + } + else + skip_mode[LAST_FRAME] = 1; + + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME], + &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.y_buffer + recon_yoffset; + u_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.u_buffer + recon_uvoffset; + v_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.v_buffer + recon_uvoffset; + } + else + skip_mode[GOLDEN_FRAME] = 1; + + if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active) + { + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME], + &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.y_buffer + recon_yoffset; + u_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset; + v_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset; + } + else + skip_mode[ALTREF_FRAME] = 1; + + cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame + + *returnintra = best_intra_rd; + x->skip = 0; + + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); + + // Special case treatment when GF and ARF are not sensible options for reference + if (cpi->ref_frame_flags == VP8_LAST_FLAG) + { + ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(255); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(255) + + vp8_cost_zero(128); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(255) + + vp8_cost_one(128); + } + else + { + ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(cpi->prob_last_coded); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_zero(cpi->prob_gf_coded); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_one(cpi->prob_gf_coded); + } + + + + best_rd = INT_MAX; + + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + + // if we encode a new mv this is important + // find the best new motion vector + for (mode_index = 0; mode_index < MAX_MODES; mode_index++) + { + int frame_cost; + int this_rd = INT_MAX; + + if (best_rd <= cpi->rd_threshes[mode_index]) + continue; + + x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + + if (skip_mode[x->e_mbd.mbmi.ref_frame]) + continue; + + // Check to see if the testing frequency for this mode is at its max + // If so then prevent it from being tested and increase the threshold for its testing + if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + { + //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] ) + if (cpi->mbs_tested_so_far <= 
(cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])) + { + // Increase the threshold for coding this mode to make it less likely to be chosen + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + + continue; + } + } + + // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested + cpi->mode_test_hit_counts[mode_index] ++; + + rate2 = 0; + distortion2 = 0; + + this_mode = vp8_mode_order[mode_index]; + + // Experimental debug code. + //all_rds[mode_index] = -1; + + x->e_mbd.mbmi.mode = this_mode; + x->e_mbd.mbmi.uv_mode = DC_PRED; + + // Work out the cost assosciated with selecting the reference frame + frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame]; + rate2 += frame_cost; + + // everything but intra + if (x->e_mbd.mbmi.ref_frame) + { + x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mbmi.ref_frame]; + x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mbmi.ref_frame]; + x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mbmi.ref_frame]; + mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mbmi.ref_frame]; + mode_mv[NEARMV] = near_mv[x->e_mbd.mbmi.ref_frame]; + best_ref_mv1 = best_ref_mv[x->e_mbd.mbmi.ref_frame]; + memcpy(mdcounts, MDCounts[x->e_mbd.mbmi.ref_frame], sizeof(mdcounts)); + } + + //Only consider ZEROMV/ALTREF_FRAME for alt ref frame. + if (cpi->is_src_frame_alt_ref) + { + if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME) + continue; + } + + switch (this_mode) + { + case B_PRED: + distortion2 = *returndistortion; // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes + vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); + rate2 += rate; + distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); + + if (distortion2 == INT_MAX) + { + this_rd = INT_MAX; + } + else + { + this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = best_intra_rd ; + } + } + + break; + + case SPLITMV: + + // Split MV modes currently not supported when RD is nopt enabled. + break; + + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: + vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); + rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode]; + this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = best_intra_rd ; + } + + break; + + case NEWMV: + { + int thissme; + int step_param; + int further_steps; + int n = 0; + int sadpb = x->sadperbit16; + + // Further step/diamond searches as necessary + if (cpi->Speed < 8) + { + step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 
1 : 0); + further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + } + else + { + step_param = cpi->sf.first_step + 2; + further_steps = 0; + } + +#if 0 + + // Initial step Search + bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost); + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + + // Further step searches + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost); + + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + } + else + { + d->bmi.mv.as_mv.row = mode_mv[NEWMV].row; + d->bmi.mv.as_mv.col = mode_mv[NEWMV].col; + } + } + } + +#else + + if (cpi->sf.search_method == HEX) + { + bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost); + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9 + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + + // Further step/diamond searches as necessary + n = 0; + //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9 + + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + } + else + { + d->bmi.mv.as_mv.row = mode_mv[NEWMV].row; + d->bmi.mv.as_mv.col = mode_mv[NEWMV].col; + } + } + } + } + +#endif + } + + if (bestsme < INT_MAX) + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, cpi->fn_ptr.svf, cpi->fn_ptr.vf, cpi->mb.mvcost); + + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + + // mv cost; + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128); + + + case NEARESTMV: + case NEARMV: + + if (mode_mv[this_mode].row == 0 && mode_mv[this_mode].col == 0) + continue; + + case ZEROMV: + + // Trap vectors that reach beyond the UMV borders + // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point + // because of the lack of break statements in the previous two cases. 
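+                // The motion vectors here are stored in eighth-pel units, so the
+                // >> 3 in the test below converts each component to whole pels before
+                // it is compared against the UMV clamping range (mv_row_min/max,
+                // mv_col_min/max); anything outside that range would read past the
+                // border-extended reference frame.  Candidates that survive the trap
+                // are then scored with RD_ESTIMATE(rdmult, rddiv, rate, distortion),
+                // i.e. ((128 + rate * rdmult) >> 8) + rddiv * distortion (pickinter.h).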
+ if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max)) + continue; + + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + x->e_mbd.mbmi.mode = this_mode; + x->e_mbd.mbmi.mv.as_mv = mode_mv[this_mode]; + x->e_mbd.block[0].bmi.mode = this_mode; + x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mbmi.mv.as_int; + + distortion2 = get_inter_mbpred_error(x, cpi->fn_ptr.svf, cpi->fn_ptr.vf, (unsigned int *)(&sse)); + + this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + { + x->skip = 1; + } + else if (sse < x->encode_breakout) + { + // Check u and v to make sure skip is ok + int sse2 = 0; + + sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); + + if (sse2 * 2 < x->encode_breakout) + x->skip = 1; + else + x->skip = 0; + } + + break; + default: + break; + } + + // Experimental debug code. + //all_rds[mode_index] = this_rd; + + if (this_rd < best_rd || x->skip) + { + // Note index of best mode + best_mode_index = mode_index; + + *returnrate = rate2; + *returndistortion = distortion2; + best_rd = this_rd; + vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO)); + + if (this_mode == B_PRED || this_mode == SPLITMV) + for (i = 0; i < 16; i++) + { + vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO)); + } + else + { + best_bmodes[0].mv = x->e_mbd.block[0].bmi.mv; + } + + // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time + cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + else + { + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + if (x->skip) + break; + } + + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) + { + int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3); + + cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? 
cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; + cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; + } + + // Keep a record of best mode index for use in next loop + cpi->last_best_mode_index = best_mode_index; + + if (best_mbmode.mode <= B_PRED) + { + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + vp8_pick_intra_mbuv_mode(x); + best_mbmode.uv_mode = x->e_mbd.mbmi.uv_mode; + } + + + { + int this_rdbin = (*returndistortion >> 7); + + if (this_rdbin >= 1024) + { + this_rdbin = 1023; + } + + cpi->error_bins[this_rdbin] ++; + } + + + if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) + { + best_mbmode.mode = ZEROMV; + best_mbmode.ref_frame = ALTREF_FRAME; + best_mbmode.mv.as_int = 0; + best_mbmode.uv_mode = 0; + best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; + best_mbmode.partitioning = 0; + best_mbmode.dc_diff = 0; + + vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + for (i = 0; i < 16; i++) + { + vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO)); + } + + x->e_mbd.mbmi.mv.as_int = 0; + + return best_rd; + } + + + // macroblock modes + vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + if (x->e_mbd.mbmi.mode == B_PRED || x->e_mbd.mbmi.mode == SPLITMV) + for (i = 0; i < 16; i++) + { + vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO)); + + } + else + { + vp8_set_mbmode_and_mvs(x, x->e_mbd.mbmi.mode, &best_bmodes[0].mv.as_mv); + } + + x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv; + + return best_rd; +} diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h new file mode 100644 index 000000000..fb28837ed --- /dev/null +++ b/vp8/encoder/pickinter.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_PICKINTER_H +#define __INC_PICKINTER_H +#include "vpx_ports/config.h" +#include "onyxc_int.h" + +#define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion); +extern int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb); +extern int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); +#endif diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c new file mode 100644 index 000000000..bbd7840b8 --- /dev/null +++ b/vp8/encoder/picklpf.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "onyxc_int.h" +#include "onyx_int.h" +#include "quantize.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12extend.h" +#include "vpx_scale/vpxscale.h" +#include "alloccommon.h" + +extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); +extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl); +extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); +#if HAVE_ARMV7 +extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +#endif + +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif + +extern void +(*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, + int Fraction); +void +vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) +{ + unsigned char *src_y, *dst_y; + int yheight; + int ystride; + int border; + int yoffset; + int linestocopy; + + border = src_ybc->border; + yheight = src_ybc->y_height; + ystride = src_ybc->y_stride; + + linestocopy = (yheight >> (Fraction + 4)); + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + yoffset = ystride * ((yheight >> 5) * 16 - 8); + src_y = src_ybc->y_buffer + yoffset; + dst_y = dst_ybc->y_buffer + yoffset; + + vpx_memcpy(dst_y, src_y, ystride *(linestocopy + 16)); +} + +static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int Fraction, const vp8_variance_rtcd_vtable_t *rtcd) +{ + int i, j; + int Total = 0; + int srcoffset, dstoffset; + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + + int linestocopy = (source->y_height >> (Fraction + 4)); + (void)rtcd; + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + + srcoffset = source->y_stride * (dest->y_height >> 5) * 16; + dstoffset = dest->y_stride * (dest->y_height >> 5) * 16; + + src += srcoffset; + dst += dstoffset; + + // Loop through the Y plane raw and reconstruction data summing (square differences) + for (i = 0; i < linestocopy; i += 16) + { + for (j = 0; j < source->y_width; j += 16) + { + unsigned int sse; + Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + } + + src += 16 * source->y_stride; + dst += 16 * dest->y_stride; + } + + return Total; +} + +extern void vp8_loop_filter_partial_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl, + int Fraction +); + +// Enforce a minimum filter level based upon baseline Q +static int get_min_filter_level(VP8_COMP *cpi, int base_qindex) +{ + int min_filter_level; + + if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame) + min_filter_level = 0; + else + { + if (base_qindex <= 6) + min_filter_level = 0; + else if (base_qindex <= 16) + min_filter_level = 1; + else + min_filter_level = (base_qindex / 8); + } + + return min_filter_level; +} + +// Enforce a maximum filter level based upon baseline Q +static int get_max_filter_level(VP8_COMP *cpi, int base_qindex) +{ + // PGW August 2006: Highest filter values almost always a bad idea + + // jbb chg: 20100118 - not so any more with this overquant stuff allow high values + // with lots of intra coming in. 
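The two partial-frame helpers above (vp8_yv12_copy_partial_frame and vp8_calc_partial_ssl_err) only look at a horizontal band taken from roughly the middle of the picture rather than the whole frame. A minimal sketch of that arithmetic, using made-up dimensions (720 luma lines, stride 768) purely for illustration and not part of the patch:

#include <stdio.h>

int main(void)
{
    int yheight  = 720;   /* hypothetical luma height, not from the patch */
    int ystride  = 768;   /* hypothetical luma stride */
    int Fraction = 3;     /* value passed in by vp8cx_pick_filter_level_fast */
    int linestocopy, yoffset;

    linestocopy = yheight >> (Fraction + 4);   /* 720 >> 7 = 5 macroblock rows */
    if (linestocopy < 1)
        linestocopy = 1;
    linestocopy <<= 4;   /* 80 lines: yheight / 2^Fraction, rounded down to MB rows */

    /* band starts a little above the MB-aligned halfway point: (720 >> 5) * 16 - 8 = 344 */
    yoffset = ystride * ((yheight >> 5) * 16 - 8);

    printf("measure %d lines starting %d bytes into the luma plane\n",
           linestocopy, yoffset);
    return 0;
}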
+ int max_filter_level = MAX_LOOP_FILTER ;//* 3 / 4; + + if (cpi->section_intra_rating > 8) + max_filter_level = MAX_LOOP_FILTER * 3 / 4; + + (void) cpi; + (void) base_qindex; + + return max_filter_level; +} + +void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + int best_err = 0; + int filt_err = 0; + int min_filter_level = 0; + int max_filter_level = MAX_LOOP_FILTER * 3 / 4; // PGW August 2006: Highest filter values almost always a bad idea + int filt_val; + int best_filt_val = cm->filter_level; + + // Make a copy of the unfiltered / processed recon buffer + //vp8_yv12_copy_frame_ptr( cm->frame_to_show, &cpi->last_frame_uf ); + vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3); + + if (cm->frame_type == KEY_FRAME) + cm->sharpness_level = 0; + else + cm->sharpness_level = cpi->oxcf.Sharpness; + + // Enforce a minimum filter level based upon Q + min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + + // Start the search at the previous frame filter level unless it is now out of range. + if (cm->filter_level < min_filter_level) + cm->filter_level = min_filter_level; + else if (cm->filter_level > max_filter_level) + cm->filter_level = max_filter_level; + + filt_val = cm->filter_level; + best_filt_val = filt_val; + + // Set up alternate filter values + + // Get the err using the previous frame's filter value. + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0 , 3); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + + // Re-instate the unfiltered frame + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); + + filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + + // Search lower filter levels + while (filt_val >= min_filter_level) + { + // Apply the loop filter + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + // Get the err for filtered frame + filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + + + // Re-instate the unfiltered frame + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); + + + // Update the best case record or exit loop. + if (filt_err < best_err) + { + best_err = filt_err; + best_filt_val = filt_val; + } + else + break; + + // Adjust filter level + filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); + } + + // Search up (note that we have already done filt_val = cm->filter_level) + filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 
1 : 0)); + + if (best_filt_val == cm->filter_level) + { + // Resist raising filter level for very small gains + best_err -= (best_err >> 10); + + while (filt_val < max_filter_level) + { + // Apply the loop filter + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + // Get the err for filtered frame + filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + + // Re-instate the unfiltered frame + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); + + // Update the best case record or exit loop. + if (filt_err < best_err) + { + // Do not raise filter level if improvement is < 1 part in 4096 + best_err = filt_err - (filt_err >> 10); + + best_filt_val = filt_val; + } + else + break; + + // Adjust filter level + filt_val += (1 + ((filt_val > 10) ? 1 : 0)); + } + } + + cm->filter_level = best_filt_val; + + if (cm->filter_level < min_filter_level) + cm->filter_level = min_filter_level; + + if (cm->filter_level > max_filter_level) + cm->filter_level = max_filter_level; +} + +// Stub function for now Alt LF not used +void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val) +{ + MACROBLOCKD *mbd = &cpi->mb.e_mbd; + (void) filt_val; + + mbd->segment_feature_data[MB_LVL_ALT_LF][0] = cpi->segment_feature_data[MB_LVL_ALT_LF][0]; + mbd->segment_feature_data[MB_LVL_ALT_LF][1] = cpi->segment_feature_data[MB_LVL_ALT_LF][1]; + mbd->segment_feature_data[MB_LVL_ALT_LF][2] = cpi->segment_feature_data[MB_LVL_ALT_LF][2]; + mbd->segment_feature_data[MB_LVL_ALT_LF][3] = cpi->segment_feature_data[MB_LVL_ALT_LF][3]; +} + +void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + int best_err = 0; + int filt_err = 0; + int min_filter_level; + int max_filter_level; + int prediction_difference = (int)(100 * abs((int)(cpi->last_auto_filter_prediction_error - cpi->prediction_error)) / (1 + cpi->prediction_error)); + + int filter_step; + int filt_high = 0; + int filt_mid = cm->filter_level; // Start search at previous frame filter level + int filt_low = 0; + int filt_best; + int filt_direction = 0; + + int Bias = 0; // Bias against raising loop filter and in favour of lowering it + + // Make a copy of the unfiltered / processed recon buffer +#if HAVE_ARMV7 + vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf); +#else + vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf); +#endif + + if (cm->frame_type == KEY_FRAME) + cm->sharpness_level = 0; + else + cm->sharpness_level = cpi->oxcf.Sharpness; + + // Enforce a minimum filter level based upon Q + min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + + // Start the search at the previous frame filter level unless it is now out of range. + filt_mid = cm->filter_level; + + if (filt_mid < min_filter_level) + filt_mid = min_filter_level; + else if (filt_mid > max_filter_level) + filt_mid = max_filter_level; + + // Define the initial step size + filter_step = (filt_mid < 16) ? 
4 : filt_mid / 4; + + // Get baseline error score + vp8cx_set_alt_lf_level(cpi, filt_mid); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid, 0); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + filt_best = filt_mid; + + // Re-instate the unfiltered frame +#if HAVE_ARMV7 + vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); +#else + vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show); +#endif + + while (filter_step > 0) + { + Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; //PGW change 12/12/06 for small images + + // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value + if (cpi->section_intra_rating < 20) + Bias = Bias * cpi->section_intra_rating / 20; + + filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step); + filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step); + + if ((filt_direction <= 0) && (filt_low != filt_mid)) + { + // Get Low filter error score + vp8cx_set_alt_lf_level(cpi, filt_low); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low, 0); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + + // Re-instate the unfiltered frame +#if HAVE_ARMV7 + vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); +#else + vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show); +#endif + + // If value is close to the best so far then bias towards a lower loop filter value. + if ((filt_err - Bias) < best_err) + { + // Was it actually better than the previous best? + if (filt_err < best_err) + best_err = filt_err; + + filt_best = filt_low; + } + } + + // Now look at filt_high + if ((filt_direction >= 0) && (filt_high != filt_mid)) + { + vp8cx_set_alt_lf_level(cpi, filt_high); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high, 0); + cm->last_frame_type = cm->frame_type; + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + + // Re-instate the unfiltered frame +#if HAVE_ARMV7 + vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); +#else + vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show); +#endif + + // Was it better than the previous best? + if (filt_err < (best_err - Bias)) + { + best_err = filt_err; + filt_best = filt_high; + } + } + + // Half the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) + { + filter_step = filter_step / 2; + filt_direction = 0; + } + else + { + filt_direction = (filt_best < filt_mid) ? 
-1 : 1; + filt_mid = filt_best; + } + } + + cm->filter_level = filt_best; + cpi->last_auto_filt_val = filt_best; + cpi->last_auto_filt_q = cm->base_qindex; + + cpi->last_auto_filter_prediction_error = cpi->prediction_error; + cpi->frames_since_auto_filter = 0; +} diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c new file mode 100644 index 000000000..f99277f99 --- /dev/null +++ b/vp8/encoder/ppc/csystemdependent.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "variance.h" +#include "onyx_int.h" + +SADFunction *vp8_sad16x16; +SADFunction *vp8_sad16x8; +SADFunction *vp8_sad8x16; +SADFunction *vp8_sad8x8; +SADFunction *vp8_sad4x4; + +variance_function *vp8_variance4x4; +variance_function *vp8_variance8x8; +variance_function *vp8_variance8x16; +variance_function *vp8_variance16x8; +variance_function *vp8_variance16x16; + +variance_function *vp8_mse16x16; + +sub_pixel_variance_function *vp8_sub_pixel_variance4x4; +sub_pixel_variance_function *vp8_sub_pixel_variance8x8; +sub_pixel_variance_function *vp8_sub_pixel_variance8x16; +sub_pixel_variance_function *vp8_sub_pixel_variance16x8; +sub_pixel_variance_function *vp8_sub_pixel_variance16x16; + +int (*vp8_block_error)(short *coeff, short *dqcoeff); +int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc); + +int (*vp8_mbuverror)(MACROBLOCK *mb); +unsigned int (*vp8_get_mb_ss)(short *); +void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); +void (*short_walsh4x4)(short *input, short *output, int pitch); + +void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); +void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); +void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); + +unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// c imports +extern int block_error_c(short *coeff, short *dqcoeff); +extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc); + +extern int vp8_mbuverror_c(MACROBLOCK *mb); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern void short_fdct4x4_c(short *input, short *output, int pitch); +extern void short_fdct8x4_c(short *input, short *output, int pitch); +extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch); + +extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); +extern void subtract_mby_c(short *diff, 
unsigned char *src, unsigned char *pred, int stride); +extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); + +extern SADFunction sad16x16_c; +extern SADFunction sad16x8_c; +extern SADFunction sad8x16_c; +extern SADFunction sad8x8_c; +extern SADFunction sad4x4_c; + +extern variance_function variance16x16_c; +extern variance_function variance8x16_c; +extern variance_function variance16x8_c; +extern variance_function variance8x8_c; +extern variance_function variance4x4_c; +extern variance_function mse16x16_c; + +extern sub_pixel_variance_function sub_pixel_variance4x4_c; +extern sub_pixel_variance_function sub_pixel_variance8x8_c; +extern sub_pixel_variance_function sub_pixel_variance8x16_c; +extern sub_pixel_variance_function sub_pixel_variance16x8_c; +extern sub_pixel_variance_function sub_pixel_variance16x16_c; + +extern unsigned int vp8_get_mb_ss_c(short *); +extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// ppc +extern int vp8_block_error_ppc(short *coeff, short *dqcoeff); + +extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch); +extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch); + +extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride); +extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); + +extern SADFunction vp8_sad16x16_ppc; +extern SADFunction vp8_sad16x8_ppc; +extern SADFunction vp8_sad8x16_ppc; +extern SADFunction vp8_sad8x8_ppc; +extern SADFunction vp8_sad4x4_ppc; + +extern variance_function vp8_variance16x16_ppc; +extern variance_function vp8_variance8x16_ppc; +extern variance_function vp8_variance16x8_ppc; +extern variance_function vp8_variance8x8_ppc; +extern variance_function vp8_variance4x4_ppc; +extern variance_function vp8_mse16x16_ppc; + +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc; + +extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); + +void vp8_cmachine_specific_config(void) +{ + // Pure C: + vp8_mbuverror = vp8_mbuverror_c; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc; + vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc; + vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc; + vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc; + short_walsh4x4 = vp8_short_walsh4x4_c; + + vp8_variance4x4 = vp8_variance4x4_ppc; + vp8_variance8x8 = 
vp8_variance8x8_ppc; + vp8_variance8x16 = vp8_variance8x16_ppc; + vp8_variance16x8 = vp8_variance16x8_ppc; + vp8_variance16x16 = vp8_variance16x16_ppc; + vp8_mse16x16 = vp8_mse16x16_ppc; + + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc; + + vp8_get_mb_ss = vp8_get_mb_ss_c; + vp8_get16x16pred_error = vp8_get16x16pred_error_c; + vp8_get8x8var = vp8_get8x8var_ppc; + vp8_get16x16var = vp8_get16x16var_ppc; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; + + vp8_sad16x16 = vp8_sad16x16_ppc; + vp8_sad16x8 = vp8_sad16x8_ppc; + vp8_sad8x16 = vp8_sad8x16_ppc; + vp8_sad8x8 = vp8_sad8x8_ppc; + vp8_sad4x4 = vp8_sad4x4_ppc; + + vp8_block_error = vp8_block_error_ppc; + vp8_mbblock_error = vp8_mbblock_error_c; + + vp8_subtract_b = vp8_subtract_b_c; + vp8_subtract_mby = vp8_subtract_mby_ppc; + vp8_subtract_mbuv = vp8_subtract_mbuv_ppc; +} diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm new file mode 100644 index 000000000..e0e976d71 --- /dev/null +++ b/vp8/encoder/ppc/encodemb_altivec.asm @@ -0,0 +1,152 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl vp8_subtract_mbuv_ppc + .globl vp8_subtract_mby_ppc + +;# r3 short *diff +;# r4 unsigned char *usrc +;# r5 unsigned char *vsrc +;# r6 unsigned char *pred +;# r7 int stride +vp8_subtract_mbuv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf000 + mtspr 256, r12 ;# set VRSAVE + + li r9, 256 + add r3, r3, r9 + add r3, r3, r9 + add r6, r6, r9 + + li r10, 16 + li r9, 4 + mtctr r9 + + vspltisw v0, 0 + +mbu_loop: + lvsl v5, 0, r4 ;# permutate value for alignment + lvx v1, 0, r4 ;# src + lvx v2, 0, r6 ;# pred + + add r4, r4, r7 + addi r6, r6, 16 + + vperm v1, v1, v0, v5 + + vmrghb v3, v0, v1 ;# unpack high src to short + vmrghb v4, v0, v2 ;# unpack high pred to short + + lvsl v5, 0, r4 ;# permutate value for alignment + lvx v1, 0, r4 ;# src + + add r4, r4, r7 + + vsubshs v3, v3, v4 + + stvx v3, 0, r3 ;# store out diff + + vperm v1, v1, v0, v5 + + vmrghb v3, v0, v1 ;# unpack high src to short + vmrglb v4, v0, v2 ;# unpack high pred to short + + vsubshs v3, v3, v4 + + stvx v3, r10, r3 ;# store out diff + + addi r3, r3, 32 + + bdnz mbu_loop + + mtctr r9 + +mbv_loop: + lvsl v5, 0, r5 ;# permutate value for alignment + lvx v1, 0, r5 ;# src + lvx v2, 0, r6 ;# pred + + add r5, r5, r7 + addi r6, r6, 16 + + vperm v1, v1, v0, v5 + + vmrghb v3, v0, v1 ;# unpack high src to short + vmrghb v4, v0, v2 ;# unpack high pred to short + + lvsl v5, 0, r5 ;# permutate value for alignment + lvx v1, 0, r5 ;# src + + add r5, r5, r7 + + vsubshs v3, v3, v4 + + stvx v3, 0, r3 ;# store out diff + + vperm v1, v1, v0, v5 + + vmrghb v3, v0, v1 ;# unpack high src to short + vmrglb v4, v0, v2 ;# unpack high pred to short + + vsubshs v3, v3, v4 + + stvx v3, r10, r3 ;# store out diff + + addi r3, r3, 32 + + bdnz mbv_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# r3 short *diff +;# r4 unsigned char *src +;# r5 unsigned char *pred +;# r6 int stride +vp8_subtract_mby_ppc: + mfspr r11, 256 ;# get 
old VRSAVE + oris r12, r11, 0xf800 + mtspr 256, r12 ;# set VRSAVE + + li r10, 16 + mtctr r10 + + vspltisw v0, 0 + +mby_loop: + lvx v1, 0, r4 ;# src + lvx v2, 0, r5 ;# pred + + add r4, r4, r6 + addi r5, r5, 16 + + vmrghb v3, v0, v1 ;# unpack high src to short + vmrghb v4, v0, v2 ;# unpack high pred to short + + vsubshs v3, v3, v4 + + stvx v3, 0, r3 ;# store out diff + + vmrglb v3, v0, v1 ;# unpack low src to short + vmrglb v4, v0, v2 ;# unpack low pred to short + + vsubshs v3, v3, v4 + + stvx v3, r10, r3 ;# store out diff + + addi r3, r3, 32 + + bdnz mby_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm new file mode 100644 index 000000000..eaab14c79 --- /dev/null +++ b/vp8/encoder/ppc/fdct_altivec.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl vp8_short_fdct4x4_ppc + .globl vp8_short_fdct8x4_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +;# Forward and inverse DCTs are nearly identical; only differences are +;# in normalization (fwd is twice unitary, inv is half unitary) +;# and that they are of course transposes of each other. +;# +;# The following three accomplish most of implementation and +;# are used only by ppc_idct.c and ppc_fdct.c. +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfffc + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + li r6, 16 + + load_c v0, dct_tab, 0, r9, r10 + lvx v1, r6, r10 + addi r10, r10, 32 + lvx v2, 0, r10 + lvx v3, r6, r10 + + load_c v4, ppc_dctperm_tab, 0, r9, r10 + load_c v5, ppc_dctperm_tab, r6, r9, r10 + + load_c v6, round_tab, 0, r10, r9 +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3. +;# a/A are the even rows 0,2 b/B are the odd rows 1,3 +;# For fwd transform, indices are horizontal positions, then frequencies. +;# For inverse transform, frequencies then positions. +;# The two resulting A0..A3 B0..B3 are later combined +;# and vertically transformed. + +.macro two_rows_horiz Dst + vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1 + + vmsumshm v10, v0, v8, v6 + vmsumshm v10, v1, v9, v10 + vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1 + + vmsumshm v11, v2, v8, v6 + vmsumshm v11, v3, v9, v11 + vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3 + + vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3 + vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3 +.endm + +;# Vertical xf on two rows. DCT values in comments are for inverse transform; +;# forward transform uses transpose. 
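A plain-C sketch of the row/column pattern that two_rows_horiz and the two_rows_vert macro below implement together; coeffs, round and shift are placeholders standing in for dct_tab and round_tab, so this shows the separable two-pass structure rather than the exact fixed-point scaling of the production vp8_short_fdct4x4:

/* Generic separable 4x4 forward transform: one 1-D pass over the rows,
 * then the same transform down the columns of the intermediate result.
 * Accumulation is 32-bit, as with the vmsumshm-based code. */
static void fdct4x4_two_pass_sketch(const short *input, short *output, int pitch,
                                    const short coeffs[4][4], int round, int shift)
{
    int tmp[4][4];
    int r, c, f;

    /* horizontal pass: one dot product per (row, frequency) pair */
    for (r = 0; r < 4; r++)
        for (f = 0; f < 4; f++)
        {
            int acc = round;
            for (c = 0; c < 4; c++)
                acc += coeffs[f][c] * input[r * (pitch / 2) + c];  /* pitch is in bytes */
            tmp[r][f] = acc >> shift;
        }

    /* vertical pass: same transform applied to each column */
    for (c = 0; c < 4; c++)
        for (f = 0; f < 4; f++)
        {
            int acc = round;
            for (r = 0; r < 4; r++)
                acc += coeffs[f][r] * tmp[r][c];
            output[f * 4 + c] = (short)(acc >> shift);
        }
}

The AltiVec version keeps two rows packed per vector (a/A even rows, b/B odd rows, as the comments above note) and finishes the block in a handful of vmsumshm/vpkuwum operations, but the data flow is the same.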
+ +.macro two_rows_vert Ceven, Codd + vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times + vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 "" + vmsumshm v8, v8, v12, v6 + vmsumshm v8, v9, v13, v8 + vsraw v10, v8, v7 + + vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13 + vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33 + vmsumshm v8, v8, v12, v6 + vmsumshm v8, v9, v13, v8 + vsraw v8, v8, v7 + + vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3 +.endm + +.macro two_rows_h Dest + stw r0, 0(r8) + lwz r0, 4(r3) + stw r0, 4(r8) + lwzux r0, r3,r5 + stw r0, 8(r8) + lwz r0, 4(r3) + stw r0, 12(r8) + lvx v8, 0,r8 + two_rows_horiz \Dest +.endm + + .align 2 +;# r3 short *input +;# r4 short *output +;# r5 int pitch +vp8_short_fdct4x4_ppc: + + prologue + + vspltisw v7, 14 ;# == 14, fits in 5 signed bits + addi r8, r1, 0 + + + lwz r0, 0(r3) + two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 + + lwzux r0, r3, r5 + two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 + + lvx v6, r6, r9 ;# v6 = Vround + vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter + + two_rows_vert v0, v1 + stvx v8, 0, r4 + two_rows_vert v2, v3 + stvx v8, r6, r4 + + epilogue + + blr + + .align 2 +;# r3 short *input +;# r4 short *output +;# r5 int pitch +vp8_short_fdct8x4_ppc: + prologue + + vspltisw v7, 14 ;# == 14, fits in 5 signed bits + addi r8, r1, 0 + addi r10, r3, 0 + + lwz r0, 0(r3) + two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 + + lwzux r0, r3, r5 + two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 + + lvx v6, r6, r9 ;# v6 = Vround + vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter + + two_rows_vert v0, v1 + stvx v8, 0, r4 + two_rows_vert v2, v3 + stvx v8, r6, r4 + + ;# Next block + addi r3, r10, 8 + addi r4, r4, 32 + lvx v6, 0, r9 ;# v6 = Hround + + vspltisw v7, 14 ;# == 14, fits in 5 signed bits + addi r8, r1, 0 + + lwz r0, 0(r3) + two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 + + lwzux r0, r3, r5 + two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 + + lvx v6, r6, r9 ;# v6 = Vround + vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter + + two_rows_vert v0, v1 + stvx v8, 0, r4 + two_rows_vert v2, v3 + stvx v8, r6, r4 + + epilogue + + blr + + .data + .align 4 +ppc_dctperm_tab: + .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 + .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15 + + .align 4 +dct_tab: + .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274 + .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540 + + .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540 + .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274 + + .align 4 +round_tab: + .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1)) + .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1)) diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm new file mode 100644 index 000000000..917bfe036 --- /dev/null +++ b/vp8/encoder/ppc/rdopt_altivec.asm @@ -0,0 +1,50 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + .globl vp8_block_error_ppc + + .align 2 +;# r3 short *Coeff +;# r4 short *dqcoeff +vp8_block_error_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf800 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + stw r5, 12(r1) ;# tranfer dc to vector register + + lvx v0, 0, r3 ;# Coeff + lvx v1, 0, r4 ;# dqcoeff + + li r10, 16 + + vspltisw v3, 0 + + vsubshs v0, v0, v1 + + vmsumshm v2, v0, v0, v3 ;# multiply differences + + lvx v0, r10, r3 ;# Coeff + lvx v1, r10, r4 ;# dqcoeff + + vsubshs v0, v0, v1 + + vmsumshm v1, v0, v0, v2 ;# multiply differences + vsumsws v1, v1, v3 ;# sum up + + stvx v1, 0, r1 + lwz r3, 12(r1) ;# return value + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp8/encoder/ppc/sad_altivec.asm b/vp8/encoder/ppc/sad_altivec.asm new file mode 100644 index 000000000..1102ccf17 --- /dev/null +++ b/vp8/encoder/ppc/sad_altivec.asm @@ -0,0 +1,276 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl vp8_sad16x16_ppc + .globl vp8_sad16x8_ppc + .globl vp8_sad8x16_ppc + .globl vp8_sad8x8_ppc + .globl vp8_sad4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v8, 0 ;# zero out total to start +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro SAD_16 + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v8, v6, v8 +.endm + +.macro sad_16_loop loop_label + lvsl v3, 0, r5 ;# only needs to be done once per block + + ;# preload a line of data before getting into the loop + lvx v4, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + add r5, r5, r6 + add r3, r3, r4 + + vperm v5, v1, v2, v3 + + .align 4 +\loop_label: + ;# compute difference on first row + vsububs v6, v4, v5 + vsububs v7, v5, v4 + + ;# load up next set of data + lvx v9, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + ;# perform abs() of difference + vor v6, v6, v7 + add r3, r3, r4 + + ;# add to the running tally + vsum4ubs v8, v6, v8 + + ;# now onto the next line + vperm v5, v1, v2, v3 + add r5, r5, r6 + lvx v4, 0, r3 + + ;# compute difference on second row + vsububs v6, v9, v5 + lvx v1, 0, r5 + vsububs v7, v5, v9 + lvx v2, r10, r5 + vor v6, v6, v7 + add r3, r3, r4 + vsum4ubs v8, v6, v8 + vperm v5, v1, v2, v3 + add r5, r5, r6 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + +.macro sad_8_loop loop_label + .align 4 +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. 
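+    ;# The next two rows are fetched and merged into the first pair with vmrghb
+    ;# below, so a single SAD_16 pass accumulates sixteen absolute differences
+    ;# and each trip around the loop covers two rows of the 8-pixel-wide block.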
+ load_aligned_16 v6, r3, r10 + load_aligned_16 v7, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v7 + + SAD_16 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_16_loop sad16x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_16_loop sad16x8_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_8_loop sad8x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_8_loop sad8x8_loop + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r7, 0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r7, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + vspltisw v8, 0 ;# zero out total to start + + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v7, v6, v8 + vsumsws v7, v7, v8 + + stvx v7, 0, r1 + lwz r3, 12(r1) + + epilogue + + blr diff --git a/vp8/encoder/ppc/variance_altivec.asm b/vp8/encoder/ppc/variance_altivec.asm new file mode 100644 index 000000000..952bf7286 --- /dev/null +++ b/vp8/encoder/ppc/variance_altivec.asm @@ -0,0 +1,374 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl vp8_get8x8var_ppc + .globl vp8_get16x16var_ppc + .globl vp8_mse16x16_ppc + .globl vp8_variance16x16_ppc + .globl vp8_variance16x8_ppc + .globl vp8_variance8x16_ppc + .globl vp8_variance8x8_ppc + .globl vp8_variance4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v7, 0 ;# zero for merging + vspltisw v8, 0 ;# zero out total to start + vspltisw v9, 0 ;# zero out total for dif^2 +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro compute_sum_sse + ;# Compute sum first. Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. 
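+    ;# vmrghb/vmrglb against the zero vector v7 widen the unsigned pixels to
+    ;# halfwords so the signed halfword subtract can be used; vsum4shs collects
+    ;# the signed sum in v8, while the absolute difference (vsububs both ways,
+    ;# then vor) squared via vmsumubm builds the SSE in v9.  The variance_16 and
+    ;# variance_8 macros then return sse - ((sum*sum) >> DS), DS being log2 of
+    ;# the block's pixel count.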
+ vmrghb v2, v7, v4 + vmrghb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + vmrglb v2, v7, v4 + vmrglb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + ;# Now compute sse. + vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 +.endm + +.macro variance_16 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srawi r3, r3, \DS ;# (sum*sum) >> DS + subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) +.endm + +.macro variance_8 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. + load_aligned_16 v6, r3, r10 + load_aligned_16 v0, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v0 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srawi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get8x8var_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, get8x8var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get16x16var_ppc: + + prologue + + mtctr r10 + + variance_16 8, get16x16var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r 3 return value +vp8_mse16x16_ppc: + prologue + + mtctr r10 + +mse16x16_loop: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# Now compute sse. 
+ vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 + + bdnz mse16x16_loop + + vsumsws v9, v9, v7 + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stw r3, 0(r7) ;# sse + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x16_ppc: + + prologue + + mtctr r10 + + variance_16 8, variance16x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x8_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_16 7, variance16x8_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_8 7, variance8x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, variance8x8_loop, 0 + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r10,0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r10, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + compute_sum_sse + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srawi r3, r3, 4 ;# (sum*sum) >> 4 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) + + epilogue + + blr diff --git a/vp8/encoder/ppc/variance_subpixel_altivec.asm b/vp8/encoder/ppc/variance_subpixel_altivec.asm new file mode 100644 index 000000000..148a8d25b --- /dev/null +++ b/vp8/encoder/ppc/variance_subpixel_altivec.asm @@ -0,0 +1,864 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + .globl vp8_sub_pixel_variance4x4_ppc + .globl vp8_sub_pixel_variance8x8_ppc + .globl vp8_sub_pixel_variance8x16_ppc + .globl vp8_sub_pixel_variance16x8_ppc + .globl vp8_sub_pixel_variance16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r12, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. 
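+    ;# The x offset in r5 is scaled by 16 because each hfilter_b entry is one
+    ;# 16-byte vector of taps, and the "beq \jump_label" below skips the whole
+    ;# first pass when that offset is zero.  The y offset in r6 is scaled by 32
+    ;# further down (vfilter_b entries hold two tap vectors, see load_vfilter)
+    ;# and tested the same way before the second pass.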
+ li r10, 16 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r12, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r12, r0 + + ;# index to the next set of vectors in the row. + li r12, 32 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# + +.macro hfilter_8 V, hp, lp, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + +.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 + ;# Compute sum first. Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. + vmrghb \t1, \z0, \src + vmrghb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + vmrglb \t1, \z0, \src + vmrglb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + ;# Now compute sse. + vsububs \t1, \src, \ref + vsububs \t2, \ref, \src + vor \t1, \t1, \t2 + + vmsumubm \sse, \t1, \t1, \sse +.endm + +.macro variance_final sum, sse, z0, DS + vsumsws \sum, \sum, \z0 + vsumsws \sse, \sse, \z0 + + stvx \sum, 0, r1 + lwz r3, 12(r1) + + stvx \sse, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r9) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srawi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + +.macro compute_sum_sse_16 V, increment_counter + load_and_align_16 v16, r7, r8, \increment_counter + compute_sum_sse \V, v16, v18, v19, v20, v21, v23 +.endm + +.macro load_and_align_16 V, R, P, increment_counter + lvsl v17, 0, \R ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, \R + lvx v22, r10, \R + +.if \increment_counter + add \R, \R, \P +.endif + + vperm \V, v21, v22, v17 +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_4x4_b + + hfilter_8 v4, v10, v11, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 0 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +compute_sum_sse_4x4_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_c v10, b_hilo_b, 0, r12, r0 + + vperm v0, v0, v1, v10 + vperm v1, v2, v3, v10 + + compute_sum_sse v0, v1, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 4 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + hfilter_8 v4, v10, v11, 1 + hfilter_8 v5, v10, v11, 1 + hfilter_8 v6, v10, v11, 1 + hfilter_8 v7, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x8_b + + hfilter_8 v8, v10, v11, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 0 + + beq compute_sum_sse_8x8_b + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_8x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 0 + + vmrghb v4, v4, v5 + vmrghb v5, v6, v7 + vmrghb v6, v8, v9 + vmrghb v7, v10, v11 + + compute_sum_sse v0, v4, v18, v19, v20, v21, v23 + compute_sum_sse v1, v5, v18, v19, v20, v21, v23 + compute_sum_sse v2, v6, v18, v19, v20, v21, v23 + compute_sum_sse v3, v7, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 6 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfffc + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x16_pre_copy_b + + ;# Load up permutation constants + load_c v29, b_0123_b, 0, r12, r0 + load_c v30, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v29, v30, 1 + hfilter_8 v1, v29, v30, 1 + hfilter_8 v2, v29, v30, 1 + hfilter_8 v3, v29, v30, 1 + hfilter_8 v4, v29, v30, 1 + hfilter_8 v5, v29, v30, 1 + hfilter_8 v6, v29, v30, 1 + hfilter_8 v7, v29, v30, 1 + hfilter_8 v8, v29, v30, 1 + hfilter_8 v9, v29, v30, 1 + hfilter_8 v10, v29, v30, 1 + hfilter_8 v11, v29, v30, 1 + hfilter_8 v12, v29, v30, 1 + hfilter_8 v13, v29, v30, 1 + hfilter_8 v14, v29, v30, 1 + hfilter_8 v15, v29, v30, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x16_b + + hfilter_8 v16, v29, v30, 0 + + b second_pass_8x16_b + +second_pass_8x16_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_8x16_b + +second_pass_8x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_8x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + vmrghb v4, v8, v9 + vmrghb v5, v10, v11 + vmrghb v6, v12, v13 + vmrghb v7, v14, v15 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 1 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v0, v8, v18, v19, v20, v21, v23 + compute_sum_sse v1, v9, v18, v19, v20, v21, v23 + compute_sum_sse v2, v10, v18, v19, v20, v21, v23 + compute_sum_sse v3, v11, v18, v19, v20, v21, v23 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 0 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v4, v8, v18, v19, v20, v21, v23 + compute_sum_sse v5, v9, v18, v19, v20, v21, v23 + compute_sum_sse v6, v10, v18, v19, v20, v21, v23 + compute_sum_sse v7, v11, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
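+ ;# In effect each output pixel produced below is the first-pass bilinear
+ ;# value (pix[x] * t0 + pix[x + 1] * t1 + 64) >> 7, where the tap pair
+ ;# (t0, t1) sums to 128 and is selected from hfilter_b by xoffset (r5).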
+ lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x8_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x8_b + + hfilter_16 v8, 0 + + b second_pass_16x8_b + +second_pass_16x8_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + + beq compute_sum_sse_16x8_b + +second_pass_16x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_16x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 0 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 
v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_16x16_b + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_16x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 1 + compute_sum_sse_16 v8, 1 + compute_sum_sse_16 v9, 1 + compute_sum_sse_16 v10, 1 + compute_sum_sse_16 v11, 1 + compute_sum_sse_16 v12, 1 + compute_sum_sse_16 v13, 1 + compute_sum_sse_16 v14, 1 + compute_sum_sse_16 v15, 0 + + variance_final v18, v19, v23, 8 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, 48, 48, 48 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + + .align 4 +b_hperm_b: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +b_0123_b: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +b_4567_b: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +b_hilo_b: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/encoder/preproc.c b/vp8/encoder/preproc.c new file mode 100644 index 000000000..d2a13dced --- /dev/null +++ b/vp8/encoder/preproc.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : preproc.c +* +* Description : Simple pre-processor. +* +****************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ + +#include "memory.h" +#include "preproc7.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* Macros +****************************************************************************/ +#define FRAMECOUNT 7 +#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) + +/**************************************************************************** +* Imports +****************************************************************************/ +extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); + +/**************************************************************************** +* Exported Global Variables +****************************************************************************/ +void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); +void temp_filter_mmx +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +); +void temp_filter_wmt +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +); + +/**************************************************************************** + * + * ROUTINE : temp_filter_c + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. 
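+ *
+ *                    In effect each output pixel is a weighted average of the
+ *                    FRAMECOUNT stored samples at that position, with weights
+ *                    that fall off with squared distance from the current
+ *                    source pixel, roughly:
+ *
+ *                      diff   = src - sample;
+ *                      weight = 16 - min(16, ((diff * diff) >> strength) * 3);
+ *                      accum += weight * sample;  count += weight;
+ *                      out    = (accum + count / 2) / count;  (fixed_divide)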
+ * + ****************************************************************************/ +void temp_filter_c +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + if (ppi->frame == 0) + { + do + { + int frame = 0; + + do + { + *frameptr = s[byte]; + ++frameptr; + ++frame; + } + while (frame < FRAMECOUNT); + + d[byte] = s[byte]; + + ++byte; + } + while (byte < bytes); + } + else + { + int modifier; + int offset = (ppi->frame % FRAMECOUNT); + + do + { + int accumulator = 0; + int count = 0; + int frame = 0; + + frameptr[offset] = s[byte]; + + do + { + int pixel_value = *frameptr; + + modifier = s[byte]; + modifier -= pixel_value; + modifier *= modifier; + modifier >>= strength; + modifier *= 3; + + if (modifier > 16) + modifier = 16; + + modifier = 16 - modifier; + + accumulator += modifier * pixel_value; + + count += modifier; + + frameptr++; + + ++frame; + } + while (frame < FRAMECOUNT); + + accumulator += (count >> 1); + accumulator *= ppi->fixed_divide[count]; + accumulator >>= 16; + + d[byte] = accumulator; + + ++byte; + } + while (byte < bytes); + } + + ++ppi->frame; +} +/**************************************************************************** + * + * ROUTINE : delete_pre_proc + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Deletes a pre-processing instance. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void delete_pre_proc(pre_proc_instance *ppi) +{ + if (ppi->frame_buffer_alloc) + vpx_free(ppi->frame_buffer_alloc); + + ppi->frame_buffer_alloc = 0; + ppi->frame_buffer = 0; + + if (ppi->fixed_divide_alloc) + vpx_free(ppi->fixed_divide_alloc); + + ppi->fixed_divide_alloc = 0; + ppi->fixed_divide = 0; +} + +/**************************************************************************** + * + * ROUTINE : init_pre_proc + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * int frame_size : Number of bytes in one frame. + * + * OUTPUTS : None. + * + * RETURNS : int: 1 if successful, 0 if failed. + * + * FUNCTION : Initializes prepprocessor instance. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +int init_pre_proc7(pre_proc_instance *ppi, int frame_size) +{ + int i; + int mmx_enabled; + int xmm_enabled; + int wmt_enabled; + + vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); + + if (wmt_enabled) + temp_filter = temp_filter_wmt; + else if (mmx_enabled) + temp_filter = temp_filter_mmx; + else + temp_filter = temp_filter_c; + + + delete_pre_proc(ppi); + + ppi->frame_buffer_alloc = vpx_malloc(32 + frame_size * FRAMECOUNT * sizeof(unsigned char)); + + if (!ppi->frame_buffer_alloc) + { + delete_pre_proc(ppi); + return 0; + } + + ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc); + + ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int)); + + if (!ppi->fixed_divide_alloc) + { + delete_pre_proc(ppi); + return 0; + } + + ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc); + + for (i = 1; i < 255; i++) + ppi->fixed_divide[i] = 0x10000 / i; + + return 1; +} diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c new file mode 100644 index 000000000..0e34cecb1 --- /dev/null +++ b/vp8/encoder/psnr.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The VP8 project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_scale/yv12config.h" +#include "math.h" +#include "systemdependent.h" /* for vp8_clear_system_state() */ + +#define MAX_PSNR 60 + +double vp8_mse2psnr(double Samples, double Peak, double Mse) +{ + double psnr; + + if ((double)Mse > 0.0) + psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + else + psnr = MAX_PSNR; // Limit to prevent / 0 + + if (psnr > MAX_PSNR) + psnr = MAX_PSNR; + + return psnr; +} + +double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error) +{ + int i, j; + int Diff; + double frame_psnr; + double Total; + double grand_total; + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + + Total = 0.0; + grand_total = 0.0; + + // Loop throught the Y plane raw and reconstruction data summing (square differences) + for (i = 0; i < source->y_height; i++) + { + + for (j = 0; j < source->y_width; j++) + { + Diff = (int)(src[j]) - (int)(dst[j]); + Total += Diff * Diff; + } + + src += source->y_stride; + dst += dest->y_stride; + } + + // Work out Y PSNR + *YPsnr = vp8_mse2psnr(source->y_height * source->y_width, 255.0, Total); + grand_total += Total; + Total = 0; + + + // Loop through the U plane + src = source->u_buffer; + dst = dest->u_buffer; + + for (i = 0; i < source->uv_height; i++) + { + + for (j = 0; j < source->uv_width; j++) + { + Diff = (int)(src[j]) - (int)(dst[j]); + Total += Diff * Diff; + } + + src += source->uv_stride; + dst += dest->uv_stride; + } + + // Work out U PSNR + *UPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total); + grand_total += Total; + Total = 0; + + + // V PSNR + src = source->v_buffer; + dst = dest->v_buffer; + + for (i = 0; i < source->uv_height; i++) + { + + for (j = 0; j < source->uv_width; j++) + { + Diff = (int)(src[j]) - (int)(dst[j]); + Total += Diff * Diff; + } + + src += source->uv_stride; + dst += dest->uv_stride; + } + + // Work out UV PSNR + *VPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total); + grand_total += Total; + Total = 0; + + // Work out total PSNR + frame_psnr = vp8_mse2psnr(source->y_height * source->y_width * 3 / 2 , 255.0, grand_total); + + *sq_error = 1.0 * grand_total; + + return frame_psnr; +} diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h new file mode 100644 index 000000000..9f6ca0bbf --- /dev/null +++ b/vp8/encoder/psnr.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_PSNR_H +#define __INC_PSNR_H + +extern double vp8_mse2psnr(double Samples, double Peak, double Mse); +extern double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error); + +#endif diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c new file mode 100644 index 000000000..6028ebf56 --- /dev/null +++ b/vp8/encoder/quantize.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2010 The VP8 project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <math.h> +#include "vpx_mem/vpx_mem.h" + +#include "quantize.h" +#include "entropy.h" +#include "predictdc.h" + +void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) +{ + int i, rc, eob; + int zbin; + int x, y, z, sz; + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + + vpx_memset(qcoeff_ptr, 0, 32); + vpx_memset(dqcoeff_ptr, 0, 32); + + eob = -1; + + for (i = 0; i < 16; i++) + { + rc = vp8_default_zig_zag1d[i]; + z = coeff_ptr[rc]; + zbin = zbin_ptr[rc] ; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) + { + y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + + if (y) + { + eob = i; // last nonzero coeffs + } + } + } + + d->eob = eob + 1; + +} + +void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) +{ + int i, rc, eob; + int zbin; + int x, y, z, sz; + short *zbin_boost_ptr = &b->zrun_zbin_boost[0]; + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + short zbin_oq_value = b->zbin_extra; + + vpx_memset(qcoeff_ptr, 0, 32); + vpx_memset(dqcoeff_ptr, 0, 32); + + eob = -1; + + for (i = 0; i < 16; i++) + { + rc = vp8_default_zig_zag1d[i]; + z = coeff_ptr[rc]; + + //if ( i == 0 ) + // zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2; + //else + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + + zbin_boost_ptr ++; + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) + { + y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + + if (y) + { + eob = i; // last nonzero coeffs + zbin_boost_ptr = &b->zrun_zbin_boost[0]; // reset zero runlength + } + } + } + + d->eob = eob + 1; +} +void vp8_quantize_mby(MACROBLOCK *x) +{ + int i; + + if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV) + { + for (i = 0; i < 16; i++) + { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2); + } + + x->quantize_b(&x->block[24], &x->e_mbd.block[24]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob); + + } + else + { + for (i = 0; i < 16; i++) + { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } +} + +void vp8_quantize_mb(MACROBLOCK *x) +{ + int i; + + x->e_mbd.mbmi.mb_skip_coeff = 1; + + if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV) + { + for (i = 0; i < 16; i++) + { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2); + } + + for (i = 16; i < 25; i++) + { + x->quantize_b(&x->block[i], 
&x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } + else + { + for (i = 0; i < 24; i++) + { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } + +} + + +void vp8_quantize_mbuv(MACROBLOCK *x) +{ + int i; + + for (i = 16; i < 24; i++) + { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } +} + +// This function is not currently called +void vp8_quantize_mbrd(MACROBLOCK *x) +{ + int i; + + x->e_mbd.mbmi.mb_skip_coeff = 1; + + if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV) + { + for (i = 0; i < 16; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2); + } + + for (i = 16; i < 25; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } + else + { + for (i = 0; i < 24; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } +} + +void vp8_quantize_mbuvrd(MACROBLOCK *x) +{ + int i; + + for (i = 16; i < 24; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } +} + +void vp8_quantize_mbyrd(MACROBLOCK *x) +{ + int i; + + if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV) + { + for (i = 0; i < 16; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2); + } + + x->quantize_brd(&x->block[24], &x->e_mbd.block[24]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob); + + } + else + { + for (i = 0; i < 16; i++) + { + x->quantize_brd(&x->block[i], &x->e_mbd.block[i]); + x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob); + } + } +} diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h new file mode 100644 index 000000000..868e8e3a8 --- /dev/null +++ b/vp8/encoder/quantize.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef __INC_QUANTIZE_H +#define __INC_QUANTIZE_H + +#include "block.h" + +#define prototype_quantize_block(sym) \ + void (sym)(BLOCK *b,BLOCKD *d) + +#if ARCH_ARM +#include "arm/quantize_arm.h" +#endif + +#ifndef vp8_quantize_quantb +#define vp8_quantize_quantb vp8_regular_quantize_b +#endif +extern prototype_quantize_block(vp8_quantize_quantb); + +#ifndef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_c +#endif +extern prototype_quantize_block(vp8_quantize_fastquantb); + +typedef struct +{ + prototype_quantize_block(*quantb); + prototype_quantize_block(*fastquantb); +} vp8_quantize_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn +#else +#define QUANTIZE_INVOKE(ctx,fn) vp8_quantize_##fn +#endif + +extern void vp8_quantize_mb(MACROBLOCK *x); +extern void vp8_quantize_mbuv(MACROBLOCK *x); +extern void vp8_quantize_mby(MACROBLOCK *x); +extern void vp8_quantize_mbyrd(MACROBLOCK *x); +extern void vp8_quantize_mbuvrd(MACROBLOCK *x); +extern void vp8_quantize_mbrd(MACROBLOCK *x); + +#endif diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c new file mode 100644 index 000000000..05040d310 --- /dev/null +++ b/vp8/encoder/ratectrl.c @@ -0,0 +1,1552 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <assert.h> + +#include "math.h" +#include "common.h" +#include "ratectrl.h" +#include "entropymode.h" +#include "vpx_mem/vpx_mem.h" +#include "systemdependent.h" +#include "encodemv.h" + + +#define MIN_BPB_FACTOR 0.01 +#define MAX_BPB_FACTOR 50 + +extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; +extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES]; + + + +#ifdef MODE_STATS +extern int y_modes[5]; +extern int uv_modes[4]; +extern int b_modes[10]; + +extern int inter_y_modes[10]; +extern int inter_uv_modes[4]; +extern int inter_b_modes[10]; +#endif + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +const int vp8_bits_per_mb[2][QINDEX_RANGE] = +{ + // (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q: + { + 674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719, + 390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402, + 285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615, + 229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401, + 187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560, + 156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700, + 134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782, + 114883, 112332, 108410, 105685, 103434, 101192, 98587, 95959, + 94059, 92017, 89970, 87936, 86142, 84801, 82736, 81106, + 79668, 78135, 76641, 75103, 73943, 72693, 71401, 70098, + 69165, 67901, 67170, 65987, 64923, 63534, 62378, 61302, + 59921, 58941, 57844, 56782, 55960, 54973, 54257, 53454, + 52230, 50938, 49962, 49190, 48288, 47270, 46738, 46037, + 45020, 44027, 43216, 42287, 41594, 40702, 40081, 39414, + 38282, 37627, 36987, 36375, 35808, 35236, 34710, 34162, + 33659, 33327, 32751, 32384, 31936, 31461, 30982, 30582, + }, + + // (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB 
at each Q: + { + 497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321, + 233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116, + 154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226, + 114188, 111756, 107882, 104749, 102522, 96451, 94424, 90905, + 87286, 84931, 82111, 80534, 77610, 74700, 73037, 70715, + 68006, 67235, 65374, 64009, 62134, 60180, 59105, 57691, + 55509, 54512, 53318, 52693, 51194, 49840, 48944, 46980, + 45668, 44177, 42348, 40994, 39859, 38889, 37717, 36391, + 35482, 34622, 33795, 32756, 32002, 31492, 30573, 29737, + 29152, 28514, 27941, 27356, 26859, 26329, 25874, 25364, + 24957, 24510, 24290, 23689, 23380, 22845, 22481, 22066, + 21587, 21219, 20880, 20452, 20260, 19926, 19661, 19334, + 18915, 18391, 18046, 17833, 17441, 17105, 16888, 16729, + 16383, 16023, 15706, 15442, 15222, 14938, 14673, 14452, + 14005, 13807, 13611, 13447, 13223, 13102, 12963, 12801, + 12627, 12534, 12356, 12228, 12056, 11907, 11746, 11643, + } +}; + +const int vp8_kf_boost_qadjustment[QINDEX_RANGE] = +{ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 200, 201, 201, 202, 203, 203, 203, + 204, 204, 205, 205, 206, 206, 207, 207, + 208, 208, 209, 209, 210, 210, 211, 211, + 212, 212, 213, 213, 214, 214, 215, 215, + 216, 216, 217, 217, 218, 218, 219, 219, + 220, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 220, 220, 220, 220, +}; + +//#define GFQ_ADJUSTMENT (Q+100) +#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q] +const int vp8_gf_boost_qadjustment[QINDEX_RANGE] = +{ + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 184, 185, 185, 186, 186, 187, 187, + 188, 188, 189, 189, 190, 190, 191, 191, + 192, 192, 193, 193, 194, 194, 194, 194, + 195, 195, 196, 196, 197, 197, 198, 198 +}; + +/* +const int vp8_gf_boost_qadjustment[QINDEX_RANGE] = +{ + 100,101,102,103,104,105,105,106, + 106,107,107,108,109,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,170,171,171,172,172, + 173,173,173,174,174,174,175,175, + 175,176,176,176,177,177,177,177, + 178,178,179,179,180,180,181,181, + 182,182,183,183,184,184,185,185, + 186,186,187,187,188,188,189,189, + 190,190,191,191,192,192,193,193, +}; +*/ + +const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] = +{ + 150, 155, 160, 165, 170, 175, 180, 185, + 190, 195, 200, 205, 210, 215, 220, 225, + 230, 235, 240, 245, 250, 255, 260, 265, + 270, 275, 280, 285, 290, 295, 300, 305, + 310, 320, 330, 340, 350, 360, 370, 380, + 390, 400, 410, 420, 430, 440, 450, 460, + 470, 480, 490, 500, 510, 520, 530, 540, + 550, 560, 570, 580, 590, 600, 600, 600, + 600, 600, 
600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, +}; + +// % adjustment to target kf size based on seperation from previous frame +const int vp8_kf_boost_seperationt_adjustment[16] = +{ + 30, 40, 50, 55, 60, 65, 70, 75, + 80, 85, 90, 95, 100, 100, 100, 100, +}; + + +const int vp8_gf_adjust_table[101] = +{ + 100, + 115, 130, 145, 160, 175, 190, 200, 210, 220, 230, + 240, 260, 270, 280, 290, 300, 310, 320, 330, 340, + 350, 360, 370, 380, 390, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, +}; + +const int vp8_gf_intra_useage_adjustment[20] = +{ + 125, 120, 115, 110, 105, 100, 95, 85, 80, 75, + 70, 65, 60, 55, 50, 50, 50, 50, 50, 50, +}; + +const int vp8_gf_interval_table[101] = +{ + 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, +}; + +static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 }; + + +void vp8_save_coding_context(VP8_COMP *cpi) +{ + CODING_CONTEXT *const cc = & cpi->coding_context; + + // Stores a snapshot of key state variables which can subsequently be + // restored with a call to vp8_restore_coding_context. These functions are + // intended for use in a re-code loop in vp8_compress_frame where the + // quantizer value is adjusted between loop iterations. + + cc->frames_since_key = cpi->frames_since_key; + cc->filter_level = cpi->common.filter_level; + cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due; + cc->frames_since_golden = cpi->common.frames_since_golden; + + vp8_copy(cc->mvc, cpi->common.fc.mvc); + vp8_copy(cc->mvcosts, cpi->mb.mvcosts); + + vp8_copy(cc->kf_ymode_prob, cpi->common.kf_ymode_prob); + vp8_copy(cc->ymode_prob, cpi->common.fc.ymode_prob); + vp8_copy(cc->kf_uv_mode_prob, cpi->common.kf_uv_mode_prob); + vp8_copy(cc->uv_mode_prob, cpi->common.fc.uv_mode_prob); + + vp8_copy(cc->ymode_count, cpi->ymode_count); + vp8_copy(cc->uv_mode_count, cpi->uv_mode_count); + + + // Stats +#ifdef MODE_STATS + vp8_copy(cc->y_modes, y_modes); + vp8_copy(cc->uv_modes, uv_modes); + vp8_copy(cc->b_modes, b_modes); + vp8_copy(cc->inter_y_modes, inter_y_modes); + vp8_copy(cc->inter_uv_modes, inter_uv_modes); + vp8_copy(cc->inter_b_modes, inter_b_modes); +#endif + + cc->this_frame_percent_intra = cpi->this_frame_percent_intra; +} + + +void vp8_restore_coding_context(VP8_COMP *cpi) +{ + CODING_CONTEXT *const cc = & cpi->coding_context; + + // Restore key state variables to the snapshot state stored in the + // previous call to vp8_save_coding_context. 
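+ // Every field captured by vp8_save_coding_context is copied back below;
+ // the two functions are intended to remain mirror images of each other.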
+ + cpi->frames_since_key = cc->frames_since_key; + cpi->common.filter_level = cc->filter_level; + cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due; + cpi->common.frames_since_golden = cc->frames_since_golden; + + vp8_copy(cpi->common.fc.mvc, cc->mvc); + + vp8_copy(cpi->mb.mvcosts, cc->mvcosts); + + vp8_copy(cpi->common.kf_ymode_prob, cc->kf_ymode_prob); + vp8_copy(cpi->common.fc.ymode_prob, cc->ymode_prob); + vp8_copy(cpi->common.kf_uv_mode_prob, cc->kf_uv_mode_prob); + vp8_copy(cpi->common.fc.uv_mode_prob, cc->uv_mode_prob); + + vp8_copy(cpi->ymode_count, cc->ymode_count); + vp8_copy(cpi->uv_mode_count, cc->uv_mode_count); + + // Stats +#ifdef MODE_STATS + vp8_copy(y_modes, cc->y_modes); + vp8_copy(uv_modes, cc->uv_modes); + vp8_copy(b_modes, cc->b_modes); + vp8_copy(inter_y_modes, cc->inter_y_modes); + vp8_copy(inter_uv_modes, cc->inter_uv_modes); + vp8_copy(inter_b_modes, cc->inter_b_modes); +#endif + + + cpi->this_frame_percent_intra = cc->this_frame_percent_intra; +} + + +void vp8_setup_key_frame(VP8_COMP *cpi) +{ + // Setup for Key frame: + + vp8_default_coef_probs(& cpi->common); + vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob); + + vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context)); + { + int flag[2] = {1, 1}; + vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag); + } + + vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc)); //initialize pre_mvc to all zero. + + //cpi->common.filter_level = 0; // Reset every key frame. + cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ; + + // Provisional interval before next GF + if (cpi->auto_gold) + //cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL; + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + else + cpi->frames_till_gf_update_due = cpi->goldfreq; + + cpi->common.refresh_golden_frame = TRUE; +} + +void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi) +{ + // boost defaults to half second + int kf_boost; + + // Clear down mmx registers to allow floating point in what follows + vp8_clear_system_state(); //__asm emms; + + if (cpi->oxcf.fixed_q >= 0) + { + vp8_calc_iframe_target_size(cpi); + return; + } + + if (cpi->pass == 2) + { + cpi->this_frame_target = cpi->per_frame_bandwidth; // New Two pass RC + } + else + { + // Boost depends somewhat on frame rate + kf_boost = (int)(2 * cpi->output_frame_rate - 16); + + // adjustment up based on q + kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100; + + // frame separation adjustment ( down) + if (cpi->frames_since_key < cpi->output_frame_rate / 2) + kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2)); + + if (kf_boost < 16) + kf_boost = 16; + + // Reset the active worst quality to the baseline value for key frames. + cpi->active_worst_quality = cpi->worst_quality; + + cpi->this_frame_target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + } + + + // Should the next frame be an altref frame + if (cpi->pass != 2) + { + // For now Alt ref is not allowed except in 2 pass modes. 
+ cpi->source_alt_ref_pending = FALSE; + + /*if ( cpi->oxcf.fixed_q == -1) + { + if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) ) + cpi->source_alt_ref_pending = TRUE; + else + cpi->source_alt_ref_pending = FALSE; + }*/ + } + + if (0) + { + FILE *f; + + f = fopen("kf_boost.stt", "a"); + //fprintf(f, " %8d %10d %10d %10d %10d %10d %10d\n", + // cpi->common.current_video_frame, cpi->target_bandwidth, cpi->frames_to_key, kf_boost_qadjustment[cpi->ni_av_qi], cpi->kf_boost, (cpi->this_frame_target *100 / cpi->per_frame_bandwidth), cpi->this_frame_target ); + + fprintf(f, " %8u %10d %10d %10d\n", + cpi->common.current_video_frame, cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending); + + fclose(f); + } +} + +// Do the best we can to define the parameteres for the next GF based on what information we have available. +static void calc_gf_params(VP8_COMP *cpi) +{ + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int Boost = 0; + + int gf_frame_useage = 0; // Golden frame useage since last GF + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; + + int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols); + + // Reset the last boost indicator + //cpi->last_boost = 100; + + if (tot_mbs) + gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs; + + if (pct_gf_active > gf_frame_useage) + gf_frame_useage = pct_gf_active; + + // Not two pass + if (cpi->pass != 2) + { + // Single Pass lagged mode: TBD + if (FALSE) + { + } + + // Single Pass compression: Has to use current and historical data + else + { +#if 0 + // Experimental code + int index = cpi->one_pass_frame_index; + int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS; + + /* + // *************** Experimental code - incomplete + double decay_val = 1.0; + double IIAccumulator = 0.0; + double last_iiaccumulator = 0.0; + double IIRatio; + + cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS; + + for ( i = 0; i < (frames_to_scan - 1); i++ ) + { + if ( index < 0 ) + index = MAX_LAG_BUFFERS; + index --; + + if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 ) + { + IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error; + + if ( IIRatio > 30.0 ) + IIRatio = 30.0; + } + else + IIRatio = 30.0; + + IIAccumulator += IIRatio * decay_val; + + decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter; + + if ( (i > MIN_GF_INTERVAL) && + ((IIAccumulator - last_iiaccumulator) < 2.0) ) + { + break; + } + last_iiaccumulator = IIAccumulator; + } + + Boost = IIAccumulator*100.0/16.0; + cpi->baseline_gf_interval = i; + + */ +#else + + /*************************************************************/ + // OLD code + + // Adjust boost based upon ambient Q + Boost = GFQ_ADJUSTMENT; + + // Adjust based upon most recently measure intra useage + Boost = Boost * vp8_gf_intra_useage_adjustment[(cpi->this_frame_percent_intra < 15) ? 
cpi->this_frame_percent_intra : 14] / 100; + + // Adjust gf boost based upon GF usage since last GF + Boost = Boost * vp8_gf_adjust_table[gf_frame_useage] / 100; +#endif + } + + // golden frame boost without recode loop often goes awry. be safe by keeping numbers down. + if (!cpi->sf.recode_loop) + { + if (cpi->compressor_speed == 2) + Boost = Boost / 2; + } + + // Apply an upper limit based on Q for 1 pass encodes + if (Boost > vp8_kf_gf_boost_qlimits[Q] && (cpi->pass == 0)) + Boost = vp8_kf_gf_boost_qlimits[Q]; + + // Apply lower limits to boost. + else if (Boost < 110) + Boost = 110; + + // Note the boost used + cpi->last_boost = Boost; + + } + + // Estimate next interval + // This is updated once the real frame size/boost is known. + if (cpi->oxcf.fixed_q == -1) + { + if (cpi->pass == 2) // 2 Pass + { + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } + else // 1 Pass + { + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + + if (cpi->last_boost > 750) + cpi->frames_till_gf_update_due++; + + if (cpi->last_boost > 1000) + cpi->frames_till_gf_update_due++; + + if (cpi->last_boost > 1250) + cpi->frames_till_gf_update_due++; + + if (cpi->last_boost >= 1500) + cpi->frames_till_gf_update_due ++; + + if (vp8_gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) + cpi->frames_till_gf_update_due = vp8_gf_interval_table[gf_frame_useage]; + + if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) + cpi->frames_till_gf_update_due = cpi->max_gf_interval; + } + } + else + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + + // ARF on or off + if (cpi->pass != 2) + { + // For now Alt ref is not allowed except in 2 pass modes. + cpi->source_alt_ref_pending = FALSE; + + /*if ( cpi->oxcf.fixed_q == -1) + { + if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) ) + cpi->source_alt_ref_pending = TRUE; + else + cpi->source_alt_ref_pending = FALSE; + }*/ + } +} +/* This is equvialent to estimate_bits_at_q without the rate_correction_factor. */ +static int baseline_bits_at_q(int frame_kind, int Q, int MBs) +{ + int Bpm = vp8_bits_per_mb[frame_kind][Q]; + + /* Attempt to retain reasonable accuracy without overflow. The cutoff is + * chosen such that the maximum product of Bpm and MBs fits 31 bits. The + * largest Bpm takes 20 bits. + */ + if (MBs > (1 << 11)) + return (Bpm >> BPER_MB_NORMBITS) * MBs; + else + return (Bpm * MBs) >> BPER_MB_NORMBITS; +} + +void vp8_calc_iframe_target_size(VP8_COMP *cpi) +{ + int Q; + int Boost = 100; + + Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex; + + if (cpi->auto_adjust_key_quantizer == 1) + { + // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key-frames. + // The enhanced Q is calculated so as to boost the key frame size by a factor + // specified in kf_boost_qadjustment. Also, can adjust based on distance + // between key frames. 
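+ // For example, at Q == 0 with 8 frames since the last key frame the
+ // tables above give Boost = 128 * 80 / 100 = 102, which the limits
+ // below then raise to the minimum of 120.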
+ + // Adjust boost based upon ambient Q + Boost = vp8_kf_boost_qadjustment[Q]; + + // Make the Key frame boost less if the seperation from the previous key frame is small + if (cpi->frames_since_key < 16) + Boost = Boost * vp8_kf_boost_seperationt_adjustment[cpi->frames_since_key] / 100; + else + Boost = Boost * vp8_kf_boost_seperationt_adjustment[15] / 100; + + // Apply limits on boost + if (Boost > vp8_kf_gf_boost_qlimits[Q]) + Boost = vp8_kf_gf_boost_qlimits[Q]; + else if (Boost < 120) + Boost = 120; + } + + // Keep a record of the boost that was used + cpi->last_boost = Boost; + + // Should the next frame be an altref frame + if (cpi->pass != 2) + { + // For now Alt ref is not allowed except in 2 pass modes. + cpi->source_alt_ref_pending = FALSE; + + /*if ( cpi->oxcf.fixed_q == -1) + { + if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) ) + cpi->source_alt_ref_pending = TRUE; + else + cpi->source_alt_ref_pending = FALSE; + }*/ + } + + if (cpi->oxcf.fixed_q >= 0) + { + cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100; + } + else + { + + int bits_per_mb_at_this_q ; + + if (cpi->oxcf.error_resilient_mode == 1) + { + cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth; + return; + } + + // Rate targetted scenario: + // Be careful of 32-bit OVERFLOW if restructuring the caluclation of cpi->this_frame_target + bits_per_mb_at_this_q = (int)(.5 + + cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]); + + cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100; + + // Reset the active worst quality to the baseline value for key frames. + if (cpi->pass < 2) + cpi->active_worst_quality = cpi->worst_quality; + } +} + + + +void vp8_calc_pframe_target_size(VP8_COMP *cpi) +{ + int min_frame_target; + int Adjustment; + + // Set the min frame bandwidth. + //min_frame_target = estimate_min_frame_size( cpi ); + min_frame_target = 0; + + if (cpi->pass == 2) + { + min_frame_target = cpi->min_frame_bandwidth; + + if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5)) + min_frame_target = cpi->av_per_frame_bandwidth >> 5; + } + else if (min_frame_target < cpi->per_frame_bandwidth / 4) + min_frame_target = cpi->per_frame_bandwidth / 4; + + + // Special alt reference frame case + if (cpi->common.refresh_alt_ref_frame) + { + if (cpi->pass == 2) + { + cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for the alt ref frame + cpi->this_frame_target = cpi->per_frame_bandwidth; + } + + /* One Pass ??? TBD */ + /*else + { + int frames_in_section; + int allocation_chunks; + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int alt_boost; + int max_arf_rate; + + alt_boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); + alt_boost += (cpi->frames_till_gf_update_due * 50); + + // If alt ref is not currently active then we have a pottential double hit with GF and ARF so reduce the boost a bit. + // A similar thing is done on GFs that preceed a arf update. 
+ if ( !cpi->source_alt_ref_active ) + alt_boost = alt_boost * 3 / 4; + + frames_in_section = cpi->frames_till_gf_update_due+1; // Standard frames + GF + allocation_chunks = (frames_in_section * 100) + alt_boost; + + // Normalize Altboost and allocations chunck down to prevent overflow + while ( alt_boost > 1000 ) + { + alt_boost /= 2; + allocation_chunks /= 2; + } + + else + { + int bits_in_section; + + if ( cpi->kf_overspend_bits > 0 ) + { + Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits; + + if ( Adjustment > (cpi->per_frame_bandwidth - min_frame_target) ) + Adjustment = (cpi->per_frame_bandwidth - min_frame_target); + + cpi->kf_overspend_bits -= Adjustment; + + // Calculate an inter frame bandwidth target for the next few frames designed to recover + // any extra bits spent on the key frame. + cpi->inter_frame_target = cpi->per_frame_bandwidth - Adjustment; + if ( cpi->inter_frame_target < min_frame_target ) + cpi->inter_frame_target = min_frame_target; + } + else + cpi->inter_frame_target = cpi->per_frame_bandwidth; + + bits_in_section = cpi->inter_frame_target * frames_in_section; + + // Avoid loss of precision but avoid overflow + if ( (bits_in_section>>7) > allocation_chunks ) + cpi->this_frame_target = alt_boost * (bits_in_section / allocation_chunks); + else + cpi->this_frame_target = (alt_boost * bits_in_section) / allocation_chunks; + } + } + */ + } + + // Normal frames (gf,and inter) + else + { + // 2 pass + if (cpi->pass == 2) + { + cpi->this_frame_target = cpi->per_frame_bandwidth; + } + // 1 pass + else + { + // Make rate adjustment to recover bits spent in key frame + // Test to see if the key frame inter data rate correction should still be in force + if (cpi->kf_overspend_bits > 0) + { + Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits; + + if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target)) + Adjustment = (cpi->per_frame_bandwidth - min_frame_target); + + cpi->kf_overspend_bits -= Adjustment; + + // Calculate an inter frame bandwidth target for the next few frames designed to recover + // any extra bits spent on the key frame. + cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment; + + if (cpi->this_frame_target < min_frame_target) + cpi->this_frame_target = min_frame_target; + } + else + cpi->this_frame_target = cpi->per_frame_bandwidth; + + // If appropriate make an adjustment to recover bits spent on a recent GF + if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target)) + { + int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? 
cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits; + + if (Adjustment > (cpi->this_frame_target - min_frame_target)) + Adjustment = (cpi->this_frame_target - min_frame_target); + + cpi->gf_overspend_bits -= Adjustment; + cpi->this_frame_target -= Adjustment; + } + + // Apply small + and - boosts for non gf frames + if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) && + (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1))) + { + // % Adjustment limited to the range 1% to 10% + Adjustment = (cpi->last_boost - 100) >> 5; + + if (Adjustment < 1) + Adjustment = 1; + else if (Adjustment > 10) + Adjustment = 10; + + // Convert to bits + Adjustment = (cpi->this_frame_target * Adjustment) / 100; + + if (Adjustment > (cpi->this_frame_target - min_frame_target)) + Adjustment = (cpi->this_frame_target - min_frame_target); + + if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1)) + cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment); + else + cpi->this_frame_target -= Adjustment; + } + } + } + + // Set a reduced data rate target for our initial Q calculation. + // This should help to save bits during earier sections. + if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100)) + cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100; + + // Sanity check that the total sum of adjustments is not above the maximum allowed + // That is that having allowed for KF and GF penalties we have not pushed the + // current interframe target to low. If the adjustment we apply here is not capable of recovering + // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over + // a longer time span via other buffer / rate control mechanisms. + if (cpi->this_frame_target < min_frame_target) + cpi->this_frame_target = min_frame_target; + + if (!cpi->common.refresh_alt_ref_frame) + // Note the baseline target data rate for this inter frame. + cpi->inter_frame_target = cpi->this_frame_target; + + // One Pass specific code + if (cpi->pass == 0) + { + // Adapt target frame size with respect to any buffering constraints: + if (cpi->buffered_mode) + { + int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100; + + if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) + { + int percent_low = 0; + + // Decide whether or not we need to adjust the frame data rate target. + // + // If we are are below the optimal buffer fullness level and adherence + // to buffering contraints is important to the end useage then adjust + // the per frame target. + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) + { + percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits; + + if (percent_low > 100) + percent_low = 100; + else if (percent_low < 0) + percent_low = 0; + } + // Are we overshooting the long term clip data rate... + else if (cpi->bits_off_target < 0) + { + // Adjust per frame data target downwards to compensate. + percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); + + if (percent_low > 100) + percent_low = 100; + else if (percent_low < 0) + percent_low = 0; + } + + // lower the target bandwidth for this frame. + cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; + + // Are we using allowing control of active_worst_allowed_q according to buffer level. 
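+ // The adjustment below interpolates active_worst_quality linearly
+ // between cpi->ni_av_qi (critical level at the optimal buffer level)
+ // and cpi->worst_quality (critical level at a quarter of optimal);
+ // e.g. a critical level halfway along that range puts the active
+ // worst quality halfway between the two limits.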
+ if (cpi->auto_worst_q) + { + int critical_buffer_level; + + // For streaming applications the most important factor is cpi->buffer_level as this takes + // into account the specified short term buffering constraints. However, hitting the long + // term clip data rate target is also important. + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + // Take the smaller of cpi->buffer_level and cpi->bits_off_target + critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target; + } + // For local file playback short term buffering contraints are less of an issue + else + { + // Consider only how we are doing for the clip as a whole + critical_buffer_level = cpi->bits_off_target; + } + + // Set the active worst quality based upon the selected buffer fullness number. + if (critical_buffer_level < cpi->oxcf.optimal_buffer_level) + { + if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4)) + { + int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi; + int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4)); + + // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level) + // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4) + cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4)); + } + else + { + cpi->active_worst_quality = cpi->worst_quality; + } + } + else + { + cpi->active_worst_quality = cpi->ni_av_qi; + } + } + else + { + cpi->active_worst_quality = cpi->worst_quality; + } + } + else + { + int percent_high; + + if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) + { + percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8)); + + if (percent_high > 100) + percent_high = 100; + else if (percent_high < 0) + percent_high = 0; + + cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100; + + } + + // Are we allowing control of active_worst_allowed_q according to bufferl level. + if (cpi->auto_worst_q) + { + // When using the relaxed buffer model stick to the user specified value + cpi->active_worst_quality = cpi->ni_av_qi; + } + else + { + cpi->active_worst_quality = cpi->worst_quality; + } + } + + // Set active_best_quality to prevent quality rising too high + cpi->active_best_quality = cpi->best_quality; + + // Worst quality obviously must not be better than best quality + if (cpi->active_worst_quality <= cpi->active_best_quality) + cpi->active_worst_quality = cpi->active_best_quality + 1; + + } + // Unbuffered mode (eg. video conferencing) + else + { + // Set the active worst quality + cpi->active_worst_quality = cpi->worst_quality; + } + } + + // Test to see if we have to drop a frame + // The auto-drop frame code is only used in buffered mode. + // In unbufferd mode (eg vide conferencing) the descision to + // code or drop a frame is made outside the codec in response to real + // world comms or buffer considerations. 
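In rough terms, the drop decision below fires only in buffered CBR mode and never on a key frame; once the short-term buffer goes negative the frame is skipped and its bandwidth allocation is credited back so the buffer can refill before the next coded frame. A small self-contained model of just the underrun check and buffer update (the struct and field names here are hypothetical stand-ins for the cpi state, and the CBR/key-frame gating is left out):

#include <stdio.h>

/* Hypothetical, trimmed-down model of the buffered-mode frame-drop check. */
struct rc_state
{
    long long buffer_level;            /* short-term buffer fullness, in bits */
    long long bits_off_target;         /* long-term clip-rate error, in bits  */
    int       av_per_frame_bandwidth;  /* average bits available per frame    */
};

/* Returns 1 if the frame should be dropped; on a drop the per-frame budget is
 * credited back so the buffer can recover. */
static int maybe_drop_frame(struct rc_state *rc)
{
    if (rc->buffer_level < 0)
    {
        rc->bits_off_target += rc->av_per_frame_bandwidth;
        rc->buffer_level = rc->bits_off_target;
        return 1;
    }

    return 0;
}

int main(void)
{
    struct rc_state rc = { -20000, -20000, 50000 };

    printf("drop=%d buffer=%lld\n", maybe_drop_frame(&rc), rc.buffer_level);
    /* prints: drop=1 buffer=30000 */
    return 0;
}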
+ if (cpi->drop_frames_allowed && cpi->buffered_mode && + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + ((cpi->common.frame_type != KEY_FRAME))) //|| !cpi->oxcf.allow_spatial_resampling) ) + { + // Check for a buffer underun-crisis in which case we have to drop a frame + if ((cpi->buffer_level < 0)) + { +#if 0 + FILE *f = fopen("dec.stt", "a"); + fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n", + (int) cpi->common.current_video_frame, + cpi->decimation_factor, cpi->common.horiz_scale, + (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level); + fclose(f); +#endif + //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth); + + cpi->drop_frame = TRUE; + } + +#if 0 + // Check for other drop frame crtieria (Note 2 pass cbr uses decimation on whole KF sections) + else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) && + (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0)) + { + cpi->drop_frame = TRUE; + } + +#endif + + if (cpi->drop_frame) + { + // Update the buffer level variable. + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + cpi->buffer_level = cpi->bits_off_target; + } + else + cpi->drop_count = 0; + } + + // Adjust target frame size for Golden Frames: + if (cpi->oxcf.error_resilient_mode == 0 && + (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) + { + //int Boost = 0; + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + + int gf_frame_useage = 0; // Golden frame useage since last GF + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; + + int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols); + + // Reset the last boost indicator + //cpi->last_boost = 100; + + if (tot_mbs) + gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs; + + if (pct_gf_active > gf_frame_useage) + gf_frame_useage = pct_gf_active; + + // Is a fixed manual GF frequency being used + if (!cpi->auto_gold) + cpi->common.refresh_golden_frame = TRUE; + else + { + // For one pass throw a GF if recent frame intra useage is low or the GF useage is high + if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) + cpi->common.refresh_golden_frame = TRUE; + + // Two pass GF descision + else if (cpi->pass == 2) + cpi->common.refresh_golden_frame = TRUE; + } + +#if 0 + + // Debug stats + if (0) + { + FILE *f; + + f = fopen("gf_useaget.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + fclose(f); + } + +#endif + + if (cpi->common.refresh_golden_frame == TRUE) + { + int isize_adjustment = 0; +#if 0 + + if (0) // p_gw + { + FILE *f; + + f = fopen("GFexit.stt", "a"); + fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame); + fclose(f); + } + +#endif + cpi->initial_gf_use = 0; + + if (cpi->auto_adjust_gold_quantizer) + { + calc_gf_params(cpi); + } + + // If we are using alternate ref instead of gf then do not apply the boost + // It will instead be applied to the altref update + // Jims modified boost + if (!cpi->source_alt_ref_active) + { + if (cpi->oxcf.fixed_q < 0) + { + if (cpi->pass == 2) + { + cpi->this_frame_target = cpi->per_frame_bandwidth; // The spend on 
the GF is defined in the two pass code for two pass encodes + } + else + { + int Boost = cpi->last_boost; + int frames_in_section = cpi->frames_till_gf_update_due + 1; + int allocation_chunks = (frames_in_section * 100) + (Boost - 100); + int bits_in_section = cpi->inter_frame_target * frames_in_section; + + // Normalize Altboost and allocations chunck down to prevent overflow + while (Boost > 1000) + { + Boost /= 2; + allocation_chunks /= 2; + } + + // Avoid loss of precision but avoid overflow + if ((bits_in_section >> 7) > allocation_chunks) + cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks); + else + cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks; + } + } + else + cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100; + + } + // If there is an active ARF at this location use the minimum bits on this frame + else + { + cpi->this_frame_target = 0; // Minimial spend on gf that is replacing an arf + } + + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + + } + } +} + + +void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) +{ + int Q = cpi->common.base_qindex; + int correction_factor = 100; + double rate_correction_factor; + double adjustment_limit; + + int projected_size_based_on_q = 0; + + // Clear down mmx registers to allow floating point in what follows + vp8_clear_system_state(); //__asm emms; + + if (cpi->common.frame_type == KEY_FRAME) + { + rate_correction_factor = cpi->key_frame_rate_correction_factor; + } + else + { + if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + rate_correction_factor = cpi->gf_rate_correction_factor; + else + rate_correction_factor = cpi->rate_correction_factor; + } + + // Work out how big we would have expected the frame to be at this Q given the current correction factor. + // Stay in double to avoid int overflow when values are large + //projected_size_based_on_q = ((int)(.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) >> BPER_MB_NORMBITS; + projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); + + // Make some allowance for cpi->zbin_over_quant + if (cpi->zbin_over_quant > 0) + { + int Z = cpi->zbin_over_quant; + double Factor = 0.99; + double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX; + + while (Z > 0) + { + Z --; + projected_size_based_on_q *= (int)Factor; + Factor += factor_adjustment; + + if (Factor >= 0.999) + Factor = 0.999; + } + } + + // Work out a size correction factor. 
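The remainder of the function compares the size the frame actually produced against that projection and nudges the running correction factor toward the observed ratio, with a damping limit chosen by how much the encoder has been oscillating around the target. A compact stand-alone approximation of that update (the MIN/MAX clamps are passed in here instead of the MIN_BPB_FACTOR/MAX_BPB_FACTOR constants, and the original's integer rounding is simplified to floating point):

/* Hypothetical stand-alone version of the damped correction-factor update.
 * projected_size is what the model predicted for this Q; actual_size is what
 * the frame really cost.  damp selects how strongly to react (0 = strongest). */
static double update_correction_factor(double factor, int actual_size,
                                       int projected_size, int damp,
                                       double min_factor, double max_factor)
{
    int    ratio;            /* actual size as a percentage of the prediction */
    double adjustment_limit;

    if (projected_size <= 0)
        return factor;

    ratio = (100 * actual_size) / projected_size;
    adjustment_limit = (damp == 0) ? 0.75 : (damp == 1) ? 0.375 : 0.25;

    if (ratio > 102)         /* frame came out larger than predicted */
        factor *= (100.0 + (ratio - 100) * adjustment_limit) / 100.0;
    else if (ratio < 99)     /* frame came out smaller than predicted */
        factor *= (100.0 - (100 - ratio) * adjustment_limit) / 100.0;

    if (factor > max_factor) factor = max_factor;
    if (factor < min_factor) factor = min_factor;

    return factor;
}

For example, a frame projected at 40000 bits that actually costs 50000 bits gives a ratio of 125; with damp 0 the factor is scaled by 1.1875, while ratios inside the 99..102 band leave it untouched.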
+ //if ( cpi->this_frame_target > 0 ) + // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target; + if (projected_size_based_on_q > 0) + correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q; + + // More heavily damped adjustment used if we have been oscillating either side of target + switch (damp_var) + { + case 0: + adjustment_limit = 0.75; + break; + case 1: + adjustment_limit = 0.375; + break; + case 2: + default: + adjustment_limit = 0.25; + break; + } + + //if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) ) + if (correction_factor > 102) + { + // We are not already at the worst allowable quality + correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = ((rate_correction_factor * correction_factor) / 100); + + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } + //else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) ) + else if (correction_factor < 99) + { + // We are not already at the best allowable quality + correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = ((rate_correction_factor * correction_factor) / 100); + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + if (cpi->common.frame_type == KEY_FRAME) + cpi->key_frame_rate_correction_factor = rate_correction_factor; + else + { + if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + cpi->gf_rate_correction_factor = rate_correction_factor; + else + cpi->rate_correction_factor = rate_correction_factor; + } +} + +static int estimate_bits_at_q(VP8_COMP *cpi, int Q) +{ + int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]); + + /* Attempt to retain reasonable accuracy without overflow. The cutoff is + * chosen such that the maximum product of Bpm and MBs fits 31 bits. The + * largest Bpm takes 20 bits. + */ + if (cpi->common.MBs > (1 << 11)) + return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs; + else + return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS; + +} + + +int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) +{ + int Q = cpi->active_worst_quality; + + // Reset Zbin OQ value + cpi->zbin_over_quant = 0; + + if (cpi->oxcf.fixed_q >= 0) + { + Q = cpi->oxcf.fixed_q; + + if (cpi->common.frame_type == KEY_FRAME) + { + Q = cpi->oxcf.key_q; + } + else if (cpi->common.refresh_alt_ref_frame) + { + Q = cpi->oxcf.alt_q; + } + else if (cpi->common.refresh_golden_frame) + { + Q = cpi->oxcf.gold_q; + } + + } + else + { + int i; + int last_error = INT_MAX; + int target_bits_per_mb; + int bits_per_mb_at_this_q; + double correction_factor; + + // Select the appropriate correction factor based upon type of frame. 
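After selecting the correction factor, vp8_regulate_q converts the frame target into a per-macroblock target and walks Q up from active_best_quality until the corrected bits-per-MB estimate falls to the target, keeping whichever of the last two Q values came closer. A condensed sketch of just that search loop, with the bits-per-MB table passed in as a plain array in place of vp8_bits_per_mb[frame_type]:

#include <limits.h>

/* Condensed model of the Q search in vp8_regulate_q(): scan from the best
 * allowed Q towards the worst and return the first Q whose corrected
 * bits-per-MB estimate meets the per-MB target. */
static int regulate_q(const int *bits_per_mb, double correction_factor,
                      int target_bits_per_mb,
                      int active_best_quality, int active_worst_quality)
{
    int q = active_worst_quality;   /* fallback if no Q meets the target */
    int last_error = INT_MAX;
    int i = active_best_quality;

    do
    {
        int estimate = (int)(0.5 + correction_factor * bits_per_mb[i]);

        if (estimate <= target_bits_per_mb)
        {
            /* Pick whichever of this Q and the previous one was closer. */
            q = ((target_bits_per_mb - estimate) <= last_error) ? i : i - 1;
            break;
        }

        last_error = estimate - target_bits_per_mb;
    }
    while (++i <= active_worst_quality);

    return q;
}

The real function goes further when even the worst Q overshoots the target: it widens the quantizer zero bin (cpi->zbin_over_quant) step by step, treating each step as roughly a fixed percentage reduction in rate.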
+ if (cpi->common.frame_type == KEY_FRAME) + correction_factor = cpi->key_frame_rate_correction_factor; + else + { + if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + correction_factor = cpi->gf_rate_correction_factor; + else + correction_factor = cpi->rate_correction_factor; + } + + // Calculate required scaling factor based on target frame size and size of frame produced using previous Q + if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) + target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int + else + target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; + + i = cpi->active_best_quality; + + do + { + bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]); + + if (bits_per_mb_at_this_q <= target_bits_per_mb) + { + if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) + Q = i; + else + Q = i - 1; + + break; + } + else + last_error = bits_per_mb_at_this_q - target_bits_per_mb; + } + while (++i <= cpi->active_worst_quality); + + + // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like + // the RD multiplier and zero bin size. + if (Q >= MAXQ) + { + int zbin_oqmax; + + double Factor = 0.99; + double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX; + + if (cpi->common.frame_type == KEY_FRAME) + zbin_oqmax = 0; //ZBIN_OQ_MAX/16 + else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active)) + zbin_oqmax = 16; + else + zbin_oqmax = ZBIN_OQ_MAX; + + /*{ + double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q; + double Oq; + + Factor = Factor/1.2683; + + Oq = pow( Factor, (1.0/-0.165) ); + + if ( Oq > zbin_oqmax ) + Oq = zbin_oqmax; + + cpi->zbin_over_quant = (int)Oq; + }*/ + + // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true. + // The effect will be highly clip dependent and may well have sudden steps. + // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero + // bin and hence decreasing the number of low magnitude non zero coefficients. + while (cpi->zbin_over_quant < zbin_oqmax) + { + cpi->zbin_over_quant ++; + + if (cpi->zbin_over_quant > zbin_oqmax) + cpi->zbin_over_quant = zbin_oqmax; + + bits_per_mb_at_this_q *= (int)Factor; // Each over-ruin step is assumed to equate to approximately 3% reduction in bitrate + Factor += factor_adjustment; + + if (Factor >= 0.999) + Factor = 0.999; + + if (bits_per_mb_at_this_q <= target_bits_per_mb) // Break out if we get down to the target rate + break; + } + + } + } + + return Q; +} + +static int estimate_min_frame_size(VP8_COMP *cpi) +{ + double correction_factor; + int bits_per_mb_at_max_q; + + // This funtion returns a default value for the first few frames untill the correction factor has had time to adapt. + if (cpi->common.current_video_frame < 10) + { + if (cpi->pass == 2) + return (cpi->min_frame_bandwidth); + else + return cpi->per_frame_bandwidth / 3; + } + + /* // Select the appropriate correction factor based upon type of frame. 
+ if ( cpi->common.frame_type == KEY_FRAME ) + correction_factor = cpi->key_frame_rate_correction_factor; + else + { + if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame ) + correction_factor = cpi->gf_rate_correction_factor; + else + correction_factor = cpi->rate_correction_factor; + }*/ + + // We estimate at half the value we get from vp8_bits_per_mb + correction_factor = cpi->rate_correction_factor / 2.0; + + bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]); + + return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS; +} + +void vp8_adjust_key_frame_context(VP8_COMP *cpi) +{ + int i; + int av_key_frames_per_second; + + // Average key frame frequency and size + unsigned int total_weight = 0; + unsigned int av_key_frame_frequency = 0; + unsigned int av_key_frame_bits = 0; + + unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate); + unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth); + + // Clear down mmx registers to allow floating point in what follows + vp8_clear_system_state(); //__asm emms; + + // Update the count of total key frame bits + cpi->tot_key_frame_bits += cpi->projected_frame_size; + + // First key frame at start of sequence is a special case. We have no frequency data. + if (cpi->key_frame_count == 1) + { + av_key_frame_frequency = (int)cpi->output_frame_rate * 2; // Assume a default of 1 kf every 2 seconds + av_key_frame_bits = cpi->projected_frame_size; + av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; // Note output_frame_rate not cpi->output_frame_rate + } + else + { + // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes + for (i = 0; i < KEY_FRAME_CONTEXT; i++) + { + if (i < KEY_FRAME_CONTEXT - 1) + { + cpi->prior_key_frame_size[i] = cpi->prior_key_frame_size[i+1]; + cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1]; + } + else + { + cpi->prior_key_frame_size[KEY_FRAME_CONTEXT - 1] = cpi->projected_frame_size; + cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = cpi->frames_since_key; + } + + av_key_frame_bits += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i]; + av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i]; + total_weight += prior_key_frame_weight[i]; + } + + av_key_frame_bits /= total_weight; + av_key_frame_frequency /= total_weight; + av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; + + } + + // Do we have any key frame overspend to recover? + if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth)) + { + // Update the count of key frame overspend to be recovered in subsequent frames + // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly) + // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits + // allocated than those following other gfs. + cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8; + cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8; + + // Work out how much to try and recover per frame. + // For one pass we estimate the number of frames to spread it over based upon past history. + // For two pass we know how many frames there will be till the next kf. 
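Condensed, that split looks like the sketch below; the 16-frame floor mirrors the two-pass branch that follows, and av_key_frame_frequency (assumed non-zero) plays the role of the weighted average spacing computed above:

/* Rough model of how key-frame overspend is spread over upcoming frames.
 * two_pass selects between the known distance to the next key frame and the
 * one-pass estimate of average key-frame spacing (must be non-zero). */
static int kf_recovery_per_frame(int kf_overspend_bits, int two_pass,
                                 int frames_to_key, int av_key_frame_frequency)
{
    if (two_pass)
    {
        /* Spread over the real distance, but never over fewer than 16 frames. */
        int span = (frames_to_key > 16) ? frames_to_key : 16;
        return kf_overspend_bits / span;
    }

    /* One pass: spread over the historically observed key-frame spacing. */
    return kf_overspend_bits / av_key_frame_frequency;
}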
+ if (cpi->pass == 2) + { + if (cpi->frames_to_key > 16) + cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key; + else + cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16; + } + else + cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency; + } + + cpi->frames_since_key = 0; + cpi->last_key_frame_size = cpi->projected_frame_size; + cpi->key_frame_count++; +} + +void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit) +{ + // Set-up bounds on acceptable frame size: + if (cpi->oxcf.fixed_q >= 0) + { + // Fixed Q scenario: frame size never outranges target (there is no target!) + *frame_under_shoot_limit = 0; + *frame_over_shoot_limit = INT_MAX; + } + else + { + if (cpi->common.frame_type == KEY_FRAME) + { + *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; + } + else + { + if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + { + *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; + } + else + { + // For CBR take buffer fullness into account + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + { + if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1)) + { + // Buffer is too full so relax overshoot and tighten undershoot + *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8; + } + else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1)) + { + // Buffer is too low so relax undershoot and tighten overshoot + *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8; + } + else + { + *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8; + } + } + // VBR + // Note that tighter restrictions here can help quality but hurt encode speed + else + { + *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; + *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8; + } + } + } + } +} diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h new file mode 100644 index 000000000..588c7a823 --- /dev/null +++ b/vp8/encoder/ratectrl.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#if !defined __INC_RATECTRL_H + +#include "onyx_int.h" + +extern void vp8_save_coding_context(VP8_COMP *cpi); +extern void vp8_restore_coding_context(VP8_COMP *cpi); + +extern void vp8_setup_key_frame(VP8_COMP *cpi); +extern void vp8_calc_iframe_target_size(VP8_COMP *cpi); +extern void vp8_calc_pframe_target_size(VP8_COMP *cpi); +extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var); +extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame); +extern void vp8_adjust_key_frame_context(VP8_COMP *cpi); +extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit); + +#endif diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c new file mode 100644 index 000000000..084699628 --- /dev/null +++ b/vp8/encoder/rdopt.c @@ -0,0 +1,2212 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <stdio.h> +#include <math.h> +#include <limits.h> +#include <assert.h> +#include "pragmas.h" + +#include "tokenize.h" +#include "treewriter.h" +#include "onyx_int.h" +#include "modecosts.h" +#include "encodeintra.h" +#include "entropymode.h" +#include "reconinter.h" +#include "reconintra.h" +#include "reconintra4x4.h" +#include "findnearmv.h" +#include "encodemb.h" +#include "quantize.h" +#include "idct.h" +#include "g_common.h" +#include "variance.h" +#include "mcomp.h" + +#include "vpx_mem/vpx_mem.h" +#include "dct.h" +#include "systemdependent.h" + +#define DIAMONDSEARCH 1 +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif + + +void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); + + +#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +/*int RDFUNC( int RM,int DM, int R, int D, int target_r ) +{ + int rd_value; + + rd_value = ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ); + + return rd_value; +}*/ + +#define UVRDFUNC(RM,DM,R,D,target_r) RDFUNC(RM,DM,R,D,target_r) + +#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) + +#define MAXF(a,b) (((a) > (b)) ? 
(a) : (b)) + + +extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2]; +extern const TOKENEXTRA *vp8_dct_value_tokens_ptr; +extern int vp8_dct_value_cost[DCT_MAX_VALUE*2]; +extern int *vp8_dct_value_cost_ptr; + + +const int vp8_auto_speed_thresh[17] = +{ + 1000, + 200, + 150, + 130, + 150, + 125, + 120, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 105 +}; + +const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] = +{ + ZEROMV, + DC_PRED, + + NEARESTMV, + NEARMV, + + ZEROMV, + NEARESTMV, + + ZEROMV, + NEARESTMV, + + NEARMV, + NEARMV, + + V_PRED, + H_PRED, + TM_PRED, + + NEWMV, + NEWMV, + NEWMV, + + SPLITMV, + SPLITMV, + SPLITMV, + + B_PRED, +}; + +const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] = +{ + LAST_FRAME, + INTRA_FRAME, + + LAST_FRAME, + LAST_FRAME, + + GOLDEN_FRAME, + GOLDEN_FRAME, + + ALTREF_FRAME, + ALTREF_FRAME, + + GOLDEN_FRAME, + ALTREF_FRAME, + + INTRA_FRAME, + INTRA_FRAME, + INTRA_FRAME, + + LAST_FRAME, + GOLDEN_FRAME, + ALTREF_FRAME, + + LAST_FRAME, + GOLDEN_FRAME, + ALTREF_FRAME, + + INTRA_FRAME, +}; + +static void fill_token_costs( + unsigned int c [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens], + const vp8_prob p [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] +) +{ + int i, j, k; + + + for (i = 0; i < BLOCK_TYPES; i++) + for (j = 0; j < COEF_BANDS; j++) + for (k = 0; k < PREV_COEF_CONTEXTS; k++) + + vp8_cost_tokens((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree); + +} + +static int rd_iifactor [ 32 ] = { 16, 16, 16, 12, 8, 4, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + + + +// The values in this table should be reviewed +static int sad_per_bit16lut[128] = +{ + 4, 4, 4, 4, 4, 4, 4, 4, // 4 + 4, 4, 4, 4, 4, 4, 4, 4, // 1 + 4, 4, 4, 4, 4, 4, 4, 4, // 2 + 4, 4, 4, 4, 4, 4, 4, 4, // 3 + 4, 4, 4, 4, 4, 4, 4, 4, // 4 + 4, 4, 12, 12, 13, 13, 14, 14, // 5 + 14, 14, 14, 15, 15, 15, 15, 15, // 6 + 15, 15, 15, 15, 15, 15, 15, 15, // 7 + 15, 15, 15, 15, 15, 16, 16, 16, // 8 + 16, 16, 18, 18, 18, 18, 19, 19, // 9 + 19, 19, 19, 19, 19, 19, 19, 19, // 10 + 20, 20, 22, 22, 22, 22, 21, 21, // 11 + 22, 22, 22, 22, 22, 22, 22, 22, // 12 + 22, 22, 22, 22, 22, 22, 22, 22, // 13 + 22, 22, 22, 22, 22, 22, 22, 22, // 14 + 22, 22, 22, 22, 22, 22, 22, 22, // 15 +}; + +static int sad_per_bit4lut[128] = +{ + 4, 4, 4, 4, 4, 4, 4, 4, // 4 + 4, 4, 4, 4, 4, 4, 4, 4, // 1 + 4, 4, 4, 4, 4, 4, 4, 4, // 2 + 4, 4, 4, 4, 4, 4, 4, 4, // 3 + 4, 4, 4, 4, 4, 4, 4, 4, // 4 + 4, 4, 15, 15, 15, 15, 16, 16, // 5 + 16, 17, 17, 17, 17, 17, 17, 17, // 6 + 17, 17, 19, 19, 22, 22, 21, 21, // 7 + 23, 23, 23, 23, 23, 24, 24, 24, // 8 + 25, 25, 27, 27, 27, 27, 28, 28, // 9 + 28, 28, 29, 29, 29, 29, 29, 29, // 10 + 30, 30, 31, 31, 31, 31, 32, 32, // 11 + 34, 34, 34, 34, 34, 34, 34, 34, // 12 + 34, 34, 34, 34, 34, 34, 34, 34, // 13 + 34, 34, 34, 34, 34, 34, 34, 34, // 14 + 34, 34, 34, 34, 34, 34, 34, 34, // 15 +}; + +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex) +{ + cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex]; + cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex]; +} + +void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue) +{ + int q; + int i; + int *thresh; + int threshmult; + + int capped_q = (Qvalue < 160) ? 
Qvalue : 160; + + vp8_clear_system_state(); //__asm emms; + + cpi->RDMULT = (int)((0.00007 * (capped_q * capped_q * capped_q * capped_q)) - (0.0125 * (capped_q * capped_q * capped_q)) + + (2.25 * (capped_q * capped_q)) - (12.5 * capped_q) + 25.0); + + if (cpi->RDMULT < 25) + cpi->RDMULT = 25; + + if (cpi->pass == 2) + { + if (cpi->common.frame_type == KEY_FRAME) + cpi->RDMULT += (cpi->RDMULT * rd_iifactor[0]) / 16; + else if (cpi->next_iiratio > 31) + cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) / 16; + else + cpi->RDMULT += (cpi->RDMULT * rd_iifactor[cpi->next_iiratio]) / 16; + } + + + // Extend rate multiplier along side quantizer zbin increases + if (cpi->zbin_over_quant > 0) + { + // Extend rate multiplier along side quantizer zbin increases + if (cpi->zbin_over_quant > 0) + { + double oq_factor = pow(1.006, cpi->zbin_over_quant); + + if (oq_factor > (1.0 + ((double)cpi->zbin_over_quant / 64.0))) + oq_factor = (1.0 + (double)cpi->zbin_over_quant / 64.0); + + cpi->RDMULT *= (int)oq_factor; + } + } + + cpi->mb.errorperbit = (cpi->RDMULT / 100); + + if (cpi->mb.errorperbit < 1) + cpi->mb.errorperbit = 1; + + vp8_set_speed_features(cpi); + + if (cpi->common.simpler_lpf) + cpi->common.filter_type = SIMPLE_LOOPFILTER; + + q = (int)pow(Qvalue, 1.25); + + if (q < 8) + q = 8; + + if (cpi->ref_frame_flags == VP8_ALT_FLAG) + { + thresh = &cpi->rd_threshes[THR_NEWA]; + threshmult = cpi->sf.thresh_mult[THR_NEWA]; + } + else if (cpi->ref_frame_flags == VP8_GOLD_FLAG) + { + thresh = &cpi->rd_threshes[THR_NEWG]; + threshmult = cpi->sf.thresh_mult[THR_NEWG]; + } + else + { + thresh = &cpi->rd_threshes[THR_NEWMV]; + threshmult = cpi->sf.thresh_mult[THR_NEWMV]; + } + + if (cpi->RDMULT > 1000) + { + cpi->RDDIV = 1; + cpi->RDMULT /= 100; + + for (i = 0; i < MAX_MODES; i++) + { + if (cpi->sf.thresh_mult[i] < INT_MAX) + { + cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100; + } + else + { + cpi->rd_threshes[i] = INT_MAX; + } + + cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i]; + } + } + else + { + cpi->RDDIV = 100; + + for (i = 0; i < MAX_MODES; i++) + { + if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) + { + cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q; + } + else + { + cpi->rd_threshes[i] = INT_MAX; + } + + cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i]; + } + } + + fill_token_costs( + cpi->mb.token_costs, + (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs + ); + + vp8_init_mode_costs(cpi); + +} + +void vp8_auto_select_speed(VP8_COMP *cpi) +{ + int used = cpi->oxcf.cpu_used; + + int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate); + + milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16; + +#if 0 + + if (0) + { + FILE *f; + + f = fopen("speed.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time); + fclose(f); + } + +#endif + + /* + // this is done during parameter valid check + if( used > 16) + used = 16; + if( used < -16) + used = -16; + */ + + if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress) + { + if (cpi->avg_pick_mode_time == 0) + { + cpi->Speed = 4; + } + else + { + if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) + { + cpi->Speed += 2; + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + + if (cpi->Speed > 16) + { + cpi->Speed = 16; + } + } + + if (milliseconds_for_compress * 100 > cpi->avg_encode_time * 
vp8_auto_speed_thresh[cpi->Speed]) + { + cpi->Speed -= 1; + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + + // In real-time mode, cpi->speed is in [4, 16]. + if (cpi->Speed < 4) //if ( cpi->Speed < 0 ) + { + cpi->Speed = 4; //cpi->Speed = 0; + } + } + } + } + else + { + cpi->Speed += 4; + + if (cpi->Speed > 16) + cpi->Speed = 16; + + + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + } +} + +int vp8_block_error_c(short *coeff, short *dqcoeff) +{ + int i; + int error = 0; + + for (i = 0; i < 16; i++) + { + int this_diff = coeff[i] - dqcoeff[i]; + error += this_diff * this_diff; + } + + return error; +} + +int vp8_mbblock_error_c(MACROBLOCK *mb, int dc) +{ + BLOCK *be; + BLOCKD *bd; + int i, j; + int berror, error = 0; + + for (i = 0; i < 16; i++) + { + be = &mb->block[i]; + bd = &mb->e_mbd.block[i]; + + berror = 0; + + for (j = dc; j < 16; j++) + { + int this_diff = be->coeff[j] - bd->dqcoeff[j]; + berror += this_diff * this_diff; + } + + error += berror; + } + + return error; +} + +int vp8_mbuverror_c(MACROBLOCK *mb) +{ + + BLOCK *be; + BLOCKD *bd; + + + int i; + int error = 0; + + for (i = 16; i < 24; i++) + { + be = &mb->block[i]; + bd = &mb->e_mbd.block[i]; + + error += vp8_block_error_c(be->coeff, bd->dqcoeff); + } + + return error; +} + +#if !(CONFIG_REALTIME_ONLY) +static int macro_block_max_error(MACROBLOCK *mb) +{ + int error = 0; + int dc = 0; + BLOCK *be; + int i, j; + int berror; + + dc = !(mb->e_mbd.mbmi.mode == B_PRED || mb->e_mbd.mbmi.mode == SPLITMV); + + for (i = 0; i < 16; i++) + { + be = &mb->block[i]; + + berror = 0; + + for (j = dc; j < 16; j++) + { + int this_diff = be->coeff[j]; + berror += this_diff * this_diff; + } + + error += berror; + } + + for (i = 16; i < 24; i++) + { + be = &mb->block[i]; + berror = 0; + + for (j = 0; j < 16; j++) + { + int this_diff = be->coeff[j]; + berror += this_diff * this_diff; + } + + error += berror; + } + + error <<= 2; + + if (dc) + { + be = &mb->block[24]; + berror = 0; + + for (j = 0; j < 16; j++) + { + int this_diff = be->coeff[j]; + berror += this_diff * this_diff; + } + + error += berror; + } + + error >>= 4; + return error; +} +#endif + +int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src); + unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src); + int uv_stride = x->block[16].src_stride; + + unsigned int sse1 = 0; + unsigned int sse2 = 0; + int mv_row; + int mv_col; + int offset; + int pre_stride = x->e_mbd.block[16].pre_stride; + + vp8_build_uvmvs(&x->e_mbd, 0); + mv_row = x->e_mbd.block[16].bmi.mv.as_mv.row; + mv_col = x->e_mbd.block[16].bmi.mv.as_mv.col; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->e_mbd.pre.u_buffer + offset; + vptr = x->e_mbd.pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2); + VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1); + sse2 += sse1; + } + else + { + VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2); + VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1); + sse2 += sse1; + } + + return sse2; + +} + +#if !(CONFIG_REALTIME_ONLY) +static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) +{ 
+ int c = !type; /* start at coef 0, unless Y with Y2 */ + int eob = b->eob; + int pt ; /* surrounding block/prev coef predictor */ + int cost = 0; + short *qcoeff_ptr = b->qcoeff; + + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + +# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] ) + + for (; c < eob; c++) + { + int v = QC(c); + int t = vp8_dct_value_tokens_ptr[v].Token; + cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t]; + cost += vp8_dct_value_cost_ptr[v]; + pt = vp8_prev_token_class[t]; + } + +# undef QC + + if (c < 16) + cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN]; + + pt = (c != !type); // is eob first coefficient; + *a = *l = pt; + + return cost; +} + +int vp8_rdcost_mby(MACROBLOCK *mb) +{ + int cost = 0; + int b; + TEMP_CONTEXT t, t2; + int type = 0; + + MACROBLOCKD *x = &mb->e_mbd; + + vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4); + vp8_setup_temp_context(&t2, x->above_context[Y2CONTEXT], x->left_context[Y2CONTEXT], 1); + + if (x->mbmi.mode == SPLITMV) + type = 3; + + for (b = 0; b < 16; b++) + cost += cost_coeffs(mb, x->block + b, type, + t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + + if (x->mbmi.mode != SPLITMV) + cost += cost_coeffs(mb, x->block + 24, 1, + t2.a + vp8_block2above[24], t2.l + vp8_block2left[24]); + + return cost; +} + + +static void rd_pick_intra4x4block( + VP8_COMP *cpi, + MACROBLOCK *x, + BLOCK *be, + BLOCKD *b, + B_PREDICTION_MODE *best_mode, + B_PREDICTION_MODE above, + B_PREDICTION_MODE left, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + + int *bestrate, + int *bestratey, + int *bestdistortion) +{ + B_PREDICTION_MODE mode; + int best_rd = INT_MAX; // 1<<30 + int rate = 0; + int distortion; + unsigned int *mode_costs; + + ENTROPY_CONTEXT ta = *a, tempa = *a; + ENTROPY_CONTEXT tl = *l, templ = *l; + + + if (x->e_mbd.frame_type == KEY_FRAME) + { + mode_costs = x->bmode_costs[above][left]; + } + else + { + mode_costs = x->inter_bmode_costs; + } + + for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) + { + int this_rd; + int ratey; + + rate = mode_costs[mode]; + vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode); + + tempa = ta; + templ = tl; + + ratey = cost_coeffs(x, b, 3, &tempa, &templ); + rate += ratey; + distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2; + + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) + { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + *a = tempa; + *l = templ; + } + } + + b->bmi.mode = (B_PREDICTION_MODE)(*best_mode); + vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode); + +} + + +int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion) +{ + MACROBLOCKD *const xd = &mb->e_mbd; + int i; + TEMP_CONTEXT t; + int cost = mb->mbmode_cost [xd->frame_type] [B_PRED]; + int distortion = 0; + int tot_rate_y = 0; + + vp8_intra_prediction_down_copy(xd); + vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4); + + for (i = 0; i < 16; i++) + { + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode; + const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode; + B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d); + + 
rd_pick_intra4x4block( + cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L, + t.a + vp8_block2above[i], + t.l + vp8_block2left[i], &r, &ry, &d); + + cost += r; + distortion += d; + tot_rate_y += ry; + mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode; + } + + *Rate = cost; + *rate_y += tot_rate_y; + *Distortion = distortion; + + return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); +} + +int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion) +{ + + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + int rate, ratey; + unsigned int distortion; + int best_rd = INT_MAX; + + //Y Search for 16x16 intra prediction mode + for (mode = DC_PRED; mode <= TM_PRED; mode++) + { + int this_rd; + int dummy; + rate = 0; + + x->e_mbd.mbmi.mode = mode; + + rate += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode]; + + vp8_encode_intra16x16mbyrd(IF_RTCD(&cpi->rtcd), x); + + ratey = vp8_rdcost_mby(x); + + rate += ratey; + + VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride, &distortion, &dummy); + + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) + { + mode_selected = mode; + best_rd = this_rd; + *Rate = rate; + *rate_y = ratey; + *Distortion = (int)distortion; + } + } + + x->e_mbd.mbmi.mode = mode_selected; + return best_rd; +} + + +static int rd_cost_mbuv(MACROBLOCK *mb) +{ + TEMP_CONTEXT t, t2; + int b; + int cost = 0; + MACROBLOCKD *x = &mb->e_mbd; + + vp8_setup_temp_context(&t, x->above_context[UCONTEXT], x->left_context[UCONTEXT], 2); + vp8_setup_temp_context(&t2, x->above_context[VCONTEXT], x->left_context[VCONTEXT], 2); + + for (b = 16; b < 20; b++) + cost += cost_coeffs(mb, x->block + b, vp8_block2type[b], + t.a + vp8_block2above[b], t.l + vp8_block2left[b]); + + for (b = 20; b < 24; b++) + cost += cost_coeffs(mb, x->block + b, vp8_block2type[b], + t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]); + + return cost; +} + + +unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares +{ + unsigned int sse0, sse1; + int sum0, sum1; + VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0); + VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1); + return (sse0 + sse1); +} + +static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel) +{ + vp8_build_uvmvs(&x->e_mbd, fullpixel); + vp8_encode_inter16x16uvrd(IF_RTCD(&cpi->rtcd), x); + + + *rate = rd_cost_mbuv(x); + *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4; + + return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb); +} + +int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion) +{ + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + int best_rd = INT_MAX; + int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r); + int rate_to; + + for (mode = DC_PRED; mode <= TM_PRED; mode++) + { + int rate; + int distortion; + int this_rd; + + x->e_mbd.mbmi.uv_mode = mode; + vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x); + + rate_to = rd_cost_mbuv(x); + rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.uv_mode]; + + distortion = 
vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x); + + this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb); + + if (this_rd < best_rd) + { + best_rd = this_rd; + d = distortion; + r = rate; + *rate_tokenonly = rate_to; + mode_selected = mode; + } + } + + *rate = r; + *distortion = d; + + x->e_mbd.mbmi.uv_mode = mode_selected; + return best_rd; +} +#endif + +int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]) +{ + vp8_prob p [VP8_MVREFS-1]; + assert(NEARESTMV <= m && m <= SPLITMV); + vp8_mv_ref_probs(p, near_mv_ref_ct); + return vp8_cost_token(vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m); +} + +void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv) +{ + int i; + + x->e_mbd.mbmi.mode = mb; + x->e_mbd.mbmi.mv.as_mv.row = mv->row; + x->e_mbd.mbmi.mv.as_mv.col = mv->col; + + for (i = 0; i < 16; i++) + { + B_MODE_INFO *bmi = &x->e_mbd.block[i].bmi; + bmi->mode = (B_PREDICTION_MODE) mb; + bmi->mv.as_mv.row = mv->row; + bmi->mv.as_mv.col = mv->col; + } +} + +#if !(CONFIG_REALTIME_ONLY) +int vp8_count_labels(int const *labelings) +{ + int i; + int count = 0; + + for (i = 0; i < 16; i++) + { + if (labelings[i] > count) + count = labelings[i]; + } + + return count + 1; +} + + +static int labels2mode( + MACROBLOCK *x, + int const *labelings, int which_label, + B_PREDICTION_MODE this_mode, + MV *this_mv, MV *best_ref_mv, + int *mvcost[2] +) +{ + MACROBLOCKD *const xd = & x->e_mbd; + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + + int cost = 0; + int thismvcost = 0; + + /* We have to be careful retrieving previously-encoded motion vectors. + Ones from this macroblock have to be pulled from the BLOCKD array + as they have not yet made it to the bmi array in our MB_MODE_INFO. */ + + int i = 0; + + do + { + BLOCKD *const d = xd->block + i; + const int row = i >> 2, col = i & 3; + + B_PREDICTION_MODE m; + + if (labelings[i] != which_label) + continue; + + if (col && labelings[i] == labelings[i-1]) + m = LEFT4X4; + else if (row && labelings[i] == labelings[i-4]) + m = ABOVE4X4; + else + { + // the only time we should do costing for new motion vector or mode + // is when we are on a new label (jbb May 08, 2007) + switch (m = this_mode) + { + case NEW4X4 : + thismvcost = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102); + break; + case LEFT4X4: + *this_mv = col ? d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv; + break; + case ABOVE4X4: + *this_mv = row ? d[-4].bmi.mv.as_mv : vp8_above_bmi(mic, i, mis)->mv.as_mv; + break; + case ZERO4X4: + this_mv->row = this_mv->col = 0; + break; + default: + break; + } + + if (m == ABOVE4X4) // replace above with left if same + { + const MV mv = col ? 
d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv; + + if (mv.row == this_mv->row && mv.col == this_mv->col) + m = LEFT4X4; + } + + cost = x->inter_bmode_costs[ m]; + } + + d->bmi.mode = m; + d->bmi.mv.as_mv = *this_mv; + + } + while (++i < 16); + + cost += thismvcost ; + return cost; +} + +static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels, int which_label, TEMP_CONTEXT *t) +{ + int cost = 0; + int b; + MACROBLOCKD *x = &mb->e_mbd; + + + for (b = 0; b < 16; b++) + if (labels[ b] == which_label) + cost += cost_coeffs(mb, x->block + b, 3, + t->a + vp8_block2above[b], + t->l + vp8_block2left[b]); + + return cost; + +} +static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label, const vp8_encodemb_rtcd_vtable_t *rtcd) +{ + int i; + unsigned int distortion = 0; + + for (i = 0; i < 16; i++) + { + if (labels[i] == which_label) + { + BLOCKD *bd = &x->e_mbd.block[i]; + BLOCK *be = &x->block[i]; + + + vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict); + ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16); + x->short_fdct4x4rd(be->src_diff, be->coeff, 32); + + // set to 0 no way to account for 2nd order DC so discount + //be->coeff[0] = 0; + x->quantize_brd(be, bd); + + distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff); + } + } + + return distortion; +} + +static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp8_encodemb_rtcd_vtable_t *rtcd) +{ + int b; + MACROBLOCKD *const x = &mb->e_mbd; + BLOCK *const mb_y2 = mb->block + 24; + BLOCKD *const x_y2 = x->block + 24; + short *Y2DCPtr = mb_y2->src_diff; + BLOCK *beptr; + int d; + + ENCODEMB_INVOKE(rtcd, submby)(mb->src_diff, mb->src.y_buffer, mb->e_mbd.predictor, mb->src.y_stride); + + // Fdct and building the 2nd order block + for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) + { + mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32); + *Y2DCPtr++ = beptr->coeff[0]; + *Y2DCPtr++ = beptr->coeff[16]; + } + + // 2nd order fdct + if (x->mbmi.mode != SPLITMV) + { + mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8); + } + + // Quantization + for (b = 0; b < 16; b++) + { + mb->quantize_brd(&mb->block[b], &mb->e_mbd.block[b]); + } + + // DC predication and Quantization of 2nd Order block + if (x->mbmi.mode != SPLITMV) + { + + { + mb->quantize_brd(mb_y2, x_y2); + } + } + + // Distortion + if (x->mbmi.mode == SPLITMV) + d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 0) << 2; + else + { + d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2; + d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff); + } + + *Distortion = (d >> 4); + + // rate + *Rate = vp8_rdcost_mby(mb); +} + +static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, int compressor_speed, int *mvcost[2], int mvthresh, int fullpixel) +{ + int i, segmentation; + B_PREDICTION_MODE this_mode; + MACROBLOCKD *xc = &x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + BLOCK *c = &x->block[0]; + BLOCKD *e = &x->e_mbd.block[0]; + int const *labels; + int best_segment_rd = INT_MAX; + int best_seg = 0; + int br = 0; + int bd = 0; + int bsr = 0; + int bsd = 0; + int bestsegmentyrate = 0; + + // FIX TO Rd error outrange bug PGW 9 june 2004 + B_PREDICTION_MODE bmodes[16] = {ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4, + ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4, + ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4, + ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4 + }; + + MV bmvs[16]; + int beobs[16]; + + for 
(segmentation = 0; segmentation < VP8_NUMMBSPLITS; segmentation++) + { + int label_count; + int this_segment_rd = 0; + int label_mv_thresh; + int rate = 0; + int sbr = 0; + int sbd = 0; + int UNINITIALIZED_IS_SAFE(sseshift); + int segmentyrate = 0; + + vp8_variance_fn_ptr_t v_fn_ptr; + + TEMP_CONTEXT t; + TEMP_CONTEXT tb; + vp8_setup_temp_context(&t, xc->above_context[Y1CONTEXT], xc->left_context[Y1CONTEXT], 4); + + br = 0; + bd = 0; + + switch (segmentation) + { + case 0: + v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8); + v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8); + v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8); + v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3); + v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d); + sseshift = 3; + break; + case 1: + v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16); + v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16); + v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16); + v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3); + v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d); + sseshift = 3; + break; + case 2: + v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8); + v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8); + v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8); + v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3); + v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d); + sseshift = 2; + break; + case 3: + v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4); + v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4); + v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4); + v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3); + v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d); + sseshift = 0; + break; + } + + labels = vp8_mbsplits[segmentation]; + label_count = vp8_count_labels(labels); + + // 64 makes this threshold really big effectively + // making it so that we very rarely check mvs on + // segments. setting this to 1 would make mv thresh + // roughly equal to what it is for macroblocks + label_mv_thresh = 1 * mvthresh / label_count ; + + // Segmentation method overheads + rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation); + + rate += vp8_cost_mv_ref(SPLITMV, mdcounts); + + this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb); + br += rate; + + for (i = 0; i < label_count; i++) + { + MV mode_mv[B_MODE_COUNT]; + int best_label_rd = INT_MAX; + B_PREDICTION_MODE mode_selected = ZERO4X4; + int j; + int bestlabelyrate = 0; + + b = &x->block[0]; + d = &x->e_mbd.block[0]; + + + // find first label + for (j = 0; j < 16; j++) + if (labels[j] == i) + break; + + c = &x->block[j]; + e = &x->e_mbd.block[j]; + + // search for the best motion vector on this segment + for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++) + { + int distortion; + int this_rd; + int num00; + int labelyrate; + + TEMP_CONTEXT ts; + vp8_setup_temp_context(&ts, &t.a[0], &t.l[0], 4); + + if (this_mode == NEW4X4) + { + int step_param = 0; + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + int n; + int thissme; + int bestsme = INT_MAX; + MV temp_mv; + + // Is the best so far sufficiently good that we cant justify doing and new motion search. 
+ if (best_label_rd < label_mv_thresh) + break; + + { + int sadpb = x->sadperbit4; + + if (cpi->sf.search_method == HEX) + bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr.vf, v_fn_ptr.sdf, x->mvsadcost, mvcost); + else + { + bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost); + + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost); + + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEW4X4].row = temp_mv.row; + mode_mv[NEW4X4].col = temp_mv.col; + } + } + } + } + + // Should we do a full search (best quality only) + if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000) + { + thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, &v_fn_ptr, x->mvcost, x->mvsadcost); + + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEW4X4] = e->bmi.mv.as_mv; + } + else + { + // The full search result is actually worse so re-instate the previous best vector + e->bmi.mv.as_mv = mode_mv[NEW4X4]; + } + } + } + + if (bestsme < INT_MAX) + { + if (!fullpixel) + cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr.svf, v_fn_ptr.vf, mvcost); + else + vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr.svf, v_fn_ptr.vf, mvcost); + } + } + + rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode], best_ref_mv, mvcost); + + // Trap vectors that reach beyond the UMV borders + if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max)) + { + continue; + } + + distortion = vp8_encode_inter_mb_segment(x, labels, i, IF_RTCD(&cpi->rtcd.encodemb)) / 4; + + labelyrate = rdcost_mbsegment_y(x, labels, i, &ts); + rate += labelyrate; + + this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb); + + if (this_rd < best_label_rd) + { + sbr = rate; + sbd = distortion; + bestlabelyrate = labelyrate; + mode_selected = this_mode; + best_label_rd = this_rd; + vp8_setup_temp_context(&tb, &ts.a[0], &ts.l[0], 4); + + } + } + + vp8_setup_temp_context(&t, &tb.a[0], &tb.l[0], 4); + + labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], best_ref_mv, mvcost); + + br += sbr; + bd += sbd; + segmentyrate += bestlabelyrate; + this_segment_rd += best_label_rd; + + if ((this_segment_rd > best_rd) || (this_segment_rd > best_segment_rd)) + break; + } + + if ((this_segment_rd <= best_rd) && (this_segment_rd < best_segment_rd)) + { + bsr = br; + bsd = bd; + bestsegmentyrate = segmentyrate; + best_segment_rd = this_segment_rd; + best_seg = segmentation; + + // store everything needed to come back to this!! 
+ for (i = 0; i < 16; i++) + { + BLOCKD *bd = &x->e_mbd.block[i]; + + bmvs[i] = bd->bmi.mv.as_mv; + bmodes[i] = bd->bmi.mode; + beobs[i] = bd->eob; + } + } + } + + // set it to the best + for (i = 0; i < 16; i++) + { + BLOCKD *bd = &x->e_mbd.block[i]; + + bd->bmi.mv.as_mv = bmvs[i]; + bd->bmi.mode = bmodes[i]; + bd->eob = beobs[i]; + } + + // Trap cases where the best split mode has all vectors coded 0,0 (or all the same) + if (FALSE) + { + int allsame = 1; + + for (i = 1; i < 16; i++) + { + if ((bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row)) + { + allsame = 0; + break; + } + } + + if (allsame) + { + best_segment_rd = INT_MAX; + } + } + + *returntotrate = bsr; + *returndistortion = bsd; + *returnyrate = bestsegmentyrate; + + + + // save partitions + labels = vp8_mbsplits[best_seg]; + x->e_mbd.mbmi.partitioning = best_seg; + x->e_mbd.mbmi.partition_count = vp8_count_labels(labels); + + for (i = 0; i < x->e_mbd.mbmi.partition_count; i++) + { + int j; + + for (j = 0; j < 16; j++) + { + if (labels[j] == i) + break; + } + + x->e_mbd.mbmi.partition_bmi[i].mode = x->e_mbd.block[j].bmi.mode; + x->e_mbd.mbmi.partition_bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv; + } + + return best_segment_rd; +} + + +int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) +{ + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + B_MODE_INFO best_bmodes[16]; + MB_MODE_INFO best_mbmode; + MV best_ref_mv; + MV mode_mv[MB_MODE_COUNT]; + MB_PREDICTION_MODE this_mode; + int num00; + int best_mode_index = 0; + + int i; + int mode_index; + int mdcounts[4]; + int rate; + int distortion; + int best_rd = INT_MAX; // 1 << 30; + int ref_frame_cost[MAX_REF_FRAMES]; + int rate2, distortion2; + int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly; + int rate_y, UNINITIALIZED_IS_SAFE(rate_uv); + + //int all_rds[MAX_MODES]; // Experimental debug code. + //int all_rates[MAX_MODES]; + //int all_dist[MAX_MODES]; + //int intermodecost[MAX_MODES]; + + MB_PREDICTION_MODE uv_intra_mode; + int sse; + int sum; + int uvintra_eob = 0; + int tteob = 0; + int force_no_skip = 0; + + *returnintra = INT_MAX; + + cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame + + x->skip = 0; + + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded); + + // Experimental code + // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb + //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) ) + /*{ + int tmprdmult; + + //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs); + tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb; + //tmprdmult = tmprdmult; + + //if ( tmprdmult > cpi->RDMULT * 2 ) + // tmprdmult = cpi->RDMULT * 2; + //else if ( tmprdmult < cpi->RDMULT / 2 ) + // tmprdmult = cpi->RDMULT / 2; + + //tmprdmult = (tmprdmult < 25) ? 
25 : tmprdmult; + + //x->rdmult = tmprdmult; + + }*/ + + // Special case treatment when GF and ARF are not sensible options for reference + if (cpi->ref_frame_flags == VP8_LAST_FLAG) + { + ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(255); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(255) + + vp8_cost_zero(128); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(255) + + vp8_cost_one(128); + } + else + { + ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(cpi->prob_last_coded); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_zero(cpi->prob_gf_coded); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(cpi->prob_last_coded) + + vp8_cost_one(cpi->prob_gf_coded); + } + + vpx_memset(mode_mv, 0, sizeof(mode_mv)); + + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion); + uv_intra_mode = x->e_mbd.mbmi.uv_mode; + { + uvintra_eob = 0; + + for (i = 16; i < 24; i++) + uvintra_eob += x->e_mbd.block[i].eob; + } + + for (mode_index = 0; mode_index < MAX_MODES; mode_index++) + { + int frame_cost; + int this_rd = INT_MAX; + int lf_or_gf = 0; // Lat Frame (01) or gf/arf (1) + int disable_skip = 0; + + force_no_skip = 0; + + // Experimental debug code. + // Record of rd values recorded for this MB. -1 indicates not measured + //all_rds[mode_index] = -1; + //all_rates[mode_index] = -1; + //all_dist[mode_index] = -1; + //intermodecost[mode_index] = -1; + + // Test best rd so far against threshold for trying this mode. + if (best_rd <= cpi->rd_threshes[mode_index]) + continue; + + + + // These variables hold are rolling total cost and distortion for this mode + rate2 = 0; + distortion2 = 0; + + // Where skip is allowable add in the default per mb cost for the no skip case. + // where we then decide to skip we have to delete this and replace it with the + // cost of signallying a skip + if (cpi->common.mb_no_coeff_skip) + { + rate2 += vp8_cost_bit(cpi->prob_skip_false, 0); + } + + this_mode = vp8_mode_order[mode_index]; + + x->e_mbd.mbmi.mode = this_mode; + x->e_mbd.mbmi.uv_mode = DC_PRED; + x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + + //Only consider ZEROMV/ALTREF_FRAME for alt ref frame. 
+ if (cpi->is_src_frame_alt_ref) + { + if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME) + continue; + } + + if (x->e_mbd.mbmi.ref_frame == LAST_FRAME) + { + if (!(cpi->ref_frame_flags & VP8_LAST_FLAG)) + continue; + + lf_or_gf = 0; // Local last frame vs Golden frame flag + + // Set up pointers for this macro block into the previous frame recon buffer + x->e_mbd.pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset; + x->e_mbd.pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset; + x->e_mbd.pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset; + } + else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME) + { + + // not supposed to reference gold frame + if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG)) + continue; + + lf_or_gf = 1; // Local last frame vs Golden frame flag + + // Set up pointers for this macro block into the previous frame recon buffer + x->e_mbd.pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset; + x->e_mbd.pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset; + x->e_mbd.pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset; + } + else if (x->e_mbd.mbmi.ref_frame == ALTREF_FRAME) + { + // not supposed to reference alt ref frame + if (!(cpi->ref_frame_flags & VP8_ALT_FLAG)) + continue; + + //if ( !cpi->source_alt_ref_active ) + // continue; + + lf_or_gf = 1; // Local last frame vs Golden frame flag + + // Set up pointers for this macro block into the previous frame recon buffer + x->e_mbd.pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset; + x->e_mbd.pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset; + x->e_mbd.pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset; + } + + vp8_find_near_mvs(&x->e_mbd, + x->e_mbd.mode_info_context, + &mode_mv[NEARESTMV], &mode_mv[NEARMV], &best_ref_mv, + mdcounts, x->e_mbd.mbmi.ref_frame, cpi->common.ref_frame_sign_bias); + + + // Estimate the reference frame signaling cost and add it to the rolling cost variable. + frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame]; + rate2 += frame_cost; + + if (this_mode <= B_PRED) + { + for (i = 0; i < 16; i++) + { + vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO)); + } + } + + // Check to see if the testing frequency for this mode is at its max + // If so then prevent it from being tested and increase the threshold for its testing + if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + { + if (cpi->mbs_tested_so_far <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index]) + { + // Increase the threshold for coding this mode to make it less likely to be chosen + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + + continue; + } + } + + // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested + cpi->mode_test_hit_counts[mode_index] ++; + + // Experimental code. Special case for gf and arf zeromv modes. 
Increase zbin size to supress noise + if (cpi->zbin_mode_boost_enabled) + { + if ((vp8_mode_order[mode_index] == ZEROMV) && (vp8_ref_frame_order[mode_index] != LAST_FRAME)) + cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; + else + cpi->zbin_mode_boost = 0; + + vp8cx_mb_init_quantizer(cpi, x); + } + + switch (this_mode) + { + case B_PRED: + + // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED]; + vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion); + rate2 += rate; + //rate_y = rate; + distortion2 += distortion; + rate2 += uv_intra_rate; + rate_uv = uv_intra_rate_tokenonly; + distortion2 += uv_intra_distortion; + break; + + case SPLITMV: + { + int frame_cost_rd = RDFUNC(x->rdmult, x->rddiv, frame_cost, 0, cpi->target_bits_per_mb); + int saved_rate = rate2; + + // vp8_rd_pick_best_mbsegmentation looks only at Y and does not account for frame_cost. + // (best_rd - frame_cost_rd) is thus a conservative breakout number. + int breakout_rd = best_rd - frame_cost_rd; + int tmp_rd; + + if (x->e_mbd.mbmi.ref_frame == LAST_FRAME) + tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWMV], cpi->common.full_pixel) ; + else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME) + tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWG], cpi->common.full_pixel) ; + else + tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWA], cpi->common.full_pixel) ; + + rate2 += rate; + distortion2 += distortion; + + // If even the 'Y' rd value of split is higher than best so far then dont bother looking at UV + if (tmp_rd < breakout_rd) + { + // Now work out UV cost and add it in + vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel); + rate2 += rate; + rate_uv = rate; + distortion2 += distortion; + + } + else + { + this_rd = INT_MAX; + disable_skip = 1; + } + + // Trap cases where the best split mode has all vectors coded 0,0 (or all the same) + if (0) + { + int allsame = 1; + + for (i = 1; i < 16; i++) + { + BLOCKD *bd = &x->e_mbd.block[i]; + + if (bd->bmi.mv.as_int != x->e_mbd.block[0].bmi.mv.as_int) //(bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row ) ) + { + allsame = 0; + break; + } + } + + if (allsame) + { + // reset mode and mv and jump to newmv + this_mode = NEWMV; + distortion2 = 0; + rate2 = saved_rate; + mode_mv[NEWMV].row = x->e_mbd.block[0].bmi.mv.as_mv.row; + mode_mv[NEWMV].col = x->e_mbd.block[0].bmi.mv.as_mv.col; + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96); + goto mv_selected; + } + } + + // trap cases where the 8x8s can be promoted to 8x16s or 16x8s + if (0)//x->e_mbd.mbmi.partition_count == 4) + { + + if (x->e_mbd.mbmi.partition_bmi[0].mv.as_int == x->e_mbd.mbmi.partition_bmi[1].mv.as_int + && x->e_mbd.mbmi.partition_bmi[2].mv.as_int == x->e_mbd.mbmi.partition_bmi[3].mv.as_int) + { + const int *labels = vp8_mbsplits[2]; + x->e_mbd.mbmi.partitioning = 0; + rate -= vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + 2); + rate += vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings); + //rate -= x->inter_bmode_costs[ x->e_mbd.mbmi.partition_bmi[1]]; + //rate -= 
x->inter_bmode_costs[ x->e_mbd.mbmi.partition_bmi[3]]; + x->e_mbd.mbmi.partition_bmi[1] = x->e_mbd.mbmi.partition_bmi[2]; + } + } + + } + break; + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: + x->e_mbd.mbmi.ref_frame = INTRA_FRAME; + vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + { + macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ; + rate2 += rate; + rate_y = rate; + distortion2 += distortion; + rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode]; + rate2 += uv_intra_rate; + rate_uv = uv_intra_rate_tokenonly; + distortion2 += uv_intra_distortion; + } + break; + + case NEWMV: + + // Decrement full search counter + if (cpi->check_freq[lf_or_gf] > 0) + cpi->check_freq[lf_or_gf] --; + + { + int thissme; + int bestsme = INT_MAX; + int step_param = cpi->sf.first_step; + int search_range; + int further_steps; + int n; + + // Work out how long a search we should do + search_range = MAXF(abs(best_ref_mv.col), abs(best_ref_mv.row)) >> 3; + + if (search_range >= x->vector_range) + x->vector_range = search_range; + else if (x->vector_range > cpi->sf.min_fs_radius) + x->vector_range--; + + // Initial step/diamond search + { + int sadpb = x->sadperbit16; + + if (cpi->sf.search_method == HEX) + { + bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost); + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9 + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + + // Further step/diamond searches as necessary + n = 0; + further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9 + + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + } + else + { + d->bmi.mv.as_mv.row = mode_mv[NEWMV].row; + d->bmi.mv.as_mv.col = mode_mv[NEWMV].col; + } + } + } + } + + } + + // Should we do a full search + if (!cpi->check_freq[lf_or_gf] || cpi->do_full[lf_or_gf]) + { + int thissme; + int full_flag_thresh = 0; + + // Update x->vector_range based on best vector found in step search + search_range = MAXF(abs(d->bmi.mv.as_mv.row), abs(d->bmi.mv.as_mv.col)); + + if (search_range > x->vector_range) + x->vector_range = search_range; + else + search_range = x->vector_range; + + // Apply limits + search_range = (search_range > cpi->sf.max_fs_radius) ? 
cpi->sf.max_fs_radius : search_range; + { + int sadpb = x->sadperbit16 >> 2; + thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr, x->mvcost, x->mvsadcost); + } + + // Barrier threshold to initiating full search + // full_flag_thresh = 10 + (thissme >> 7); + if ((thissme + full_flag_thresh) < bestsme) + { + cpi->do_full[lf_or_gf] ++; + bestsme = thissme; + } + else if (thissme < bestsme) + bestsme = thissme; + else + { + cpi->do_full[lf_or_gf] = cpi->do_full[lf_or_gf] >> 1; + cpi->check_freq[lf_or_gf] = cpi->sf.full_freq[lf_or_gf]; + + // The full search result is actually worse so re-instate the previous best vector + d->bmi.mv.as_mv.row = mode_mv[NEWMV].row; + d->bmi.mv.as_mv.col = mode_mv[NEWMV].col; + } + } + + if (bestsme < INT_MAX) + // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost); // normal mvc=11 + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, cpi->fn_ptr.svf, cpi->fn_ptr.vf, x->mvcost); + + mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; + mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; + + // Add the new motion vector cost to our rolling cost variable + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96); + + } + + case NEARESTMV: + case NEARMV: + + // Clip "next_nearest" so that it does not extend to far out of image + if (mode_mv[this_mode].col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) + mode_mv[this_mode].col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + else if (mode_mv[this_mode].col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) + mode_mv[this_mode].col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + + if (mode_mv[this_mode].row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) + mode_mv[this_mode].row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + else if (mode_mv[this_mode].row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) + mode_mv[this_mode].row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; + + // Do not bother proceeding if the vector (from newmv,nearest or near) is 0,0 as this should then be coded using the zeromv mode. + if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && + ((mode_mv[this_mode].row == 0) && (mode_mv[this_mode].col == 0))) + continue; + + case ZEROMV: + + mv_selected: + + // Trap vectors that reach beyond the UMV borders + // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point + // because of the lack of break statements in the previous two cases. + if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max)) + continue; + + vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); + vp8_build_inter_predictors_mby(&x->e_mbd); + VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + { + x->skip = 1; + } + else if (sse < x->encode_breakout) + { + // Check u and v to make sure skip is ok + int sse2 = 0; + + sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); + + if (sse2 * 2 < x->encode_breakout) + { + x->skip = 1; + distortion2 = sse; + rate2 = 500; + + disable_skip = 1; // We have no real rate data so trying to adjust for rate_y and rate_uv below will cause problems. 
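RDFUNC, used on the next line and throughout this function, is defined elsewhere in this file and is not visible in this hunk; its call sites show it taking x->rdmult, x->rddiv, a rate, a distortion and the per-MB bit target. Mode decisions of this kind generally minimise a Lagrangian cost of the form J = D + lambda * R. A generic, hedged sketch of such a cost, with illustrative scaling that is not the actual macro:

    /* Hedged sketch of a Lagrangian rate-distortion cost. The real RDFUNC
       macro applies its own fixed-point scaling via rdmult/rddiv; this only
       illustrates the general form being minimised. */
    static int rd_cost_sketch(int rate, int distortion, int lambda)
    {
        /* rate in 256ths of a bit, distortion as a sum of squared errors;
           lambda trades bits against error for the current quantizer. */
        return distortion + ((lambda * rate + 128) >> 8);
    }

In the breakout path above, rate2 and distortion2 are rough stand-ins (a fixed 500 and the luma SSE), which is why disable_skip is set so the later rate_y/rate_uv adjustment is not applied to them.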
+ this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb); + + break; // (PGW) Move break here from below - for now at least + } + else + x->skip = 0; + } + + //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts); // Experimental debug code + + // Add in the Mv/mode cost + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + + // Y cost and distortion + macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb)); + rate2 += rate; + rate_y = rate; + distortion2 += distortion; + + // UV cost and distortion + vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel); + rate2 += rate; + rate_uv = rate; + distortion2 += distortion; + break; + + default: + break; + } + + if (!disable_skip) + { + // Test for the condition where skip block will be activated because there are no non zero coefficients and make any necessary adjustment for rate + if (cpi->common.mb_no_coeff_skip) + { + tteob = 0; + + for (i = 0; i <= 24; i++) + { + tteob += x->e_mbd.block[i].eob; + } + + if (tteob == 0) + { +#if 1 + rate2 -= (rate_y + rate_uv); + + // Back out no skip flag costing and add in skip flag costing + if (cpi->prob_skip_false) + { + rate2 += vp8_cost_bit(cpi->prob_skip_false, 1); + rate2 -= vp8_cost_bit(cpi->prob_skip_false, 0); + } + +#else + int rateuseskip; + int ratenotuseskip; + + + + ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0); + rateuseskip = vp8_cost_bit(cpi->prob_skip_false, 1); + + if (1) // rateuseskip<ratenotuseskip) + { + rate2 -= ratenotuseskip; + rate2 += rateuseskip; + force_no_skip = 0; + } + else + { + force_no_skip = 1; + } + +#endif + } + +#if 0 + else + { + int rateuseskip; + int ratenotuseskip; + int maxdistortion; + int minrate; + int skip_rd; + + // distortion when no coeff is encoded + maxdistortion = macro_block_max_error(x); + + ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0); + rateuseskip = vp8_cost_bit(cpi->prob_skip_false, 1); + + minrate = rateuseskip - ratenotuseskip; + + skip_rd = RDFUNC(x->rdmult, x->rddiv, minrate, maxdistortion - distortion2, cpi->target_bits_per_mb); + + if (skip_rd + 50 < 0 && x->e_mbd.mbmi.ref_frame != INTRA_FRAME && rate_y + rate_uv < 4000) + { + force_no_skip = 1; + rate2 = rate2 + rateuseskip - ratenotuseskip; + distortion2 = maxdistortion; + } + else + { + force_no_skip = 0; + } + + } + +#endif + + } + + // Calculate the final RD estimate for this mode + this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb); + } + + // Experimental debug code. + //all_rds[mode_index] = this_rd; + //all_rates[mode_index] = rate2; + //all_dist[mode_index] = distortion2; + + if ((x->e_mbd.mbmi.ref_frame == INTRA_FRAME) && (this_rd < *returnintra)) + { + *returnintra = this_rd ; + } + + // Did this mode help.. i.i is it the new best mode + if (this_rd < best_rd || x->skip) + { + // Note index of best mode so far + best_mode_index = mode_index; + x->e_mbd.mbmi.force_no_skip = force_no_skip; + + if (this_mode <= B_PRED) + { + x->e_mbd.mbmi.uv_mode = uv_intra_mode; + } + + *returnrate = rate2; + *returndistortion = distortion2; + best_rd = this_rd; + vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO)); + + for (i = 0; i < 16; i++) + { + vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO)); + } + + // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time + cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? 
cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + else + { + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + if (x->skip) + break; + } + + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) + { + int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2); + + cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; + cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; + + // If we chose a split mode then reset the new MV thresholds as well + /*if ( vp8_mode_order[best_mode_index] == SPLITMV ) + { + best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWMV] >> 4); + cpi->rd_thresh_mult[THR_NEWMV] = (cpi->rd_thresh_mult[THR_NEWMV] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWMV]-best_adjustment: MIN_THRESHMULT; + cpi->rd_threshes[THR_NEWMV] = (cpi->rd_baseline_thresh[THR_NEWMV] >> 7) * cpi->rd_thresh_mult[THR_NEWMV]; + + best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWG] >> 4); + cpi->rd_thresh_mult[THR_NEWG] = (cpi->rd_thresh_mult[THR_NEWG] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWG]-best_adjustment: MIN_THRESHMULT; + cpi->rd_threshes[THR_NEWG] = (cpi->rd_baseline_thresh[THR_NEWG] >> 7) * cpi->rd_thresh_mult[THR_NEWG]; + + best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWA] >> 4); + cpi->rd_thresh_mult[THR_NEWA] = (cpi->rd_thresh_mult[THR_NEWA] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWA]-best_adjustment: MIN_THRESHMULT; + cpi->rd_threshes[THR_NEWA] = (cpi->rd_baseline_thresh[THR_NEWA] >> 7) * cpi->rd_thresh_mult[THR_NEWA]; + }*/ + + } + + // If we have chosen new mv or split then decay the full search check count more quickly. + if ((vp8_mode_order[best_mode_index] == NEWMV) || (vp8_mode_order[best_mode_index] == SPLITMV)) + { + int lf_or_gf = (vp8_ref_frame_order[best_mode_index] == LAST_FRAME) ? 0 : 1; + + if (cpi->check_freq[lf_or_gf] && !cpi->do_full[lf_or_gf]) + { + cpi->check_freq[lf_or_gf] --; + } + } + + // Keep a record of best mode index that we chose + cpi->last_best_mode_index = best_mode_index; + + // Note how often each mode chosen as best + cpi->mode_chosen_counts[best_mode_index] ++; + + + if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) + { + best_mbmode.mode = ZEROMV; + best_mbmode.ref_frame = ALTREF_FRAME; + best_mbmode.mv.as_int = 0; + best_mbmode.uv_mode = 0; + best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + best_mbmode.partitioning = 0; + best_mbmode.dc_diff = 0; + + vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + for (i = 0; i < 16; i++) + { + vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO)); + } + + x->e_mbd.mbmi.mv.as_int = 0; + + return best_rd; + } + + + // macroblock modes + vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + for (i = 0; i < 16; i++) + { + vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO)); + } + + x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv; + + return best_rd; +} +#endif + diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h new file mode 100644 index 000000000..c6eae4b92 --- /dev/null +++ b/vp8/encoder/rdopt.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __INC_RDOPT_H +#define __INC_RDOPT_H +void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); +int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion); +int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion); +int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion); +extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); + + +#endif diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c new file mode 100644 index 000000000..74c6bd76a --- /dev/null +++ b/vp8/encoder/sad_c.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <stdlib.h> + +unsigned int vp8_sad16x16_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad) +{ + + int r, c; + unsigned int sad = 0; + + for (r = 0; r < 16; r++) + { + for (c = 0; c < 16; c++) + { + sad += abs(src_ptr[c] - ref_ptr[c]); + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + return sad; +} + + +static __inline +unsigned int sad_mx_n_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int m, + int n) +{ + + int r, c; + unsigned int sad = 0; + + for (r = 0; r < n; r++) + { + for (c = 0; c < m; c++) + { + sad += abs(src_ptr[c] - ref_ptr[c]); + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + return sad; +} + + +unsigned int vp8_sad8x8_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad) +{ + + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); +} + + +unsigned int vp8_sad16x8_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad) +{ + + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); + +} + + +unsigned int vp8_sad8x16_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad) +{ + + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); +} + + +unsigned int vp8_sad4x4_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_sad) +{ + + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); +} + +void vp8_sad16x16x3_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad16x8x3_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad8x8x3_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad8x16x3_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad4x4x3_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad4x4_c(src_ptr, 
src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad16x16x4d_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} + +void vp8_sad16x8x4d_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} + +void vp8_sad8x8x4d_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} + +void vp8_sad8x16x4d_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} + +void vp8_sad4x4x4d_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array +) +{ + sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c new file mode 100644 index 000000000..df214a89f --- /dev/null +++ b/vp8/encoder/ssim.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_scale/yv12config.h" +#include "math.h" + +#define C1 (float)(64 * 64 * 0.01*255*0.01*255) +#define C2 (float)(64 * 64 * 0.03*255*0.03*255) + +static int width_y; +static int height_y; +static int height_uv; +static int width_uv; +static int stride_uv; +static int stride; +static int lumimask; +static int luminance; +static double plane_summed_weights = 0; + +static short img12_sum_block[8*4096*4096*2] ; + +static short img1_sum[8*4096*2]; +static short img2_sum[8*4096*2]; +static int img1_sq_sum[8*4096*2]; +static int img2_sq_sum[8*4096*2]; +static int img12_mul_sum[8*4096*2]; + + +double vp8_similarity +( + int mu_x, + int mu_y, + int pre_mu_x2, + int pre_mu_y2, + int pre_mu_xy2 +) +{ + int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy; + + mu_x2 = mu_x * mu_x; + mu_y2 = mu_y * mu_y; + mu_xy = mu_x * mu_y; + + theta_x2 = 64 * pre_mu_x2 - mu_x2; + theta_y2 = 64 * pre_mu_y2 - mu_y2; + theta_xy = 64 * pre_mu_xy2 - mu_xy; + + return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2)); +} + +double vp8_ssim +( + const unsigned char *img1, + const unsigned char *img2, + int stride_img1, + int stride_img2, + int width, + int height +) +{ + int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp; + + double plane_quality, weight, mean; + + short *img1_sum_ptr1, *img1_sum_ptr2; + short *img2_sum_ptr1, *img2_sum_ptr2; + int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2; + int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2; + int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2; + + plane_quality = 0; + + if (lumimask) + plane_summed_weights = 0.0f; + else + plane_summed_weights = (height - 7) * (width - 7); + + //some prologue for the main loop + temp = 8 * width; + + img1_sum_ptr1 = img1_sum + temp; + img2_sum_ptr1 = img2_sum + temp; + img1_sq_sum_ptr1 = img1_sq_sum + temp; + img2_sq_sum_ptr1 = img2_sq_sum + temp; + img12_mul_sum_ptr1 = img12_mul_sum + temp; + + for (x = 0; x < width; x++) + { + img1_sum[x] = img1[x]; + img2_sum[x] = img2[x]; + img1_sq_sum[x] = img1[x] * img1[x]; + img2_sq_sum[x] = img2[x] * img2[x]; + img12_mul_sum[x] = img1[x] * img2[x]; + + img1_sum_ptr1[x] = 0; + img2_sum_ptr1[x] = 0; + img1_sq_sum_ptr1[x] = 0; + img2_sq_sum_ptr1[x] = 0; + img12_mul_sum_ptr1[x] = 0; + } + + //the main loop + for (y = 1; y < height; y++) + { + img1 += stride_img1; + img2 += stride_img2; + + temp = (y - 1) % 9 * width; + + img1_sum_ptr1 = img1_sum + temp; + img2_sum_ptr1 = img2_sum + temp; + img1_sq_sum_ptr1 = img1_sq_sum + temp; + img2_sq_sum_ptr1 = img2_sq_sum + temp; + img12_mul_sum_ptr1 = img12_mul_sum + temp; + + temp = y % 9 * width; + + img1_sum_ptr2 = img1_sum + temp; + img2_sum_ptr2 = img2_sum + temp; + img1_sq_sum_ptr2 = img1_sq_sum + temp; + img2_sq_sum_ptr2 = img2_sq_sum + temp; + img12_mul_sum_ptr2 = img12_mul_sum + temp; + + for (x = 0; x < width; x++) + { + img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x]; + img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x]; + img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x]; + img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x]; + img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x]; + } + + if (y > 6) + { + //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum + temp = (y + 1) % 9 * width; + + img1_sum_ptr1 = img1_sum + temp; + img2_sum_ptr1 = img2_sum + temp; + img1_sq_sum_ptr1 = img1_sq_sum + temp; + img2_sq_sum_ptr1 = img2_sq_sum + temp; + img12_mul_sum_ptr1 = img12_mul_sum + temp; + 
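The pointer setup above and the loops below implement a rolling-sum scheme: per-column sums of the last 8 rows are kept in a ring of 9 row buffers (the % 9 indexing), and an 8-wide window is then slid across those column sums so each 8x8 block statistic is updated in constant time. A small, self-contained sketch of the same idea on a 1-D signal, with illustrative names only:

    /* Hedged 1-D sketch of the sliding-window sum used for the 8x8 block
       statistics: add the sample entering the window, drop the one leaving. */
    #define SSIM_WINDOW 8

    static unsigned int sliding_sum_step(unsigned int sum,
                                         const unsigned char *samples, int i)
    {
        sum += samples[i];                     /* new sample enters the window */

        if (i >= SSIM_WINDOW)
            sum -= samples[i - SSIM_WINDOW];   /* old sample leaves the window */

        return sum;
    }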
+ for (x = 0; x < width; x++) + { + img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x]; + img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x]; + img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x]; + img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x]; + img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x]; + } + + //here we calculate the sum over the 8x8 block of pixels + //this is done by sliding a window across the column sums for the last 8 lines + //each time adding the new column sum, and subtracting the one which fell out of the window + img1_block = 0; + img2_block = 0; + img1_sq_block = 0; + img2_sq_block = 0; + img12_mul_block = 0; + + //prologue, and calculation of simularity measure from the first 8 column sums + for (x = 0; x < 8; x++) + { + img1_block += img1_sum_ptr1[x]; + img2_block += img2_sum_ptr1[x]; + img1_sq_block += img1_sq_sum_ptr1[x]; + img2_sq_block += img2_sq_sum_ptr1[x]; + img12_mul_block += img12_mul_sum_ptr1[x]; + } + + if (lumimask) + { + y2 = y - 7; + x2 = 0; + + if (luminance) + { + mean = (img2_block + img1_block) / 128.0f; + + if (!(y2 % 2 || x2 % 2)) + *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; + } + else + { + mean = *(img12_sum_block + y2 * width_uv + x2); + mean += *(img12_sum_block + y2 * width_uv + x2 + 4); + mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); + mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); + + mean /= 512.0f; + } + + weight = mean < 40 ? 0.0f : + (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); + plane_summed_weights += weight; + + plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); + } + else + plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); + + //and for the rest + for (x = 8; x < width; x++) + { + img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8]; + img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8]; + img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8]; + img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8]; + img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8]; + + if (lumimask) + { + y2 = y - 7; + x2 = x - 7; + + if (luminance) + { + mean = (img2_block + img1_block) / 128.0f; + + if (!(y2 % 2 || x2 % 2)) + *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; + } + else + { + mean = *(img12_sum_block + y2 * width_uv + x2); + mean += *(img12_sum_block + y2 * width_uv + x2 + 4); + mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); + mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); + + mean /= 512.0f; + } + + weight = mean < 40 ? 0.0f : + (mean < 50 ? 
(mean - 40.0f) / 10.0f : 1.0f); + plane_summed_weights += weight; + + plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); + } + else + plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); + } + } + } + + if (plane_summed_weights == 0) + return 1.0f; + else + return plane_quality / plane_summed_weights; +} + +double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight +) +{ + double a, b, c; + double frame_weight; + double ssimv; + + width_y = source->y_width; + height_y = source->y_height; + height_uv = source->uv_height; + width_uv = source->uv_width; + stride_uv = dest->uv_stride; + stride = dest->y_stride; + + lumimask = lumamask; + + luminance = 1; + a = vp8_ssim(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, source->y_height); + luminance = 0; + + frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7)); + + if (frame_weight == 0) + a = b = c = 1.0f; + else + { + b = vp8_ssim(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); + + c = vp8_ssim(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); + } + + ssimv = a * .8 + .1 * (b + c); + + *weight = frame_weight; + + return ssimv; +} + +// Google version of SSIM +// SSIM +#define KERNEL 3 +#define KERNEL_SIZE (2 * KERNEL + 1) + +typedef unsigned char uint8; +typedef unsigned int uint32; + +static const int K[KERNEL_SIZE] = +{ + 1, 4, 11, 16, 11, 4, 1 // 16 * exp(-0.3 * i * i) +}; +static const double ki_w = 1. / 2304.; // 1 / sum(i:0..6, j..6) K[i]*K[j] +double get_ssimg(const uint8 *org, const uint8 *rec, + int xo, int yo, int W, int H, + const int stride1, const int stride2 + ) +{ + // TODO(skal): use summed tables + int y, x; + + const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL; + const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL; + const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL; + const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL; + // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1) + // with a diff of 255, squares. That would a max error of 0x8ee0900, + // which fits into 32 bits integers. + uint32 w = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; + org += ymin * stride1; + rec += ymin * stride2; + + for (y = ymin; y <= ymax; ++y, org += stride1, rec += stride2) + { + const int Wy = K[KERNEL + y - yo]; + + for (x = xmin; x <= xmax; ++x) + { + const int Wxy = Wy * K[KERNEL + x - xo]; + // TODO(skal): inlined assembly + w += Wxy; + xm += Wxy * org[x]; + ym += Wxy * rec[x]; + xxm += Wxy * org[x] * org[x]; + xym += Wxy * org[x] * rec[x]; + yym += Wxy * rec[x] * rec[x]; + } + } + + { + const double iw = 1. / w; + const double iwx = xm * iw; + const double iwy = ym * iw; + double sxx = xxm * iw - iwx * iwx; + double syy = yym * iw - iwy * iwy; + + // small errors are possible, due to rounding. Clamp to zero. + if (sxx < 0.) sxx = 0.; + + if (syy < 0.) syy = 0.; + + { + const double sxsy = sqrt(sxx * syy); + const double sxy = xym * iw - iwx * iwy; + static const double C11 = (0.01 * 0.01) * (255 * 255); + static const double C22 = (0.03 * 0.03) * (255 * 255); + static const double C33 = (0.015 * 0.015) * (255 * 255); + const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); + const double c = (2. 
* sxsy + C22) / (sxx + syy + C22); + + const double s = (sxy + C33) / (sxsy + C33); + return l * c * s; + + } + } + +} + +double get_ssimfull_kernelg(const uint8 *org, const uint8 *rec, + int xo, int yo, int W, int H, + const int stride1, const int stride2) +{ + // TODO(skal): use summed tables + // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1) + // with a diff of 255, squares. That would a max error of 0x8ee0900, + // which fits into 32 bits integers. + int y_, x_; + uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; + org += (yo - KERNEL) * stride1; + org += (xo - KERNEL); + rec += (yo - KERNEL) * stride2; + rec += (xo - KERNEL); + + for (y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride1, rec += stride2) + { + const int Wy = K[y_]; + + for (x_ = 0; x_ < KERNEL_SIZE; ++x_) + { + const int Wxy = Wy * K[x_]; + // TODO(skal): inlined assembly + const int org_x = org[x_]; + const int rec_x = rec[x_]; + xm += Wxy * org_x; + ym += Wxy * rec_x; + xxm += Wxy * org_x * org_x; + xym += Wxy * org_x * rec_x; + yym += Wxy * rec_x * rec_x; + } + } + + { + const double iw = ki_w; + const double iwx = xm * iw; + const double iwy = ym * iw; + double sxx = xxm * iw - iwx * iwx; + double syy = yym * iw - iwy * iwy; + + // small errors are possible, due to rounding. Clamp to zero. + if (sxx < 0.) sxx = 0.; + + if (syy < 0.) syy = 0.; + + { + const double sxsy = sqrt(sxx * syy); + const double sxy = xym * iw - iwx * iwy; + static const double C11 = (0.01 * 0.01) * (255 * 255); + static const double C22 = (0.03 * 0.03) * (255 * 255); + static const double C33 = (0.015 * 0.015) * (255 * 255); + const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); + const double c = (2. * sxsy + C22) / (sxx + syy + C22); + const double s = (sxy + C33) / (sxsy + C33); + return l * c * s; + } + } +} + +double calc_ssimg(const uint8 *org, const uint8 *rec, + const int image_width, const int image_height, + const int stride1, const int stride2 + ) +{ + int j, i; + double SSIM = 0.; + + for (j = 0; j < KERNEL; ++j) + { + for (i = 0; i < image_width; ++i) + { + SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); + } + } + + for (j = KERNEL; j < image_height - KERNEL; ++j) + { + for (i = 0; i < KERNEL; ++i) + { + SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); + } + + for (i = KERNEL; i < image_width - KERNEL; ++i) + { + SSIM += get_ssimfull_kernelg(org, rec, i, j, + image_width, image_height, stride1, stride2); + } + + for (i = image_width - KERNEL; i < image_width; ++i) + { + SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); + } + } + + for (j = image_height - KERNEL; j < image_height; ++j) + { + for (i = 0; i < image_width; ++i) + { + SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); + } + } + + return SSIM; +} + + +double vp8_calc_ssimg +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + double *ssim_y, + double *ssim_u, + double *ssim_v +) +{ + double ssim_all = 0; + int ysize = source->y_width * source->y_height; + int uvsize = ysize / 4; + + *ssim_y = calc_ssimg(source->y_buffer, dest->y_buffer, + source->y_width, source->y_height, + source->y_stride, dest->y_stride); + + + *ssim_u = calc_ssimg(source->u_buffer, dest->u_buffer, + source->uv_width, source->uv_height, + source->uv_stride, dest->uv_stride); + + + *ssim_v = calc_ssimg(source->v_buffer, dest->v_buffer, + source->uv_width, source->uv_height, + source->uv_stride, dest->uv_stride); + + ssim_all = 
(*ssim_y + *ssim_u + *ssim_v) / (ysize + uvsize + uvsize); + *ssim_y /= ysize; + *ssim_u /= uvsize; + *ssim_v /= uvsize; + return ssim_all; +} diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c new file mode 100644 index 000000000..33ddd64e7 --- /dev/null +++ b/vp8/encoder/tokenize.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include <math.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "onyx_int.h" +#include "tokenize.h" +#include "vpx_mem/vpx_mem.h" + +/* Global event counters used for accumulating statistics across several + compressions, then generating context.c = initial stats. */ + +#ifdef ENTROPY_STATS +_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]; +#endif +void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ; +void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x); + +TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2]; +TOKENEXTRA *vp8_dct_value_tokens_ptr; +int vp8_dct_value_cost[DCT_MAX_VALUE*2]; +int *vp8_dct_value_cost_ptr; +#if 0 +int skip_true_count = 0; +int skip_false_count = 0; +#endif +static void fill_value_tokens() +{ + + TOKENEXTRA *const t = vp8_dct_value_tokens + DCT_MAX_VALUE; + vp8_extra_bit_struct *const e = vp8_extra_bits; + + int i = -DCT_MAX_VALUE; + int sign = 1; + + do + { + if (!i) + sign = 0; + + { + const int a = sign ? -i : i; + int eb = sign; + + if (a > 4) + { + int j = 4; + + while (++j < 11 && e[j].base_val <= a) {} + + t[i].Token = --j; + eb |= (a - e[j].base_val) << 1; + } + else + t[i].Token = a; + + t[i].Extra = eb; + } + + // initialize the cost for extra bits for all possible coefficient value. 
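The cost filled in below is the bit cost of the sign and extra-bits suffix for each residual value. The unit is the one documented in treewriter.h: vp8_cost_bit() returns an approximate length in 256ths of a bit for coding one bool at a given 8-bit probability, read from the precomputed vp8_prob_cost table. A hedged sketch of that relation, for reference only:

    #include <math.h>

    /* Hedged sketch: approximate cost, in 256ths of a bit, of coding one bool
       with value 'bit' at probability prob/256. The encoder looks this up in
       vp8_prob_cost rather than computing a logarithm. */
    static int bool_cost_sketch(int bit, int prob /* 1..255 */)
    {
        double p = (bit ? 256 - prob : prob) / 256.0;

        return (int)(-log(p) / log(2.0) * 256.0 + 0.5);
    }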
+ { + int cost = 0; + vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token; + + if (p->base_val) + { + const int extra = t[i].Extra; + const int Length = p->Len; + + if (Length) + cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length); + + cost += vp8_cost_bit(vp8_prob_half, extra & 1); /* sign */ + vp8_dct_value_cost[i + DCT_MAX_VALUE] = cost; + } + + } + + } + while (++i < DCT_MAX_VALUE); + + vp8_dct_value_tokens_ptr = vp8_dct_value_tokens + DCT_MAX_VALUE; + vp8_dct_value_cost_ptr = vp8_dct_value_cost + DCT_MAX_VALUE; +} + +static void tokenize2nd_order_b +( + const BLOCKD *const b, + TOKENEXTRA **tp, + const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + const FRAME_TYPE frametype, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + VP8_COMP *cpi +) +{ + int pt; /* near block/prev token context index */ + int c = 0; /* start at DC */ + const int eob = b->eob; /* one beyond last nonzero coeff */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + int x; + const short *qcoeff_ptr = b->qcoeff; + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + do + { + const int band = vp8_coef_bands[c]; + + if (c < eob) + { + int rc = vp8_default_zig_zag1d[c]; + const int v = qcoeff_ptr[rc]; + + assert(-DCT_MAX_VALUE <= v && v < (DCT_MAX_VALUE)); + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + x = vp8_dct_value_tokens_ptr[v].Token; + } + else + x = DCT_EOB_TOKEN; + + t->Token = x; + t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt]; + + t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0); + + t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0)); + + ++cpi->coef_counts [type] [band] [pt] [x]; + } + while (pt = vp8_prev_token_class[x], ++t, c < eob && ++c < 16); + + *tp = t; + pt = (c != !type); /* 0 <-> all coeff data is zero */ + *a = *l = pt; + +} + +static void tokenize1st_order_b +( + const BLOCKD *const b, + TOKENEXTRA **tp, + const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + const FRAME_TYPE frametype, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + VP8_COMP *cpi +) +{ + int pt; /* near block/prev token context index */ + int c = type ? 
0 : 1; /* start at DC unless type 0 */ + const int eob = b->eob; /* one beyond last nonzero coeff */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + int x; + const short *qcoeff_ptr = b->qcoeff; + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + do + { + const int band = vp8_coef_bands[c]; + + x = DCT_EOB_TOKEN; + + if (c < eob) + { + int rc = vp8_default_zig_zag1d[c]; + const int v = qcoeff_ptr[rc]; + + assert(-DCT_MAX_VALUE <= v && v < (DCT_MAX_VALUE)); + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + x = vp8_dct_value_tokens_ptr[v].Token; + } + + t->Token = x; + t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt]; + + t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0); + t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0)); + + ++cpi->coef_counts [type] [band] [pt] [x]; + } + while (pt = vp8_prev_token_class[x], ++t, c < eob && ++c < 16); + + *tp = t; + pt = (c != !type); /* 0 <-> all coeff data is zero */ + *a = *l = pt; + +} +#if 0 +void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) +{ + //int i; + ENTROPY_CONTEXT **const A = x->above_context; + ENTROPY_CONTEXT(* const L)[4] = x->left_context; + int plane_type; + int b; + + TOKENEXTRA *start = *t; + TOKENEXTRA *tp = *t; + + x->mbmi.dc_diff = 1; + + vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts)); + + if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV) + { + plane_type = 3; + } + else + { + tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type, + A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi); + plane_type = 0; + + } + + for (b = 0; b < 16; b++) + tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + L[vp8_block2context[b]] + vp8_block2left[b], cpi); + + for (b = 16; b < 24; b++) + tokenize1st_order_b(x->block + b, t, 2, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + L[vp8_block2context[b]] + vp8_block2left[b], cpi); + + if (cpi->common.mb_no_coeff_skip) + { + x->mbmi.mb_skip_coeff = 1; + + while ((tp != *t) && x->mbmi.mb_skip_coeff) + { + x->mbmi.mb_skip_coeff = (x->mbmi.mb_skip_coeff && (tp->Token == DCT_EOB_TOKEN)); + tp ++; + } + + if (x->mbmi.mb_skip_coeff == 1) + { + x->mbmi.dc_diff = 0; + //redo the coutnts + vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts)); + + *t = start; + cpi->skip_true_count++; + + //skip_true_count++; + } + else + { + + cpi->skip_false_count++; + //skip_false_count++; + } + } +} +#else +void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) +{ + //int i; + ENTROPY_CONTEXT **const A = x->above_context; + ENTROPY_CONTEXT(* const L)[4] = x->left_context; + int plane_type; + int b; + + TOKENEXTRA *start = *t; + TOKENEXTRA *tp = *t; + + x->mbmi.dc_diff = 1; + +#if 0 + + if (x->mbmi.force_no_skip) + { + x->mbmi.mb_skip_coeff = 1; + //reset for next_mb. 
+ x->mbmi.force_no_skip = 0; + } + +#endif + +#if 1 + + if (x->mbmi.mb_skip_coeff) + { + + cpi->skip_true_count++; + + if (!cpi->common.mb_no_coeff_skip) + vp8_stuff_mb(cpi, x, t) ; + else + { + vp8_fix_contexts(cpi, x); + } + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + x->mbmi.dc_diff = 0; + else + x->mbmi.dc_diff = 1; + + + return; + } + + cpi->skip_false_count++; +#endif +#if 0 + + if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV) + { + int i, skip = 1; + + for (i = 0; i < 24; i++) + skip &= (!x->block[i].eob); + + if (skip != x->mbmi.mb_skip_coeff) + skip += 0; + + x->mbmi.mb_skip_coeff = skip; + } + else + { + int i, skip = 1; + + for (i = 0; i < 16; i++) + skip &= (x->block[i].eob < 2); + + for (i = 16; i < 25; i++) + skip &= (!x->block[i].eob); + + if (skip != x->mbmi.mb_skip_coeff) + skip += 0; + + x->mbmi.mb_skip_coeff = skip; + } + + vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts)); +#endif + + if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV) + { + plane_type = 3; + } + else + { + tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type, + A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi); + plane_type = 0; + + } + + for (b = 0; b < 16; b++) + tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + L[vp8_block2context[b]] + vp8_block2left[b], cpi); + + for (b = 16; b < 24; b++) + tokenize1st_order_b(x->block + b, t, 2, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + L[vp8_block2context[b]] + vp8_block2left[b], cpi); + +#if 0 + + if (cpi->common.mb_no_coeff_skip) + { + int skip = 1; + + while ((tp != *t) && skip) + { + skip = (skip && (tp->Token == DCT_EOB_TOKEN)); + tp ++; + } + + if (skip != x->mbmi.mb_skip_coeff) + skip += 0; + + x->mbmi.mb_skip_coeff = skip; + + if (x->mbmi.mb_skip_coeff == 1) + { + x->mbmi.dc_diff = 0; + //redo the coutnts + vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts)); + + *t = start; + cpi->skip_true_count++; + //skip_true_count++; + } + else + { + + cpi->skip_false_count++; + //skip_false_count++; + } + } + +#endif +} +#endif + +#ifdef ENTROPY_STATS + +void init_context_counters(void) +{ + vpx_memset(context_counters, 0, sizeof(context_counters)); +} + +void print_context_counters() +{ + + int type, band, pt, t; + + FILE *const f = fopen("context.c", "w"); + + fprintf(f, "#include \"entropy.h\"\n"); + + fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); + + fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];\n\n"); + + fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] = {"); + +# define Comma( X) (X? 
",":"") + + type = 0; + + do + { + fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); + + band = 0; + + do + { + fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); + + pt = 0; + + do + { + fprintf(f, "%s\n {", Comma(pt)); + + t = 0; + + do + { + const _int64 x = context_counters [type] [band] [pt] [t]; + const int y = (int) x; + + assert(x == (_int64) y); /* no overflow handling yet */ + fprintf(f, "%s %d", Comma(t), y); + + } + while (++t < vp8_coef_tokens); + + fprintf(f, "}"); + } + while (++pt < PREV_COEF_CONTEXTS); + + fprintf(f, "\n }"); + + } + while (++band < COEF_BANDS); + + fprintf(f, "\n }"); + } + while (++type < BLOCK_TYPES); + + fprintf(f, "\n};\n"); + fclose(f); +} +#endif + + +void vp8_tokenize_initialize() +{ + fill_value_tokens(); +} + + +static __inline void stuff2nd_order_b +( + const BLOCKD *const b, + TOKENEXTRA **tp, + const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + const FRAME_TYPE frametype, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + VP8_COMP *cpi +) +{ + int pt; /* near block/prev token context index */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + (void) frametype; + (void) type; + (void) b; + + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt]; + t->section = 11; + t->skip_eob_node = 0; + ++cpi->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN]; + ++t; + + *tp = t; + pt = 0; + *a = *l = pt; + +} + +static __inline void stuff1st_order_b +( + const BLOCKD *const b, + TOKENEXTRA **tp, + const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + const FRAME_TYPE frametype, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + VP8_COMP *cpi +) +{ + int pt; /* near block/prev token context index */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + (void) frametype; + (void) type; + (void) b; + + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt]; + t->section = 8; + t->skip_eob_node = 0; + ++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN]; + ++t; + *tp = t; + pt = 0; /* 0 <-> all coeff data is zero */ + *a = *l = pt; + +} +static __inline +void stuff1st_order_buv +( + const BLOCKD *const b, + TOKENEXTRA **tp, + const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + const FRAME_TYPE frametype, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + VP8_COMP *cpi +) +{ + int pt; /* near block/prev token context index */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + (void) frametype; + (void) type; + (void) b; + + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt]; + t->section = 13; + t->skip_eob_node = 0; + ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN]; + ++t; + *tp = t; + pt = 0; /* 0 <-> all coeff data is zero */ + *a = *l = pt; + +} + +void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) +{ + //int i; + ENTROPY_CONTEXT **const A = x->above_context; + ENTROPY_CONTEXT(* const L)[4] = x->left_context; + int plane_type; + int b; + + stuff2nd_order_b(x->block + 24, t, 1, x->frame_type, + A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi); + plane_type = 0; + + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + x->mbmi.dc_diff = 0; + else + x->mbmi.dc_diff = 1; + + + for (b = 0; b < 16; b++) + stuff1st_order_b(x->block + b, t, plane_type, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + 
L[vp8_block2context[b]] + vp8_block2left[b], cpi); + + for (b = 16; b < 24; b++) + stuff1st_order_buv(x->block + b, t, 2, x->frame_type, + A[vp8_block2context[b]] + vp8_block2above[b], + L[vp8_block2context[b]] + vp8_block2left[b], cpi); + +} +void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x) +{ + x->left_context[Y1CONTEXT][0] = 0; + x->left_context[Y1CONTEXT][1] = 0; + x->left_context[Y1CONTEXT][2] = 0; + x->left_context[Y1CONTEXT][3] = 0; + x->left_context[UCONTEXT][0] = 0; + x->left_context[VCONTEXT][0] = 0; + x->left_context[UCONTEXT][1] = 0; + x->left_context[VCONTEXT][1] = 0; + + x->above_context[Y1CONTEXT][0] = 0; + x->above_context[Y1CONTEXT][1] = 0; + x->above_context[Y1CONTEXT][2] = 0; + x->above_context[Y1CONTEXT][3] = 0; + x->above_context[UCONTEXT][0] = 0; + x->above_context[VCONTEXT][0] = 0; + x->above_context[UCONTEXT][1] = 0; + x->above_context[VCONTEXT][1] = 0; + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + { + x->left_context[Y2CONTEXT][0] = 0; + x->above_context[Y2CONTEXT][0] = 0; + } +} diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h new file mode 100644 index 000000000..02aacc222 --- /dev/null +++ b/vp8/encoder/tokenize.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef tokenize_h +#define tokenize_h + +#include "entropy.h" +#include "block.h" + +void vp8_tokenize_initialize(); + +typedef struct +{ + int Token; + int Extra; + const vp8_prob *context_tree; + int skip_eob_node; + int section; +} TOKENEXTRA; + +int rd_cost_mby(MACROBLOCKD *); + +#ifdef ENTROPY_STATS +void init_context_counters(); +void print_context_counters(); + +extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]; +#endif + + +#endif /* tokenize_h */ diff --git a/vp8/encoder/treewriter.c b/vp8/encoder/treewriter.c new file mode 100644 index 000000000..e398044db --- /dev/null +++ b/vp8/encoder/treewriter.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "treewriter.h" + +static void cost( + int *const C, + vp8_tree T, + const vp8_prob *const P, + int i, + int c +) +{ + const vp8_prob p = P [i>>1]; + + do + { + const vp8_tree_index j = T[i]; + const int d = c + vp8_cost_bit(p, i & 1); + + if (j <= 0) + C[-j] = d; + else + cost(C, T, P, j, d); + } + while (++i & 1); +} +void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t) +{ + cost(c, t, p, 0, 0); +} diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h new file mode 100644 index 000000000..05ac74cb7 --- /dev/null +++ b/vp8/encoder/treewriter.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
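The recursive cost() helper in treewriter.c above walks a token tree depth-first: an index that is zero or negative marks a leaf, and its negation selects the entry of the output array that receives the accumulated path cost. A minimal sketch of how vp8_cost_tokens() could be exercised, assuming the treecoder.h typedefs allow a tree to be declared this way; the three-symbol tree, probabilities and names below are illustrative only, not taken from the codec:

    #include "treewriter.h"

    /* Hypothetical tree: node 0 separates symbol 0 from an inner node at
     * index 2, which separates symbols 1 and 2; leaves are stored negated. */
    static const vp8_tree_index toy_tree[4] = { 0, 2, -1, -2 };
    static const vp8_prob toy_probs[2] = { 200, 120 };

    static void toy_token_costs(void)
    {
        int costs[3];

        vp8_cost_tokens(costs, toy_probs, toy_tree);

        /* costs[0] == vp8_cost_zero(200)
         * costs[1] == vp8_cost_one(200) + vp8_cost_zero(120)
         * costs[2] == vp8_cost_one(200) + vp8_cost_one(120)
         * all expressed in 256ths of a bit, per treewriter.h below. */
    }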
+ */ + + +#ifndef __INC_TREEWRITER_H +#define __INC_TREEWRITER_H + +/* Trees map alphabets into huffman-like codes suitable for an arithmetic + bit coder. Timothy S Murphy 11 October 2004 */ + +#include "treecoder.h" + +#include "boolhuff.h" /* for now */ + +typedef BOOL_CODER vp8_writer; + +#define vp8_write vp8_encode_bool +#define vp8_write_literal vp8_encode_value +#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half) + +#define vp8bc_write vp8bc_write_bool +#define vp8bc_write_literal vp8bc_write_bits +#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1) + + +/* Approximate length of an encoded bool in 256ths of a bit at given prob */ + +#define vp8_cost_zero( x) ( vp8_prob_cost[x]) +#define vp8_cost_one( x) vp8_cost_zero( vp8_complement(x)) + +#define vp8_cost_bit( x, b) vp8_cost_zero( (b)? vp8_complement(x) : (x) ) + +/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */ + + +/* Both of these return bits, not scaled bits. */ + +static __inline unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p) +{ + /* Imitate existing calculation */ + + return ((ct[0] * vp8_cost_zero(p)) + + (ct[1] * vp8_cost_one(p))) >> 8; +} + +/* Small functions to write explicit values and tokens, as well as + estimate their lengths. */ + +static __inline void vp8_treed_write +( + vp8_writer *const w, + vp8_tree t, + const vp8_prob *const p, + int v, + int n /* number of bits in v, assumed nonzero */ +) +{ + vp8_tree_index i = 0; + + do + { + const int b = (v >> --n) & 1; + vp8_write(w, b, p[i>>1]); + i = t[i+b]; + } + while (n); +} +static __inline void vp8_write_token +( + vp8_writer *const w, + vp8_tree t, + const vp8_prob *const p, + vp8_token *const x +) +{ + vp8_treed_write(w, t, p, x->value, x->Len); +} + +static __inline int vp8_treed_cost( + vp8_tree t, + const vp8_prob *const p, + int v, + int n /* number of bits in v, assumed nonzero */ +) +{ + int c = 0; + vp8_tree_index i = 0; + + do + { + const int b = (v >> --n) & 1; + c += vp8_cost_bit(p[i>>1], b); + i = t[i+b]; + } + while (n); + + return c; +} +static __inline int vp8_cost_token +( + vp8_tree t, + const vp8_prob *const p, + vp8_token *const x +) +{ + return vp8_treed_cost(t, p, x->value, x->Len); +} + +/* Fill array of costs for all possible token values. */ + +void vp8_cost_tokens( + int *Costs, const vp8_prob *, vp8_tree +); + +#endif diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h new file mode 100644 index 000000000..b3b55c319 --- /dev/null +++ b/vp8/encoder/variance.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
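The cost macros just above work in 256ths of a bit; vp8_prob_cost is defined elsewhere, but the stated convention implies vp8_cost_zero(128) is exactly 256, i.e. one whole bit for a 50/50 boolean. vp8_cost_branch() then converts weighted counts back to whole bits with its final >> 8. A small hedged sketch; the counts, probability and helper name are illustrative:

    #include "treewriter.h"

    /* Estimated bits for a boolean coded 300 times as zero and 100 times as
     * one, with a probability of 192/256 for the zero branch. */
    static unsigned int toy_branch_bits(void)
    {
        const unsigned int ct[2] = { 300, 100 };
        const vp8_prob p = 192;

        /* ((300 * vp8_cost_zero(192)) + (100 * vp8_cost_one(192))) >> 8 */
        return vp8_cost_branch(ct, p);
    }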
+ */ + + +#ifndef VARIANCE_H +#define VARIANCE_H + +#define prototype_sad(sym)\ + unsigned int (sym)\ + (\ + unsigned char *src_ptr, \ + int source_stride, \ + unsigned char *ref_ptr, \ + int ref_stride, \ + int max_sad\ + ) + +#define prototype_sad_multi_same_address(sym)\ + void (sym)\ + (\ + unsigned char *src_ptr, \ + int source_stride, \ + unsigned char *ref_ptr, \ + int ref_stride, \ + unsigned int *sad_array\ + ) + +#define prototype_sad_multi_dif_address(sym)\ + void (sym)\ + (\ + unsigned char *src_ptr, \ + int source_stride, \ + unsigned char *ref_ptr[4], \ + int ref_stride, \ + unsigned int *sad_array\ + ) + +#define prototype_variance(sym) \ + unsigned int (sym) \ + (\ + unsigned char *src_ptr, \ + int source_stride, \ + unsigned char *ref_ptr, \ + int ref_stride, \ + unsigned int *sse\ + ) + +#define prototype_variance2(sym) \ + unsigned int (sym) \ + (\ + unsigned char *src_ptr, \ + int source_stride, \ + unsigned char *ref_ptr, \ + int ref_stride, \ + unsigned int *sse,\ + int *sum\ + ) + +#define prototype_subpixvariance(sym) \ + unsigned int (sym) \ + ( \ + unsigned char *src_ptr, \ + int source_stride, \ + int xoffset, \ + int yoffset, \ + unsigned char *ref_ptr, \ + int Refstride, \ + unsigned int *sse \ + ); + + +#define prototype_getmbss(sym) unsigned int (sym)(short *) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/variance_x86.h" +#endif + +#if ARCH_ARM +#include "arm/variance_arm.h" +#endif + +#ifndef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_c +#endif +extern prototype_sad(vp8_variance_sad4x4); + +#ifndef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_c +#endif +extern prototype_sad(vp8_variance_sad8x8); + +#ifndef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_c +#endif +extern prototype_sad(vp8_variance_sad8x16); + +#ifndef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_c +#endif +extern prototype_sad(vp8_variance_sad16x8); + +#ifndef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_c +#endif +extern prototype_sad(vp8_variance_sad16x16); + +//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +#ifndef vp8_variance_sad16x16x3 +#define vp8_variance_sad16x16x3 vp8_sad16x16x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad16x16x3); + +#ifndef vp8_variance_sad16x8x3 +#define vp8_variance_sad16x8x3 vp8_sad16x8x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad16x8x3); + +#ifndef vp8_variance_sad8x8x3 +#define vp8_variance_sad8x8x3 vp8_sad8x8x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad8x8x3); + +#ifndef vp8_variance_sad8x16x3 +#define vp8_variance_sad8x16x3 vp8_sad8x16x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3); + +#ifndef vp8_variance_sad4x4x3 +#define vp8_variance_sad4x4x3 vp8_sad4x4x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3); + +//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +#ifndef vp8_variance_sad16x16x4d +#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c +#endif +extern prototype_sad_multi_dif_address(vp8_variance_sad16x16x4d); + +#ifndef vp8_variance_sad16x8x4d +#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_c +#endif +extern prototype_sad_multi_dif_address(vp8_variance_sad16x8x4d); + +#ifndef vp8_variance_sad8x8x4d +#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_c +#endif +extern prototype_sad_multi_dif_address(vp8_variance_sad8x8x4d); + +#ifndef vp8_variance_sad8x16x4d +#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_c +#endif +extern 
prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d); + +#ifndef vp8_variance_sad4x4x4d +#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_c +#endif +extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d); + +//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +#ifndef vp8_variance_var4x4 +#define vp8_variance_var4x4 vp8_variance4x4_c +#endif +extern prototype_variance(vp8_variance_var4x4); + +#ifndef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_c +#endif +extern prototype_variance(vp8_variance_var8x8); + +#ifndef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_c +#endif +extern prototype_variance(vp8_variance_var8x16); + +#ifndef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_c +#endif +extern prototype_variance(vp8_variance_var16x8); + +#ifndef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_c +#endif +extern prototype_variance(vp8_variance_var16x16); + +//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +#ifndef vp8_variance_subpixvar4x4 +#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar4x4); + +#ifndef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar8x8); + +#ifndef vp8_variance_subpixvar8x16 +#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar8x16); + +#ifndef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar16x8); + +#ifndef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar16x16); + +#ifndef vp8_variance_subpixmse16x16 +#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixmse16x16); + +//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +#ifndef vp8_variance_getmbss +#define vp8_variance_getmbss vp8_get_mb_ss_c +#endif +extern prototype_getmbss(vp8_variance_getmbss); + +#ifndef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_c +#endif +extern prototype_variance(vp8_variance_mse16x16); + +#ifndef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c +#endif +extern prototype_sad(vp8_variance_get16x16prederror); + +#ifndef vp8_variance_get8x8var +#define vp8_variance_get8x8var vp8_get8x8var_c +#endif +extern prototype_variance2(vp8_variance_get8x8var); + +#ifndef vp8_variance_get16x16var +#define vp8_variance_get16x16var vp8_get16x16var_c +#endif +extern prototype_variance2(vp8_variance_get16x16var); + +#ifndef vp8_variance_get4x4sse_cs +#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c +#endif +extern prototype_sad(vp8_variance_get4x4sse_cs); + + +typedef prototype_sad(*vp8_sad_fn_t); +typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); +typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t); +typedef prototype_variance(*vp8_variance_fn_t); +typedef prototype_variance2(*vp8_variance2_fn_t); +typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); +typedef prototype_getmbss(*vp8_getmbss_fn_t); +typedef struct +{ + vp8_sad_fn_t sad4x4; + vp8_sad_fn_t sad8x8; + vp8_sad_fn_t sad8x16; + vp8_sad_fn_t sad16x8; + vp8_sad_fn_t sad16x16; + + vp8_variance_fn_t var4x4; + 
vp8_variance_fn_t var8x8; + vp8_variance_fn_t var8x16; + vp8_variance_fn_t var16x8; + vp8_variance_fn_t var16x16; + + vp8_subpixvariance_fn_t subpixvar4x4; + vp8_subpixvariance_fn_t subpixvar8x8; + vp8_subpixvariance_fn_t subpixvar8x16; + vp8_subpixvariance_fn_t subpixvar16x8; + vp8_subpixvariance_fn_t subpixvar16x16; + vp8_subpixvariance_fn_t subpixmse16x16; + + vp8_getmbss_fn_t getmbss; + vp8_variance_fn_t mse16x16; + + vp8_sad_fn_t get16x16prederror; + vp8_variance2_fn_t get8x8var; + vp8_variance2_fn_t get16x16var; + vp8_sad_fn_t get4x4sse_cs; + + vp8_sad_multi_fn_t sad16x16x3; + vp8_sad_multi_fn_t sad16x8x3; + vp8_sad_multi_fn_t sad8x16x3; + vp8_sad_multi_fn_t sad8x8x3; + vp8_sad_multi_fn_t sad4x4x3; + + vp8_sad_multi_d_fn_t sad16x16x4d; + vp8_sad_multi_d_fn_t sad16x8x4d; + vp8_sad_multi_d_fn_t sad8x16x4d; + vp8_sad_multi_d_fn_t sad8x8x4d; + vp8_sad_multi_d_fn_t sad4x4x4d; + +} vp8_variance_rtcd_vtable_t; + +typedef struct +{ + vp8_sad_fn_t sdf; + vp8_sad_multi_fn_t sdx3f; + vp8_sad_multi_d_fn_t sdx4df; + vp8_variance_fn_t vf; + vp8_subpixvariance_fn_t svf; +} vp8_variance_fn_ptr_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn +#else +#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn +#endif + +/* TODO: Determine if this USEBILINEAR flag is necessary. */ +#define USEBILINEAR + +#endif diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c new file mode 100644 index 000000000..85269b9d3 --- /dev/null +++ b/vp8/encoder/variance_c.c @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
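variance.h wires every operation up twice: the #ifndef/#define pairs above pick a compile-time default (the plain _c routine unless an arch header overrides it), while vp8_variance_rtcd_vtable_t carries the same set as function pointers for runtime CPU detection, and VARIANCE_INVOKE hides the difference from callers. A hedged usage sketch; where the vtable actually lives inside the encoder is an assumption here, and INT_MAX as the max_sad sentinel is likewise only illustrative:

    #include <limits.h>
    #include "variance.h"

    /* With CONFIG_RUNTIME_CPU_DETECT this dereferences the table; without it
     * the macro expands to vp8_variance_sad16x16, i.e. a direct call. */
    static unsigned int toy_sad16x16(const vp8_variance_rtcd_vtable_t *rtcd,
                                     unsigned char *src, int src_stride,
                                     unsigned char *ref, int ref_stride)
    {
        return VARIANCE_INVOKE(rtcd, sad16x16)(src, src_stride,
                                               ref, ref_stride, INT_MAX);
    }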
+ */ + + +#include "variance.h" + +const int vp8_six_tap[8][6] = +{ + { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter + { 0, -1, 12, 123, -6, 0 } +}; + + +#ifdef USEBILINEAR +const int VP8_FILTER_WEIGHT = 128; +const int VP8_FILTER_SHIFT = 7; +const int vp8_bilinear_taps[8][2] = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + +unsigned int vp8_get_mb_ss_c +( + short *src_ptr +) +{ + unsigned int i = 0, sum = 0; + + do + { + sum += (src_ptr[i] * src_ptr[i]); + i++; + } + while (i < 256); + + return sum; +} + + +void vp8_variance( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum) +{ + int i, j; + int diff; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) + { + for (j = 0; j < w; j++) + { + diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } +} + +unsigned int +vp8_get8x8var_c +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +) +{ + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum); + return (*SSE - (((*Sum) * (*Sum)) >> 6)); +} + +unsigned int +vp8_get16x16var_c +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +) +{ + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum); + return (*SSE - (((*Sum) * (*Sum)) >> 8)); + +} + + + +unsigned int vp8_variance16x16_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 8)); +} + +unsigned int vp8_variance8x16_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 7)); +} + +unsigned int vp8_variance16x8_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 7)); +} + + +unsigned int vp8_variance8x8_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 6)); +} + +unsigned int vp8_variance4x4_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 4)); +} + + +unsigned int 
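Each vp8_varianceWxH_c wrapper above applies the usual identity Var = SSE - Sum^2 / N, with the division folded into a shift because N is a power of two: >> 8 for 16x16, >> 7 for 16x8 and 8x16, >> 6 for 8x8, >> 4 for 4x4. One consequence, shown as a small hedged check (the helper name and test values are illustrative): a constant brightness offset between source and reference yields zero variance even though the SSE is not zero.

    #include <assert.h>
    #include <string.h>
    #include "variance.h"

    extern prototype_variance(vp8_variance4x4_c);

    static void toy_variance_check(void)
    {
        unsigned char src[16], ref[16];
        unsigned int sse;

        memset(src, 100, sizeof(src));
        memset(ref, 101, sizeof(ref));   /* every diff is -1 */

        /* sum = -16, sse = 16, so 16 - ((-16 * -16) >> 4) == 0 */
        assert(vp8_variance4x4_c(src, 4, ref, 4, &sse) == 0);
        assert(sse == 16);
    }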
vp8_mse16x16_c( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); + *sse = var; + return var; +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : INT32 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). + * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +void vp8e_filter_block2d_bil_first_pass +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int *vp8_filter +) +{ + unsigned int i, j; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + // Apply bilinear filter + output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : INT32 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). + * It defines the offset required to move from one input + * to the next. 
+ * + ****************************************************************************/ +void vp8e_filter_block2d_bil_second_pass +( + unsigned short *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + // Apply filter + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * INT32 *HFilter : Array of 2 horizontal filter taps. + * INT32 *VFilter : Array of 2 vertical filter taps. + * + * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an 8x8 input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The intermediate horizontally filtered block must produce + * 1 more point than the input block in each column. This + * is to ensure that the 2-tap filter has one extra data-point + * at the top of each column so filter taps do not extend + * beyond data. Thus the output of the first stage filter + * is an 8x9 (hx_v) block. + * + ****************************************************************************/ +void vp8e_filter_block2d_bil +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int *HFilter, + int *VFilter +) +{ + + unsigned short FData[20*16]; // Temp data bufffer used in filtering + + // First filter 1-D horizontally... + vp8e_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, 9, 8, HFilter); + + // then 1-D vertically... 
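(The FData intermediate here is 9 rows by 8 columns: the horizontal pass writes one extra row so the vertical 2-tap filter can read src_ptr[pixel_step] on the last output row, exactly as the SPECIAL NOTES above describe, and the second pass is called with pixel_step equal to the intermediate width of 8.) The per-sample arithmetic both passes share is small enough to spell out; a hedged sketch, assuming it sits in this file below the tap table, with an illustrative helper name:

    /* One output sample of the 2-tap bilinear filter. */
    static int toy_bilinear_tap(int a, int b, const int *taps)
    {
        return (a * taps[0] + b * taps[1] + (VP8_FILTER_WEIGHT / 2))
               >> VP8_FILTER_SHIFT;
    }

    /* toy_bilinear_tap(10, 20, vp8_bilinear_taps[4]) == 15  half-pel: average */
    /* toy_bilinear_tap(10, 20, vp8_bilinear_taps[0]) == 10  zero offset: copy */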
+ vp8e_filter_block2d_bil_second_pass(FData, output_ptr, 8, 8, 8, 8, VFilter); +} + + + +unsigned int vp8_sub_pixel_variance4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned char temp2[20*16]; + const int *HFilter, *VFilter; + unsigned short FData3[5*4]; // Temp data bufffer used in filtering + + HFilter = vp8_bilinear_taps[xoffset]; + VFilter = vp8_bilinear_taps[yoffset]; + + // First filter 1d Horizontal + vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); + + // Now filter Verticaly + vp8e_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); + + return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); +} + + +unsigned int vp8_sub_pixel_variance8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[9*8]; // Temp data bufffer used in filtering + unsigned char temp2[20*16]; + const int *HFilter, *VFilter; + + HFilter = vp8_bilinear_taps[xoffset]; + VFilter = vp8_bilinear_taps[yoffset]; + + vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); + vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); + + return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); +} + +unsigned int vp8_sub_pixel_variance16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[17*16]; // Temp data bufffer used in filtering + unsigned char temp2[20*16]; + const int *HFilter, *VFilter; + + HFilter = vp8_bilinear_taps[xoffset]; + VFilter = vp8_bilinear_taps[yoffset]; + + vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); + vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); + + return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); +} + +unsigned int vp8_sub_pixel_mse16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} + +unsigned int vp8_sub_pixel_variance16x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[16*9]; // Temp data bufffer used in filtering + unsigned char temp2[20*16]; + const int *HFilter, *VFilter; + + HFilter = vp8_bilinear_taps[xoffset]; + VFilter = vp8_bilinear_taps[yoffset]; + + vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); + vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); + + return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); +} + +unsigned int vp8_sub_pixel_variance8x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[9*16]; // Temp data bufffer used in filtering + unsigned char temp2[20*16]; + const int *HFilter, *VFilter; + + + HFilter = 
vp8_bilinear_taps[xoffset]; + VFilter = vp8_bilinear_taps[yoffset]; + + + vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); + vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); + + return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); +} +#endif diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c new file mode 100644 index 000000000..186ee6856 --- /dev/null +++ b/vp8/encoder/x86/csystemdependent.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "variance.h" +#include "onyx_int.h" + +SADFunction *vp8_sad16x16; +SADFunction *vp8_sad16x8; +SADFunction *vp8_sad8x16; +SADFunction *vp8_sad8x8; +SADFunction *vp8_sad4x4; + +variance_function *vp8_variance4x4; +variance_function *vp8_variance8x8; +variance_function *vp8_variance8x16; +variance_function *vp8_variance16x8; +variance_function *vp8_variance16x16; + + +variance_function *vp8_mse16x16; + +sub_pixel_variance_function *vp8_sub_pixel_variance4x4; +sub_pixel_variance_function *vp8_sub_pixel_variance8x8; +sub_pixel_variance_function *vp8_sub_pixel_variance8x16; +sub_pixel_variance_function *vp8_sub_pixel_variance16x8; +sub_pixel_variance_function *vp8_sub_pixel_variance16x16; + +int (*vp8_block_error)(short *, short *); +int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc); +void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); + +extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride); +extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride); + +extern int vp8_block_error_c(short *, short *); +extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc); + +extern int vp8_block_error_mmx(short *, short *); +extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc); + +extern int vp8_block_error_xmm(short *, short *); +extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc); + + + +int (*vp8_mbuverror)(MACROBLOCK *mb); +unsigned int (*vp8_get_mb_ss)(short *); +void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); + +void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); +void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); +unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// c imports +extern int vp8_mbuverror_c(MACROBLOCK *mb); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int 
source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch); + + +extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); +extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); + +extern SADFunction vp8_sad16x16_c; +extern SADFunction vp8_sad16x8_c; +extern SADFunction vp8_sad8x16_c; +extern SADFunction vp8_sad8x8_c; +extern SADFunction vp8_sad4x4_c; + +extern SADFunction vp8_sad16x16_wmt; +extern SADFunction vp8_sad16x8_wmt; +extern SADFunction vp8_sad8x16_wmt; +extern SADFunction vp8_sad8x8_wmt; +extern SADFunction vp8_sad4x4_wmt; + +extern SADFunction vp8_sad16x16_mmx; +extern SADFunction vp8_sad16x8_mmx; +extern SADFunction vp8_sad8x16_mmx; +extern SADFunction vp8_sad8x8_mmx; +extern SADFunction vp8_sad4x4_mmx; + +extern variance_function vp8_variance16x16_c; +extern variance_function vp8_variance8x16_c; +extern variance_function vp8_variance16x8_c; +extern variance_function vp8_variance8x8_c; +extern variance_function vp8_variance4x4_c; +extern variance_function vp8_mse16x16_c; + +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c; + +extern unsigned int vp8_get_mb_ss_c(short *); +extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// mmx imports +extern int vp8_mbuverror_mmx(MACROBLOCK *mb); +extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d); +extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch); +extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); +extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); +extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch); +extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch); +extern variance_function vp8_variance4x4_mmx; +extern variance_function vp8_variance8x8_mmx; +extern variance_function vp8_variance8x16_mmx; +extern variance_function vp8_variance16x8_mmx; +extern variance_function vp8_variance16x16_mmx; + +extern variance_function vp8_mse16x16_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx; +extern sub_pixel_variance_function 
vp8_sub_pixel_variance16x8_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx; + +extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get_mb_ss_mmx(short *); +extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + + +// wmt imports +extern int vp8_mbuverror_xmm(MACROBLOCK *mb); +extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d); +extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch); +extern variance_function vp8_variance4x4_wmt; +extern variance_function vp8_variance8x8_wmt; +extern variance_function vp8_variance8x16_wmt; +extern variance_function vp8_variance16x8_wmt; +extern variance_function vp8_variance16x16_wmt; + +extern variance_function vp8_mse16x16_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt; +extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr); +extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); + +extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); + +void vp8_cmachine_specific_config(void) +{ + int mmx_enabled; + int xmm_enabled; + int wmt_enabled; + + vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); + + if (wmt_enabled) // Willamette + { + // Willamette instruction set available: + vp8_mbuverror = vp8_mbuverror_xmm; + vp8_fast_quantize_b = vp8_fast_quantize_b_sse; + vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt; + vp8_subtract_b = vp8_subtract_b_mmx; + vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; + vp8_variance4x4 = vp8_variance4x4_mmx; + vp8_variance8x8 = vp8_variance8x8_mmx; + vp8_variance8x16 = vp8_variance8x16_wmt; + vp8_variance16x8 = vp8_variance16x8_wmt; + vp8_variance16x16 = vp8_variance16x16_wmt; + vp8_mse16x16 = vp8_mse16x16_wmt; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; + vp8_get_mb_ss = vp8_get_mb_ss_sse2; + vp8_get16x16pred_error = vp8_get16x16pred_error_sse2; + vp8_get8x8var = vp8_get8x8var_sse2; + vp8_get16x16var = vp8_get16x16var_sse2; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; + vp8_sad16x16 = vp8_sad16x16_wmt; + 
vp8_sad16x8 = vp8_sad16x8_wmt; + vp8_sad8x16 = vp8_sad8x16_wmt; + vp8_sad8x8 = vp8_sad8x8_wmt; + vp8_sad4x4 = vp8_sad4x4_wmt; + vp8_block_error = vp8_block_error_xmm; + vp8_mbblock_error = vp8_mbblock_error_xmm; + vp8_subtract_mby = vp8_subtract_mby_mmx; + + } + else if (mmx_enabled) + { + // MMX instruction set available: + vp8_mbuverror = vp8_mbuverror_mmx; + vp8_fast_quantize_b = vp8_fast_quantize_b_mmx; + vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx; + vp8_subtract_b = vp8_subtract_b_mmx; + vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; + vp8_variance4x4 = vp8_variance4x4_mmx; + vp8_variance8x8 = vp8_variance8x8_mmx; + vp8_variance8x16 = vp8_variance8x16_mmx; + vp8_variance16x8 = vp8_variance16x8_mmx; + vp8_variance16x16 = vp8_variance16x16_mmx; + vp8_mse16x16 = vp8_mse16x16_mmx; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; + vp8_get_mb_ss = vp8_get_mb_ss_mmx; + vp8_get16x16pred_error = vp8_get16x16pred_error_mmx; + vp8_get8x8var = vp8_get8x8var_mmx; + vp8_get16x16var = vp8_get16x16var_mmx; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; + vp8_sad16x16 = vp8_sad16x16_mmx; + vp8_sad16x8 = vp8_sad16x8_mmx; + vp8_sad8x16 = vp8_sad8x16_mmx; + vp8_sad8x8 = vp8_sad8x8_mmx; + vp8_sad4x4 = vp8_sad4x4_mmx; + vp8_block_error = vp8_block_error_mmx; + vp8_mbblock_error = vp8_mbblock_error_mmx; + vp8_subtract_mby = vp8_subtract_mby_mmx; + + } + else + { + // Pure C: + vp8_mbuverror = vp8_mbuverror_c; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; + vp8_subtract_b = vp8_subtract_b_c; + vp8_subtract_mbuv = vp8_subtract_mbuv_c; + vp8_variance4x4 = vp8_variance4x4_c; + vp8_variance8x8 = vp8_variance8x8_c; + vp8_variance8x16 = vp8_variance8x16_c; + vp8_variance16x8 = vp8_variance16x8_c; + vp8_variance16x16 = vp8_variance16x16_c; + vp8_mse16x16 = vp8_mse16x16_c; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; + vp8_get_mb_ss = vp8_get_mb_ss_c; + vp8_get16x16pred_error = vp8_get16x16pred_error_c; + vp8_get8x8var = vp8_get8x8var_c; + vp8_get16x16var = vp8_get16x16var_c; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; + vp8_sad16x16 = vp8_sad16x16_c; + vp8_sad16x8 = vp8_sad16x8_c; + vp8_sad8x16 = vp8_sad8x16_c; + vp8_sad8x8 = vp8_sad8x8_c; + vp8_sad4x4 = vp8_sad4x4_c; + vp8_block_error = vp8_block_error_c; + vp8_mbblock_error = vp8_mbblock_error_c; + vp8_subtract_mby = vp8_subtract_mby_c; + } + +} diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm new file mode 100644 index 000000000..e13423796 --- /dev/null +++ b/vp8/encoder/x86/dct_mmx.asm @@ -0,0 +1,846 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
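vp8_cmachine_specific_config() above resolves every one of these pointers exactly once, preferring the Willamette (SSE2) set, then MMX, then the plain C fallbacks; note that the SSE2 branch still reuses several MMX routines where no wider version exists (the fdct, subtract and get4x4sse_cs entries, for example). A hedged sketch of the intended call pattern, as if appended to this file; the lazy-init guard and the INT_MAX sentinel for max_sad are assumptions, not something this file prescribes:

    #include <limits.h>

    static unsigned int toy_best_sad16x16(unsigned char *src, int src_stride,
                                          unsigned char *ref, int ref_stride)
    {
        static int configured;

        if (!configured)
        {
            vp8_cmachine_specific_config();   /* fill in the pointers above */
            configured = 1;
        }

        /* From here on the generic name dispatches to wmt/mmx/c as chosen. */
        return vp8_sad16x16(src, src_stride, ref, ref_stride, INT_MAX);
    }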
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text + global sym(vp8_short_fdct4x4_mmx) + global sym(vp8_fast_fdct4x4_mmx) + global sym(vp8_fast_fdct8x4_wmt) + + +%define DCTCONSTANTSBITS (16) +%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) +%define x_c1 (60547) ; cos(pi /8) * (1<<15) +%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) +%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) + + +%define _1STSTAGESHIFT 14 +%define _2NDSTAGESHIFT 16 + +; using matrix multiply with source and destbuffer has a pitch +;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) +sym(vp8_short_fdct4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + movsxd rax, dword ptr arg(2) ;pitch + lea rdx, [dct_matrix GLOBAL] + + movq mm0, [rsi ] + movq mm1, [rsi + rax] + + movq mm2, [rsi + rax*2] + lea rsi, [rsi + rax*2] + + movq mm3, [rsi + rax] + + ; first column + movq mm4, mm0 + movq mm7, [rdx] + + pmaddwd mm4, mm7 + movq mm5, mm1 + + pmaddwd mm5, mm7 + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + + pmaddwd mm5, mm7 + movq mm6, mm3 + + pmaddwd mm6, mm7 + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi], mm4 + + ;second column + movq mm4, mm0 + + pmaddwd mm4, [rdx+8] + movq mm5, mm1 + + pmaddwd mm5, [rdx+8] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+8] + movq mm6, mm3 + + pmaddwd mm6, [rdx+8] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+8], mm4 + + + ;third column + movq mm4, mm0 + + pmaddwd mm4, [rdx+16] + movq mm5, mm1 + + pmaddwd mm5, [rdx+16] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+16] + movq mm6, mm3 + + pmaddwd mm6, [rdx+16] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+16], mm4 + + ;fourth column (this is the last column, so we do not have save the source any more) + + pmaddwd mm0, [rdx+24] + + pmaddwd mm1, [rdx+24] + movq mm6, mm0 + + punpckldq mm0, mm1 + punpckhdq mm6, mm1 + + paddd mm0, mm6 + + pmaddwd mm2, [rdx+24] + + pmaddwd mm3, [rdx+24] + movq mm7, mm2 + + punpckldq mm2, mm3 + punpckhdq mm7, mm3 + + paddd mm2, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm0, mm6 + paddd mm2, mm6 + + psrad mm0, _1STSTAGESHIFT + psrad mm2, _1STSTAGESHIFT + + packssdw mm0, mm2 + + movq mm3, mm0 + + ; done with one pass + ; now start second pass + movq mm0, [rdi ] + movq mm1, [rdi+ 8] + movq mm2, [rdi+ 16] + + movq mm4, mm0 + + pmaddwd mm4, [rdx] + movq mm5, mm1 + + pmaddwd mm5, [rdx] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx] + movq mm6, mm3 + + pmaddwd mm6, [rdx] + movq mm7, mm5 + + 
punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi], mm4 + + ;second column + movq mm4, mm0 + + pmaddwd mm4, [rdx+8] + movq mm5, mm1 + + pmaddwd mm5, [rdx+8] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+8] + movq mm6, mm3 + + pmaddwd mm6, [rdx+8] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+8], mm4 + + + ;third column + movq mm4, mm0 + + pmaddwd mm4, [rdx+16] + movq mm5, mm1 + + pmaddwd mm5, [rdx+16] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+16] + movq mm6, mm3 + + pmaddwd mm6, [rdx+16] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+16], mm4 + + ;fourth column + movq mm4, mm0 + + pmaddwd mm4, [rdx+24] + movq mm5, mm1 + + pmaddwd mm5, [rdx+24] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+24] + movq mm6, mm3 + + pmaddwd mm6, [rdx+24] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+24], mm4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch) +sym(vp8_fast_fdct4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + lea rdx, [dct_const_mmx GLOBAL] + movsxd rax, dword ptr arg(2) ;pitch + + lea rcx, [rsi + rax*2] + ; read the input data + movq mm0, [rsi] + movq mm1, [rsi + rax ] + + movq mm2, [rcx] + movq mm3, [rcx + rax] + ; get the constants + ;shift to left by 1 for prescision + paddw mm0, mm0 + paddw mm1, mm1 + + psllw mm2, 1 + psllw mm3, 1 + + ; transpose for the second stage + movq mm4, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 10 11 12 03 + + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm4, mm1 ; 02 12 03 13 + + punpcklwd mm2, mm3 ; 20 30 21 31 + punpckhwd mm5, mm3 ; 22 32 23 33 + + + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 + + punpckhdq mm1, mm2 ; 01 11 21 31 + + movq mm2, mm4 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 + + punpckhdq mm4, mm5 ; 03 13 23 33 + movq mm3, mm4 + + + ; first stage + movq mm5, mm0 + movq mm4, mm1 + + paddw mm0, mm3 ; a = 0 + 3 + paddw mm1, mm2 ; b = 1 + 2 + + psubw mm4, mm2 ; c = 1 - 2 + psubw mm5, mm3 ; d = 0 - 3 + + + ; output 0 and 2 + movq mm6, [rdx + 16] ; c2 + movq mm2, mm0 ; a + + paddw mm0, mm1 ; a + b + psubw mm2, mm1 ; a - b + + movq mm1, mm0 ; a + b + pmulhw mm0, mm6 ; 00 01 02 03 + + paddw mm0, mm1 ; output 00 01 02 03 + pmulhw mm6, mm2 ; 20 21 22 23 + + paddw mm2, mm6 ; output 20 21 22 23 + + ; output 1 and 3 + movq mm6, [rdx + 8] ; c1 + movq mm7, [rdx + 24] ; c3 + + movq mm1, mm4 ; c + movq mm3, mm5 ; d + + 
pmulhw mm1, mm7 ; c * c3 + pmulhw mm3, mm6 ; d * c1 + + paddw mm3, mm5 ; d * c1 rounded + paddw mm1, mm3 ; output 10 11 12 13 + + movq mm3, mm4 ; c + pmulhw mm5, mm7 ; d * c3 + + pmulhw mm4, mm6 ; c * c1 + paddw mm3, mm4 ; round c* c1 + + psubw mm5, mm3 ; output 30 31 32 33 + movq mm3, mm5 + + + ; done with vertical + ; transpose for the second stage + movq mm4, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 10 11 12 03 + + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm4, mm1 ; 02 12 03 13 + + punpcklwd mm2, mm3 ; 20 30 21 31 + punpckhwd mm5, mm3 ; 22 32 23 33 + + + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 + + punpckhdq mm1, mm2 ; 01 11 21 31 + + movq mm2, mm4 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 + + punpckhdq mm4, mm5 ; 03 13 23 33 + movq mm3, mm4 + + + ; first stage + movq mm5, mm0 + movq mm4, mm1 + + paddw mm0, mm3 ; a = 0 + 3 + paddw mm1, mm2 ; b = 1 + 2 + + psubw mm4, mm2 ; c = 1 - 2 + psubw mm5, mm3 ; d = 0 - 3 + + + ; output 0 and 2 + movq mm6, [rdx + 16] ; c2 + movq mm2, mm0 ; a + paddw mm0, mm1 ; a + b + + psubw mm2, mm1 ; a - b + + movq mm1, mm0 ; a + b + pmulhw mm0, mm6 ; 00 01 02 03 + + paddw mm0, mm1 ; output 00 01 02 03 + pmulhw mm6, mm2 ; 20 21 22 23 + + paddw mm2, mm6 ; output 20 21 22 23 + + + ; output 1 and 3 + movq mm6, [rdx + 8] ; c1 + movq mm7, [rdx + 24] ; c3 + + movq mm1, mm4 ; c + movq mm3, mm5 ; d + + pmulhw mm1, mm7 ; c * c3 + pmulhw mm3, mm6 ; d * c1 + + paddw mm3, mm5 ; d * c1 rounded + paddw mm1, mm3 ; output 10 11 12 13 + + movq mm3, mm4 ; c + pmulhw mm5, mm7 ; d * c3 + + pmulhw mm4, mm6 ; c * c1 + paddw mm3, mm4 ; round c* c1 + + psubw mm5, mm3 ; output 30 31 32 33 + movq mm3, mm5 + ; done with vertical + + pcmpeqw mm4, mm4 + pcmpeqw mm5, mm5 + psrlw mm4, 15 + psrlw mm5, 15 + + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm4 + paddw mm3, mm5 + + psraw mm0, 1 + psraw mm1, 1 + psraw mm2, 1 + psraw mm3, 1 + + movq [rdi ], mm0 + movq [rdi+ 8], mm1 + movq [rdi+16], mm2 + movq [rdi+24], mm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch) +sym(vp8_fast_fdct8x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + lea rdx, [dct_const_xmm GLOBAL] + movsxd rax, dword ptr arg(2) ;pitch + + lea rcx, [rsi + rax*2] + ; read the input data + movdqa xmm0, [rsi] + movdqa xmm2, [rsi + rax] + + movdqa xmm4, [rcx] + movdqa xmm3, [rcx + rax] + ; get the constants + ;shift to left by 1 for prescision + psllw xmm0, 1 + psllw xmm2, 1 + + psllw xmm4, 1 + psllw xmm3, 1 + + ; transpose for the second stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, 
xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a = 0 + 3 + paddw xmm1, xmm2 ; b = 1 + 2 + + psubw xmm4, xmm2 ; c = 1 - 2 + psubw xmm5, xmm3 ; d = 0 - 3 + + + ; output 0 and 2 + movdqa xmm6, [rdx + 32] ; c2 + movdqa xmm2, xmm0 ; a + + paddw xmm0, xmm1 ; a + b + psubw xmm2, xmm1 ; a - b + + movdqa xmm1, xmm0 ; a + b + pmulhw xmm0, xmm6 ; 00 01 02 03 + + paddw xmm0, xmm1 ; output 00 01 02 03 + pmulhw xmm6, xmm2 ; 20 21 22 23 + + paddw xmm2, xmm6 ; output 20 21 22 23 + + ; output 1 and 3 + movdqa xmm6, [rdx + 16] ; c1 + movdqa xmm7, [rdx + 48] ; c3 + + movdqa xmm1, xmm4 ; c + movdqa xmm3, xmm5 ; d + + pmulhw xmm1, xmm7 ; c * c3 + pmulhw xmm3, xmm6 ; d * c1 + + paddw xmm3, xmm5 ; d * c1 rounded + paddw xmm1, xmm3 ; output 10 11 12 13 + + movdqa xmm3, xmm4 ; c + pmulhw xmm5, xmm7 ; d * c3 + + pmulhw xmm4, xmm6 ; c * c1 + paddw xmm3, xmm4 ; round c* c1 + + psubw xmm5, xmm3 ; output 30 31 32 33 + movdqa xmm3, xmm5 + + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36 + movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35 + + movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33 + + + movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27 + + movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a = 0 + 3 + paddw xmm1, xmm2 ; b = 1 + 2 + + psubw xmm4, xmm2 ; c = 1 - 2 + psubw xmm5, xmm3 ; d = 0 - 3 + + + ; output 0 and 2 + movdqa xmm6, [rdx + 32] ; c2 + movdqa xmm2, xmm0 ; a + + paddw xmm0, xmm1 ; a + b + psubw xmm2, xmm1 ; a - b + + movdqa xmm1, xmm0 ; a + b + pmulhw xmm0, xmm6 ; 00 01 02 03 + + paddw xmm0, xmm1 ; output 00 01 02 03 + pmulhw xmm6, xmm2 ; 20 21 22 23 + + paddw xmm2, xmm6 ; output 20 21 22 23 + + ; output 1 and 3 + movdqa xmm6, [rdx + 16] ; c1 + movdqa xmm7, [rdx + 48] ; c3 + + movdqa xmm1, xmm4 ; c + movdqa xmm3, xmm5 ; d + + pmulhw xmm1, xmm7 ; c * c3 + pmulhw xmm3, xmm6 ; d * c1 + + paddw xmm3, xmm5 ; d * c1 rounded + paddw xmm1, xmm3 ; output 10 11 12 13 + + movdqa xmm3, xmm4 ; c + pmulhw xmm5, xmm7 ; d * c3 + + pmulhw xmm4, xmm6 ; c * c1 + paddw xmm3, xmm4 ; round c* c1 + + psubw xmm5, xmm3 ; output 30 31 32 33 + movdqa xmm3, xmm5 + ; done with vertical + + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm5, xmm5; + psrlw xmm4, 15 + psrlw xmm5, 15 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + psraw xmm0, 1 + psraw xmm1, 1 + psraw xmm2, 1 + psraw xmm3, 1 + + movq QWORD PTR[rdi ], xmm0 + movq QWORD PTR[rdi+ 8], xmm1 + movq QWORD PTR[rdi+16], xmm2 + movq QWORD PTR[rdi+24], xmm3 + + psrldq xmm0, 8 + psrldq xmm1, 8 + psrldq xmm2, 8 + psrldq xmm3, 8 + + movq QWORD PTR[rdi+32], xmm0 + movq QWORD PTR[rdi+40], xmm1 + movq QWORD 
PTR[rdi+48], xmm2 + movq QWORD PTR[rdi+56], xmm3 + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;static const unsigned int dct1st_stage_rounding_mmx[2] = +align 16 +dct1st_stage_rounding_mmx: + times 2 dd 8192 + + +;static const unsigned int dct2nd_stage_rounding_mmx[2] = +align 16 +dct2nd_stage_rounding_mmx: + times 2 dd 32768 + + +;static const short dct_matrix[4][4]= +align 16 +dct_matrix: + times 4 dw 23170 + + dw 30274 + dw 12540 + dw -12540 + dw -30274 + + dw 23170 + times 2 dw -23170 + dw 23170 + + dw 12540 + dw -30274 + dw 30274 + dw -12540 + + +;static const unsigned short dct_const_mmx[4 * 4]= +align 16 +dct_const_mmx: + times 4 dw 0 + times 4 dw 60547 + times 4 dw 46341 + times 4 dw 25080 + + +;static const unsigned short dct_const_xmm[8 * 4]= +align 16 +dct_const_xmm: + times 8 dw 0 + times 8 dw 60547 + times 8 dw 46341 + times 8 dw 25080 diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..3e5e9a70c --- /dev/null +++ b/vp8/encoder/x86/dct_sse2.asm @@ -0,0 +1,260 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vp8_short_fdct4x4_wmt) + +%define DCTCONSTANTSBITS (16) +%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) +%define x_c1 (60547) ; cos(pi /8) * (1<<15) +%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) +%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) + +%define _1STSTAGESHIFT 14 +%define _2NDSTAGESHIFT 16 + + +;; using matrix multiply +;void vp8_short_fdct4x4_wmt(short *input, short *output) +sym(vp8_short_fdct4x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + GET_GOT rbx + ; end prolog + + mov rax, arg(0) ;input + mov rcx, arg(1) ;output + + lea rdx, [dct_matrix_sse2 GLOBAL] + + movdqu xmm0, [rax ] + movdqu xmm1, [rax+16] + + ; first column + movdqa xmm2, xmm0 + movdqa xmm7, [rdx] + + pmaddwd xmm2, xmm7 + movdqa xmm3, xmm1 + + pmaddwd xmm3, xmm7 + movdqa xmm4, xmm2 + + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + movdqa xmm3, xmm2 + punpckldq xmm2, xmm4 + + punpckhdq xmm3, xmm4 + paddd xmm2, xmm3 + + + paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + psrad xmm2, _1STSTAGESHIFT + ;second column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+16] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+16] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + + psrad xmm3, _1STSTAGESHIFT + packssdw xmm2, xmm3 + + ;third column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+32] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+32] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _1STSTAGESHIFT + + ;fourth column (this is the last column, so we do not have save the source any more) + pmaddwd xmm0, [rdx+48] + pmaddwd xmm1, [rdx+48] + + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + + punpckhdq xmm4, xmm1 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm4 + punpckhdq xmm1, xmm4 + + paddd 
xmm0, xmm1 + paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + + psrad xmm0, _1STSTAGESHIFT + packssdw xmm3, xmm0 + ; done with one pass + ; now start second pass + movdqa xmm0, xmm2 + movdqa xmm1, xmm3 + + pmaddwd xmm2, xmm7 + pmaddwd xmm3, xmm7 + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + + punpckhdq xmm4, xmm3 + movdqa xmm3, xmm2 + + punpckldq xmm2, xmm4 + punpckhdq xmm3, xmm4 + + paddd xmm2, xmm3 + paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm2, _2NDSTAGESHIFT + + ;second column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+16] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+16] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _2NDSTAGESHIFT + packssdw xmm2, xmm3 + + movdqu [rcx], xmm2 + ;third column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+32] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+32] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _2NDSTAGESHIFT + ;fourth column + pmaddwd xmm0, [rdx+48] + pmaddwd xmm1, [rdx+48] + + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + + punpckhdq xmm4, xmm1 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm4 + punpckhdq xmm1, xmm4 + + paddd xmm0, xmm1 + paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm0, _2NDSTAGESHIFT + packssdw xmm3, xmm0 + + movdqu [rcx+16], xmm3 + + mov rsp, rbp + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;static unsigned int dct1st_stage_rounding_sse2[4] = +align 16 +dct1st_stage_rounding_sse2: + times 4 dd 8192 + + +;static unsigned int dct2nd_stage_rounding_sse2[4] = +align 16 +dct2nd_stage_rounding_sse2: + times 4 dd 32768 + +;static short dct_matrix_sse2[4][8]= +align 16 +dct_matrix_sse2: + times 8 dw 23170 + + dw 30274 + dw 12540 + dw -12540 + dw -30274 + dw 30274 + dw 12540 + dw -12540 + dw -30274 + + dw 23170 + times 2 dw -23170 + times 2 dw 23170 + times 2 dw -23170 + dw 23170 + + dw 12540 + dw -30274 + dw 30274 + dw -12540 + dw 12540 + dw -30274 + dw 30274 + dw -12540 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h new file mode 100644 index 000000000..bc80e64ef --- /dev/null +++ b/vp8/encoder/x86/dct_x86.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef DCT_X86_H +#define DCT_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_fdct(vp8_short_fdct4x4_mmx); +extern prototype_fdct(vp8_short_fdct8x4_mmx); +extern prototype_fdct(vp8_fast_fdct4x4_mmx); +extern prototype_fdct(vp8_fast_fdct8x4_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx + +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_fdct(vp8_short_fdct4x4_wmt); +extern prototype_fdct(vp8_short_fdct8x4_wmt); +extern prototype_fdct(vp8_fast_fdct8x4_wmt); + +extern prototype_fdct(vp8_short_walsh4x4_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#if 0 +/* short SSE2 DCT currently disabled, does not match the MMX version */ +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt +#endif + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt + +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2 + +#endif + + +#endif + +#endif diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h new file mode 100644 index 000000000..9397a6cca --- /dev/null +++ b/vp8/encoder/x86/encodemb_x86.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef ENCODEMB_X86_H +#define ENCODEMB_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_berr(vp8_block_error_mmx); +extern prototype_mberr(vp8_mbblock_error_mmx); +extern prototype_mbuverr(vp8_mbuverror_mmx); +extern prototype_subb(vp8_subtract_b_mmx); +extern prototype_submby(vp8_subtract_mby_mmx); +extern prototype_submbuv(vp8_subtract_mbuv_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_berr +#define vp8_encodemb_berr vp8_block_error_mmx + +#undef vp8_encodemb_mberr +#define vp8_encodemb_mberr vp8_mbblock_error_mmx + +#undef vp8_encodemb_mbuverr +#define vp8_encodemb_mbuverr vp8_mbuverror_mmx + +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_mmx + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_mmx + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_berr(vp8_block_error_xmm); +extern prototype_mberr(vp8_mbblock_error_xmm); +extern prototype_mbuverr(vp8_mbuverror_xmm); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_berr +#define vp8_encodemb_berr vp8_block_error_xmm + +#undef vp8_encodemb_mberr +#define vp8_encodemb_mberr vp8_mbblock_error_xmm + +#undef vp8_encodemb_mbuverr +#define vp8_encodemb_mbuverr vp8_mbuverror_xmm + +#endif +#endif + + +#endif diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm new file mode 100644 index 000000000..194047155 --- /dev/null +++ b/vp8/encoder/x86/encodeopt.asm @@ -0,0 +1,393 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_xmm) +sym(vp8_block_error_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm7, xmm7 + + mov rdi, arg(1) ;dcoef_ptr + movdqa xmm3, [rsi] + + movdqa xmm4, [rdi] + movdqa xmm5, [rsi+16] + + movdqa xmm6, [rdi+16] + pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0 + + movdqa xmm2, xmm7 + psubw xmm5, xmm6 + + por xmm1, xmm2 + pmaddwd xmm5, xmm5 + + pcmpeqw xmm1, xmm7 + psubw xmm3, xmm4 + + pand xmm1, xmm3 + pmaddwd xmm1, xmm1 + + paddd xmm1, xmm5 + movdqa xmm0, xmm1 + + punpckldq xmm0, xmm7 + punpckhdq xmm1, xmm7 + + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + psrldq xmm0, 8 + paddd xmm0, xmm1 + + movd rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_mmx) +sym(vp8_block_error_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + movq mm3, [rsi] + + movq mm4, [rdi] + movq mm5, [rsi+8] + + movq mm6, [rdi+8] + pxor mm1, mm1 ; from movd mm1, dc ; dc =0 + + movq mm2, mm7 + psubw mm5, mm6 + + por mm1, mm2 + pmaddwd mm5, mm5 + + pcmpeqw mm1, mm7 + psubw mm3, mm4 + + pand mm1, mm3 + pmaddwd mm1, mm1 + + paddd mm1, mm5 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm3, mm5 + + paddd mm1, mm3 + movq mm0, mm1 + + psrlq mm1, 32 + paddd mm0, mm1 + + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_mmx_impl) +sym(vp8_mbblock_error_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + pxor mm2, mm2 + + movd mm1, dword ptr arg(2) ;dc + por mm1, mm2 + + pcmpeqw mm1, mm7 + mov rcx, 16 + +mberror_loop_mmx: + movq mm3, [rsi] + movq mm4, [rdi] + + movq mm5, [rsi+8] + movq mm6, [rdi+8] + + + psubw mm5, mm6 + pmaddwd mm5, mm5 + + psubw mm3, mm4 + pand mm3, mm1 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + add rsi, 32 + + add rdi, 32 + sub rcx, 1 + + jnz mberror_loop_mmx + + movq mm0, mm2 + psrlq mm2, 32 + + paddd mm0, mm2 + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_xmm_impl) +sym(vp8_mbblock_error_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm7, xmm7 + + mov rdi, arg(1) ;dcoef_ptr + pxor xmm2, xmm2 + + movd xmm1, dword ptr arg(2) ;dc + por xmm1, xmm2 + + pcmpeqw xmm1, xmm7 + mov rcx, 16 + +mberror_loop: + movdqa xmm3, [rsi] + movdqa xmm4, [rdi] + + movdqa xmm5, [rsi+16] + movdqa xmm6, [rdi+16] + + + psubw xmm5, xmm6 + pmaddwd xmm5, xmm5 + + psubw xmm3, xmm4 + pand xmm3, xmm1 + + pmaddwd xmm3, xmm3 + add rsi, 
32 + + add rdi, 32 + + sub rcx, 1 + paddd xmm2, xmm5 + + paddd xmm2, xmm3 + jnz mberror_loop + + movdqa xmm0, xmm2 + punpckldq xmm0, xmm7 + + punpckhdq xmm2, xmm7 + paddd xmm0, xmm2 + + movdqa xmm1, xmm0 + psrldq xmm0, 8 + + paddd xmm0, xmm1 + movd rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_mmx_impl) +sym(vp8_mbuverror_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor mm7, mm7 + +mbuverror_loop_mmx: + + movq mm1, [rsi] + movq mm2, [rdi] + + psubw mm1, mm2 + pmaddwd mm1, mm1 + + + movq mm3, [rsi+8] + movq mm4, [rdi+8] + + psubw mm3, mm4 + pmaddwd mm3, mm3 + + + paddd mm7, mm1 + paddd mm7, mm3 + + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz mbuverror_loop_mmx + + movq mm0, mm7 + psrlq mm7, 32 + + paddd mm0, mm7 + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_xmm_impl) +sym(vp8_mbuverror_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor xmm7, xmm7 + +mbuverror_loop: + + movdqa xmm1, [rsi] + movdqa xmm2, [rdi] + + psubw xmm1, xmm2 + pmaddwd xmm1, xmm1 + + paddd xmm7, xmm1 + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz mbuverror_loop + + pxor xmm0, xmm0 + movdqa xmm1, xmm7 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + paddd xmm1, xmm2 + + movdqa xmm2, xmm1 + + psrldq xmm1, 8 + paddd xmm1, xmm2 + + movd rax, xmm1 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm new file mode 100644 index 000000000..7d8620178 --- /dev/null +++ b/vp8/encoder/x86/fwalsh_sse2.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_walsh4x4_sse2) +sym(vp8_short_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + + movdqu xmm4, [rsi + 0] ;ip[4] ip[0] + movdqu xmm0, [rsi + 16] ;ip[12] ip[8] + + pxor xmm7, xmm7 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm5 ;ip[4] ip[0] + + paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm6, xmm5 + punpcklqdq xmm5, xmm3 ;d1 a1 + punpckhqdq xmm6, xmm3 ;c1 b1 + + movdqa xmm1, xmm6 ;c1 b1 + paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + movdqa xmm0, xmm6 ;aka b2 a2 + movdqa xmm1, xmm5 ;aka d2 c2 + + pcmpgtw xmm0, xmm7 + pcmpgtw xmm1, xmm7 + + psrlw xmm0, 15 + psrlw xmm1, 15 + + paddw xmm6, xmm0 + paddw xmm5, xmm1 + + psraw xmm6, 1 + psraw xmm5, 1 + + ; a2 = a1 + b1; + ; b2 = c1 + d1; + ; c2 = a1 - b1; + ; d2 = d1 - c1; + ; a2 += (a2>0); + ; b2 += (b2>0); + ; c2 += (c2>0); + ; d2 += (d2>0); + ; op[0] = (a2)>>1; + ; op[4] = (b2)>>1; + ; op[8] = (c2)>>1; + ; op[12]= (d2)>>1; + + movdqu [rdi + 0], xmm6 + movdqu [rdi + 16], xmm5 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h new file mode 100644 index 000000000..5661491ad --- /dev/null +++ b/vp8/encoder/x86/mcomp_x86.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx3
+
+#undef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sadx4
+
+#endif
+#endif
+
+#endif
+
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 000000000..69617ca47
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_wmt
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s       : Pointer to source frame.
+ *                  unsigned char *d       : Pointer to destination frame.
+ *                  int bytes              : Number of bytes to filter.
+ *                  int strength           : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ *  SPECIAL NOTES : Destination frame can be the same as the source frame.
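+ *
+ *                  In outline, the SIMD loop below weights each pixel of the
+ *                  stored frames by how close it is to the current source
+ *                  pixel, using saturating arithmetic throughout (variable
+ *                  names here are only illustrative):
+ *
+ *                      diff     = frame_pix - src_pix;
+ *                      modifier = 3 * ((diff * diff) >> strength);
+ *                      weight   = 16 - modifier;          (saturates at 0)
+ *                      sum     += weight * frame_pix;
+ *                      count   += weight;
+ *
+ *                  The output pixel is then roughly (sum + count / 2) / count,
+ *                  computed with the ppi->fixed_divide reciprocal table
+ *                  rather than a division.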
+ * + ****************************************************************************/ +void temp_filter_wmt +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 8; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 8; i++) + d[byte+i] = s[byte+i]; + + byte += 8; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + mov eax, offset2 + mov edi, s // source pixels + pxor xmm1, xmm1 // accumulator + + pxor xmm7, xmm7 + + mov esi, frameptr // accumulator + pxor xmm2, xmm2 // count + + movq xmm3, QWORD PTR [edi] + + movq QWORD PTR [esi+8*eax], xmm3 + + punpcklbw xmm3, xmm2 // xmm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movq xmm4, QWORD PTR [esi] // get frame buffer values + punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels + movdqa xmm6, xmm4 // save the pixel values + psubsw xmm4, xmm3 // subtracted pixel values + pmullw xmm4, xmm4 // square xmm4 + movd xmm5, strength + psrlw xmm4, xmm5 // should be strength + pmullw xmm4, threes // 3 * modifier + movdqa xmm5, sixteens // 16s + psubusw xmm5, xmm4 // 16 - modifiers + movdqa xmm4, xmm5 // save the modifiers + pmullw xmm4, xmm6 // multiplier values + paddusw xmm1, xmm4 // accumulator + paddusw xmm2, xmm5 // count + add esi, 8 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movdqa counts, xmm2 + psrlw xmm2, 1 // divide count by 2 for rounding + paddusw xmm1, xmm2 // rounding added in + + mov frameptr, esi + + movdqa sums, xmm1 + } + + for (i = 0; i < 8; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 8; + d += 8; + byte += 8; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} + +/**************************************************************************** + * + * ROUTINE : temp_filter_mmx + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. 
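+ *
+ *                  Uses the same closeness weighting as temp_filter_wmt
+ *                  above, but with 64-bit MMX registers, so it processes
+ *                  four pixels per iteration instead of eight.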
+ * + ****************************************************************************/ +void temp_filter_mmx +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 4; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 4; i++) + d[byte+i] = s[byte+i]; + + byte += 4; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + + mov eax, offset2 + mov edi, s // source pixels + pxor mm1, mm1 // accumulator + pxor mm7, mm7 + + mov esi, frameptr // accumulator + pxor mm2, mm2 // count + + movd mm3, DWORD PTR [edi] + movd DWORD PTR [esi+4*eax], mm3 + + punpcklbw mm3, mm2 // mm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movd mm4, DWORD PTR [esi] // get frame buffer values + punpcklbw mm4, mm7 // mm4 frame buffer pixels + movq mm6, mm4 // save the pixel values + psubsw mm4, mm3 // subtracted pixel values + pmullw mm4, mm4 // square mm4 + movd mm5, strength + psrlw mm4, mm5 // should be strength + pmullw mm4, threes // 3 * modifier + movq mm5, sixteens // 16s + psubusw mm5, mm4 // 16 - modifiers + movq mm4, mm5 // save the modifiers + pmullw mm4, mm6 // multiplier values + paddusw mm1, mm4 // accumulator + paddusw mm2, mm5 // count + add esi, 4 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movq counts, mm2 + psrlw mm2, 1 // divide count by 2 for rounding + paddusw mm1, mm2 // rounding added in + + mov frameptr, esi + + movq sums, mm1 + + } + + for (i = 0; i < 4; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 4; + d += 4; + byte += 4; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm new file mode 100644 index 000000000..847fc6e37 --- /dev/null +++ b/vp8/encoder/x86/quantize_mmx.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_mmx) +sym(vp8_fast_quantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + movq mm0, [rsi] + + mov rax, arg(1) ;zbin_ptr + movq mm1, [rax] + + movq mm3, mm0 + psraw mm0, 15 + + pxor mm3, mm0 + psubw mm3, mm0 ; abs + + movq mm2, mm3 + pcmpgtw mm1, mm2 + + pandn mm1, mm2 + movq mm3, mm1 + + mov rdx, arg(6) ;quant_ptr + movq mm1, [rdx] + + mov rcx, arg(5) ;round_ptr + movq mm2, [rcx] + + paddw mm3, mm2 + pmulhuw mm3, mm1 + + pxor mm3, mm0 + psubw mm3, mm0 ;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + movq mm0, mm3 + + movq [rdi], mm3 + + mov rax, arg(3) ;dequant_ptr + movq mm2, [rax] + + pmullw mm3, mm2 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax], mm3 + + ; next 8 + movq mm4, [rsi+8] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+8] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+8] + movq mm6, [rcx+8] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+8], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+8] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+8], mm7 + + + ; next 8 + movq mm4, [rsi+16] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+16] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+16] + movq mm6, [rcx+16] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+16], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+16] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+16], mm7 + + + ; next 8 + movq mm4, [rsi+24] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+24] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+24] + movq mm6, [rcx+24] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+24], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+24] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+24], mm7 + + + + mov rdi, arg(4) ;scan_mask + mov rsi, arg(2) ;qcoeff_ptr + + pxor mm5, mm5 + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rsi+8] + + movq mm2, [rdi] + movq mm3, [rdi+8]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + movq mm5, mm0 + + paddd mm5, mm1 + + movq mm0, [rsi+16] + movq mm1, [rsi+24] + + movq mm2, [rdi+16] + movq mm3, [rdi+24]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + paddd mm5, mm0 + + paddd mm5, mm1 + movq mm0, mm5 + + psrlq mm5, 32 + paddd mm0, mm5 + + ; eob adjustment begins here + movd rcx, mm0 + and rcx, 0xffff + + xor rdx, rdx + sub 
rdx, rcx ; rdx=-rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + ; Substitute the sse assembly for the old mmx mixed assembly/C. The + ; following is kept as reference + ; movd rcx, mm0 + ; bsr rax, rcx + ; + ; mov eob, rax + ; mov eee, rcx + ; + ;if(eee==0) + ;{ + ; eob=-1; + ;} + ;else if(eee<0) + ;{ + ; eob=15; + ;} + ;d->eob = eob+1; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_sse) +sym(vp8_fast_quantize_b_impl_sse): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + movdqa xmm0, [rsi] + + mov rax, arg(1) ;zbin_ptr + movdqa xmm1, [rax] + + movdqa xmm3, xmm0 + psraw xmm0, 15 + + pxor xmm3, xmm0 + psubw xmm3, xmm0 ; abs + + movdqa xmm2, xmm3 + pcmpgtw xmm1, xmm2 + + pandn xmm1, xmm2 + movdqa xmm3, xmm1 + + mov rdx, arg(6) ; quant_ptr + movdqa xmm1, [rdx] + + mov rcx, arg(5) ; round_ptr + movdqa xmm2, [rcx] + + paddw xmm3, xmm2 + pmulhuw xmm3, xmm1 + + pxor xmm3, xmm0 + psubw xmm3, xmm0 ;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + movdqa xmm0, xmm3 + + movdqa [rdi], xmm3 + + mov rax, arg(3) ;dequant_ptr + movdqa xmm2, [rax] + + pmullw xmm3, xmm2 + mov rax, arg(7) ;dqcoeff_ptr + + movdqa [rax], xmm3 + + ; next 8 + movdqa xmm4, [rsi+16] + + mov rax, arg(1) ;zbin_ptr + movdqa xmm5, [rax+16] + + movdqa xmm7, xmm4 + psraw xmm4, 15 + + pxor xmm7, xmm4 + psubw xmm7, xmm4 ; abs + + movdqa xmm6, xmm7 + pcmpgtw xmm5, xmm6 + + pandn xmm5, xmm6 + movdqa xmm7, xmm5 + + movdqa xmm5, [rdx+16] + movdqa xmm6, [rcx+16] + + + paddw xmm7, xmm6 + pmulhuw xmm7, xmm5 + + pxor xmm7, xmm4 + psubw xmm7, xmm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movdqa xmm1, xmm7 + movdqa [rdi+16], xmm7 + + mov rax, arg(3) ;dequant_ptr + movdqa xmm6, [rax+16] + + pmullw xmm7, xmm6 + mov rax, arg(7) ;dqcoeff_ptr + + movdqa [rax+16], xmm7 + mov rdi, arg(4) ;scan_mask + + pxor xmm7, xmm7 + movdqa xmm2, [rdi] + + movdqa xmm3, [rdi+16]; + pcmpeqw xmm0, xmm7 + + pcmpeqw xmm1, xmm7 + pcmpeqw xmm6, xmm6 + + pxor xmm0, xmm6 + pxor xmm1, xmm6 + + psrlw xmm0, 15 + psrlw xmm1, 15 + + pmaddwd xmm0, xmm2 + pmaddwd xmm1, xmm3 + + movq xmm2, xmm0 + movq xmm3, xmm1 + + psrldq xmm0, 8 + psrldq xmm1, 8 + + paddd xmm0, xmm1 + paddd xmm2, xmm3 + + paddd xmm0, xmm2 + movq xmm1, xmm0 + + psrldq xmm0, 4 + paddd xmm1, xmm0 + + movd rcx, xmm1 + and rcx, 0xffff + + xor rdx, rdx + sub rdx, rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm new file mode 100644 index 000000000..a825698e7 --- /dev/null +++ b/vp8/encoder/x86/sad_mmx.asm @@ -0,0 +1,428 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vp8_sad16x16_mmx) +global sym(vp8_sad8x16_mmx) +global sym(vp8_sad8x8_mmx) +global sym(vp8_sad4x4_mmx) +global sym(vp8_sad16x8_mmx) + +%idefine QWORD + +;unsigned int vp8_sad16x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x16x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddw mm0, mm2 + paddw mm1, mm3 + + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + paddw mm7, mm1 + + cmp rsi, rcx + jne x16x16sad_mmx_loop + + + movq mm0, mm7 + + punpcklwd mm0, mm6 + punpckhwd mm7, mm6 + + paddw mm0, mm7 + movq mm7, mm0 + + + psrlq mm0, 32 + paddw mm7, mm0 + + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad8x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x8x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + paddw mm7, mm2 + cmp rsi, rcx + + jne x8x16sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x8x8sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + paddw mm0, mm2 + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + cmp rsi, rcx + + jne x8x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad4x4_mmx( 
+; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + pxor mm3, mm3 + + punpcklbw mm0, mm3 + punpckhbw mm2, mm3 + + paddw mm0, mm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm4, QWORD PTR [rsi] + movd mm5, QWORD PTR [rdi] + + movd mm6, QWORD PTR [rsi+rax] + movd mm7, QWORD PTR [rdi+rdx] + + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + + movq mm6, mm4 + psubusb mm4, mm5 + + psubusb mm5, mm6 + por mm4, mm5 + + movq mm5, mm4 + punpcklbw mm4, mm3 + + punpckhbw mm5, mm3 + paddw mm4, mm5 + + paddw mm0, mm4 + movq mm1, mm0 + + punpcklwd mm0, mm3 + punpckhwd mm1, mm3 + + paddw mm0, mm1 + movq mm1, mm0 + + psrlq mm0, 32 + paddw mm0, mm1 + + movd rax, mm0 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad16x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x16x8sad_mmx_loop: + + movq mm0, [rsi] + movq mm1, [rdi] + + movq mm2, [rsi+8] + movq mm3, [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpckhbw mm1, mm6 + + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + + + paddw mm0, mm2 + paddw mm1, mm3 + + paddw mm0, mm1 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne x16x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm new file mode 100644 index 000000000..53240bbf1 --- /dev/null +++ b/vp8/encoder/x86/sad_sse2.asm @@ -0,0 +1,329 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +;unsigned int vp8_sad16x16_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad16x16_wmt) +sym(vp8_sad16x16_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor xmm7, xmm7 + +x16x16sad_wmt_loop: + + movq xmm0, QWORD PTR [rsi] + movq xmm2, QWORD PTR [rsi+8] + + movq xmm1, QWORD PTR [rdi] + movq xmm3, QWORD PTR [rdi+8] + + movq xmm4, QWORD PTR [rsi+rax] + movq xmm5, QWORD PTR [rdi+rdx] + + + punpcklbw xmm0, xmm2 + punpcklbw xmm1, xmm3 + + psadbw xmm0, xmm1 + movq xmm6, QWORD PTR [rsi+rax+8] + + movq xmm3, QWORD PTR [rdi+rdx+8] + lea rsi, [rsi+rax*2] + + lea rdi, [rdi+rdx*2] + punpcklbw xmm4, xmm6 + + punpcklbw xmm5, xmm3 + psadbw xmm4, xmm5 + + paddw xmm7, xmm0 + paddw xmm7, xmm4 + + cmp rsi, rcx + jne x16x16sad_wmt_loop + + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd rax, xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad8x16_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int max_err) +global sym(vp8_sad8x16_wmt) +sym(vp8_sad8x16_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + + lea rcx, [rcx+rbx*8] + pxor mm7, mm7 + +x8x16sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x8x16sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, QWORD PTR [rsi+rbx] + movq mm3, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm7, mm0 + paddw mm7, mm2 + + cmp rsi, rcx + jne x8x16sad_wmt_loop + + movd rax, mm7 + +x8x16sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x8_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad8x8_wmt) +sym(vp8_sad8x8_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + pxor mm7, mm7 + +x8x8sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x8x8sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + psadbw mm0, mm1 + lea rsi, [rsi+rbx] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne x8x8sad_wmt_loop + + movd rax, mm7 +x8x8sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad4x4_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad4x4_wmt) +sym(vp8_sad4x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD 
PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + psadbw mm0, mm1 + lea rsi, [rsi+rax*2] + + lea rdi, [rdi+rdx*2] + movd mm4, QWORD PTR [rsi] + + movd mm5, QWORD PTR [rdi] + movd mm6, QWORD PTR [rsi+rax] + + movd mm7, QWORD PTR [rdi+rdx] + punpcklbw mm4, mm6 + + punpcklbw mm5, mm7 + psadbw mm4, mm5 + + paddw mm0, mm4 + movd rax, mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x8_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad16x8_wmt) +sym(vp8_sad16x8_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + pxor mm7, mm7 + +x16x8sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x16x8sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, QWORD PTR [rsi+rbx] + movq mm5, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + movq mm1, QWORD PTR [rsi+rbx+8] + movq mm3, QWORD PTR [rdi+rdx+8] + + psadbw mm4, mm5 + psadbw mm1, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm0, mm2 + paddw mm4, mm1 + + paddw mm7, mm0 + paddw mm7, mm4 + + cmp rsi, rcx + jne x16x8sad_wmt_loop + + movd rax, mm7 + +x16x8sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm new file mode 100644 index 000000000..38cc02957 --- /dev/null +++ b/vp8/encoder/x86/sad_sse3.asm @@ -0,0 +1,939 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm5, [rdi] + lddqu xmm6, [rdi+1] + lddqu xmm7, [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rdi] + lddqu xmm2, [rdi+1] + lddqu xmm3, [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rdi+rdx] + lddqu xmm2, QWORD PTR [rdi+rdx+1] + lddqu xmm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 1 +%if %1 + movq mm0, [rsi] + movq mm5, [rdi] + movq mm6, [rdi+1] + movq mm7, [rdi+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, [rsi] + movq mm1, [rdi] + movq mm2, [rdi+1] + movq mm3, [rdi+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [rsi+rax] + movq mm1, QWORD PTR [rdi+rdx] + movq mm2, QWORD PTR [rdi+rdx+1] + movq mm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +%macro LOAD_X4_ADDRESSES 5 + mov %2, [%1+REG_SZ_BYTES*0] + mov %3, [%1+REG_SZ_BYTES*1] + + mov %4, [%1+REG_SZ_BYTES*2] + mov %5, [%1+REG_SZ_BYTES*3] +%endmacro + +%macro PROCESS_16X2X4 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm4, [rcx] + lddqu xmm5, [rdx] + lddqu xmm6, [rbx] + lddqu xmm7, [rdi] + + psadbw xmm4, xmm0 + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rcx] + lddqu xmm2, [rdx] + lddqu xmm3, [rbx] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm4, xmm1 + lddqu xmm1, [rdi] + paddw xmm5, xmm2 + paddw xmm6, xmm3 + + psadbw xmm1, xmm0 + paddw xmm7, xmm1 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rcx+rbp] + lddqu xmm2, QWORD PTR [rdx+rbp] + lddqu xmm3, QWORD PTR [rbx+rbp] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm4, xmm1 + lddqu xmm1, QWORD PTR [rdi+rbp] + paddw xmm5, xmm2 + paddw xmm6, xmm3 + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + psadbw xmm1, xmm0 + paddw xmm7, xmm1 + +%endmacro + +%macro PROCESS_8X2X4 1 +%if %1 + movq mm0, [rsi] + movq mm4, [rcx] + movq mm5, [rdx] + movq mm6, [rbx] + movq mm7, [rdi] + + psadbw mm4, mm0 + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, [rsi] + movq mm1, [rcx] + movq mm2, [rdx] + movq mm3, [rbx] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm4, mm1 + movq mm1, [rdi] + paddw mm5, mm2 + paddw mm6, mm3 + + psadbw mm1, mm0 + paddw mm7, mm1 +%endif + movq mm0, QWORD PTR [rsi+rax] + movq mm1, QWORD PTR [rcx+rbp] + movq mm2, QWORD PTR [rdx+rbp] + movq mm3, QWORD PTR [rbx+rbp] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm4, mm1 + movq mm1, QWORD PTR [rdi+rbp] + paddw mm5, mm2 + paddw mm6, mm3 + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + psadbw mm1, mm0 + paddw mm7, mm1 + +%endmacro + +;void int vp8_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; 
unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x3_sse3) +sym(vp8_sad16x16x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x8x3_sse3) +sym(vp8_sad16x8x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x16x3_sse3) +sym(vp8_sad8x16x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X3 1 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + + mov rdi, arg(4) ;Results + + movd [rdi], mm5 + movd [rdi+4], mm6 + movd [rdi+8], mm7 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x8x3_sse3) +sym(vp8_sad8x8x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X3 1 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + + mov rdi, arg(4) ;Results + + movd [rdi], mm5 + movd [rdi+4], mm6 + movd [rdi+8], mm7 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad4x4x3_sse3) +sym(vp8_sad4x4x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] 
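+    ; The 4x4 case needs no loop: each pair of rows is interleaved into a
+    ; single 8-byte value with punpcklbw, so one psadbw per candidate offset
+    ; (+0, +1 and +2 bytes) gives the SAD of two rows at once; the partial
+    ; sums for rows 0-1 and rows 2-3 are then added and stored.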
+ movd mm1, QWORD PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, QWORD PTR [rdi+1] + movd mm5, QWORD PTR [rdi+2] + + movd mm2, QWORD PTR [rdi+rdx+1] + movd mm3, QWORD PTR [rdi+rdx+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm0, QWORD PTR [rsi] + movd mm2, QWORD PTR [rdi] + + movd mm3, QWORD PTR [rsi+rax] + movd mm6, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + movd mm3, QWORD PTR [rdi+1] + movd mm7, QWORD PTR [rdi+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, QWORD PTR [rdi+rdx+1] + movd mm6, QWORD PTR [rdi+rdx+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rdi, arg(4) ;Results + movd [rdi], mm1 + + movd [rdi+4], mm3 + movd [rdi+8], mm7 + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad16x16_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int max_err) +;%define lddqu movdqu +global sym(vp8_sad16x16_sse3) +sym(vp8_sad16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + + lea rcx, [rcx+rbx*8] + pxor mm7, mm7 + +vp8_sad16x16_sse3_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg vp8_sad16x16_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, QWORD PTR [rsi+rbx] + movq mm5, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + movq mm1, QWORD PTR [rsi+rbx+8] + movq mm3, QWORD PTR [rdi+rdx+8] + + psadbw mm4, mm5 + psadbw mm1, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm0, mm2 + paddw mm4, mm1 + + paddw mm7, mm0 + paddw mm7, mm4 + + cmp rsi, rcx + jne vp8_sad16x16_sse3_loop + + movd rax, mm7 + +vp8_sad16x16_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_sad16x16x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr_base, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x4d_sse3) +sym(vp8_sad16x16x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_16X2X4 1 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movq xmm0, xmm4 + psrldq xmm4, 8 + + paddw xmm0, xmm4 + movd [rdi], xmm0 +;- + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+8], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+12], xmm0 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_sad16x8x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr_base, +; int ref_stride, +; int 
*results) +global sym(vp8_sad16x8x4d_sse3) +sym(vp8_sad16x8x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_16X2X4 1 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movq xmm0, xmm4 + psrldq xmm4, 8 + + paddw xmm0, xmm4 + movd [rdi], xmm0 +;- + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+8], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+12], xmm0 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x16x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x16x4d_sse3) +sym(vp8_sad8x16x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_8X2X4 1 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movd [rdi], mm4 + movd [rdi+4], mm5 + movd [rdi+8], mm6 + movd [rdi+12], mm7 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x8x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x8x4d_sse3) +sym(vp8_sad8x8x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_8X2X4 1 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movd [rdi], mm4 + movd [rdi+4], mm5 + movd [rdi+8], mm6 + movd [rdi+12], mm7 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad4x4x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad4x4x4d_sse3) +sym(vp8_sad4x4x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD PTR [rcx] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rcx+rbp] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, QWORD PTR [rdx] + movd mm5, QWORD PTR [rbx] + + movd mm6, QWORD PTR [rdi] + movd mm2, QWORD PTR [rdx+rbp] + + movd mm3, QWORD PTR [rbx+rbp] + movd mm7, QWORD PTR [rdi+rbp] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + punpcklbw mm6, 
mm7 + psadbw mm4, mm0 + + psadbw mm5, mm0 + psadbw mm6, mm0 + + + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + movd mm0, QWORD PTR [rsi] + movd mm2, QWORD PTR [rcx] + + movd mm3, QWORD PTR [rsi+rax] + movd mm7, QWORD PTR [rcx+rbp] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm7 + + movd mm3, QWORD PTR [rdx] + movd mm7, QWORD PTR [rbx] + + psadbw mm2, mm0 + mov rax, rbp + + pop rbp + mov rsi, arg(4) ;Results + + paddw mm1, mm2 + movd [rsi], mm1 + + movd mm2, QWORD PTR [rdx+rax] + movd mm1, QWORD PTR [rbx+rax] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm1 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + movd mm2, QWORD PTR [rdi] + movd mm1, QWORD PTR [rdi+rax] + + paddw mm3, mm4 + paddw mm7, mm5 + + movd [rsi+4], mm3 + punpcklbw mm2, mm1 + + movd [rsi+8], mm7 + psadbw mm2, mm0 + + paddw mm2, mm6 + movd [rsi+12], mm2 + + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm new file mode 100644 index 000000000..1bb956121 --- /dev/null +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -0,0 +1,367 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm5, [rdi] + lddqu xmm6, [rdi+1] + lddqu xmm7, [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rdi] + lddqu xmm2, [rdi+1] + lddqu xmm3, [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rdi+rdx] + lddqu xmm2, QWORD PTR [rdi+rdx+1] + lddqu xmm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, [rsi] + movdqa xmm4, [rdi] + movdqa xmm7, [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + movdqa xmm4, [rdi] + movdqa xmm3, [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + movdqa xmm4, QWORD PTR [rdi+rdx] + movdqa xmm3, QWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 
+ PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int vp8_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x3_ssse3) +sym(vp8_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp vp8_sad16x16x3_ssse3_skiptable +vp8_sad16x16x3_ssse3_jumptable: + dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump +vp8_sad16x16x3_ssse3_skiptable: + + call vp8_sad16x16x3_ssse3_do_jump +vp8_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3 + +vp8_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +vp8_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, 
xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x8x3_ssse3) +sym(vp8_sad16x8x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp vp8_sad16x8x3_ssse3_skiptable +vp8_sad16x8x3_ssse3_jumptable: + dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump +vp8_sad16x8x3_ssse3_skiptable: + + call vp8_sad16x8x3_ssse3_do_jump +vp8_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3 + +vp8_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +vp8_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm new file mode 100644 index 
000000000..ce3e61066 --- /dev/null +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -0,0 +1,431 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, +; unsigned short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_mmx_impl) +sym(vp8_subtract_b_mmx_impl) + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi], mm0 + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2],mm0 + + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) +global sym(vp8_subtract_mby_mmx) +sym(vp8_subtract_mby_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(1) ;src + mov rdi, arg(0) ;diff + + mov rax, arg(2) ;pred + movsxd rdx, dword ptr arg(3) ;stride + + mov rcx, 16 + pxor mm0, mm0 + +submby_loop: + + movq mm1, [rsi] + movq mm3, [rax] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi], mm1 + movq [rdi+8], mm2 + + + movq mm1, [rsi+8] + movq mm3, [rax+8] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi+16], mm1 + movq [rdi+24], mm2 + + + add rdi, 32 + add rax, 16 + + lea rsi, [rsi+rdx] + + sub rcx, 1 + jnz submby_loop + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +global sym(vp8_subtract_mbuv_mmx) +sym(vp8_subtract_mbuv_mmx) + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;short *udiff = diff + 256; + ;short *vdiff = diff + 320; + ;unsigned char *upred = pred + 256; + ;unsigned char *vpred = pred + 320; + + ;unsigned char *z = usrc; + ;unsigned short *diff = udiff; + ;unsigned char *Predictor= upred; + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(1) ;z = usrc + add rdi, 256*2 ;diff = diff + 256 (shorts) + add rax, 256 ;Predictor = pred + 256 + movsxd rdx, dword ptr arg(4) ;stride; + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + 
movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + ;unsigned char *z = vsrc; + ;unsigned short *diff = vdiff; + ;unsigned char *Predictor= vpred; + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(2) ;z = usrc + add rdi, 320*2 ;diff = diff + 320 (shorts) + add rax, 320 ;Predictor = pred + 320 + movsxd rdx, dword ptr arg(4) ;stride; + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq 
[rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm new file mode 100644 index 000000000..d0da82ad4 --- /dev/null +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -0,0 +1,980 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) +global sym(vp8_get_mb_ss_mmx) +sym(vp8_get_mb_ss_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + sub rsp, 8 + ; end prolog + + mov rax, arg(0) ;src_ptr + mov rcx, 16 + pxor mm4, mm4 + +NEXTROW: + movq mm0, [rax] + movq mm1, [rax+8] + movq mm2, [rax+16] + movq mm3, [rax+24] + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + + paddd mm4, mm0 + paddd mm4, mm1 + paddd mm4, mm2 + paddd mm4, mm3 + + add rax, 32 + dec rcx + ja NEXTROW + movq QWORD PTR [rsp], mm4 + + ;return sum[0]+sum[1]; + movsxd rax, dword ptr [rsp] + movsxd rcx, dword ptr [rsp+4] + add rax, rcx + + + ; begin epilog + add rsp, 8 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get8x8var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vp8_get8x8var_mmx) +sym(vp8_get8x8var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher 
prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 5 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + ; movq mm4, [rbx + rdx] + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 6 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; 
accumulate in mm7 + + ; Row 7 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 8 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int +;vp8_get4x4var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vp8_get4x4var_mmx) +sym(vp8_get4x4var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; 
Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + paddd mm7, mm0 ; accumulate in mm7 + + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int +;vp8_get4x4sse_cs_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride +;) +global sym(vp8_get4x4sse_cs_mmx) +sym(vp8_get4x4sse_cs_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + push rbx + ; end prolog + + + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + ; Row 1 + movd mm0, [rax] ; Copy eight bytes to mm0 + movd mm1, [rbx] ; Copy eight bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 2 + movd mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 3 + movd mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm1, mm6 + punpcklbw mm0, mm6 ; unpack to higher prrcision + psubsw mm0, mm1 ; A-B (low order) to MM0 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 4 + movd mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + pmaddwd mm0, mm0 ; square and accumulate + paddd mm7, mm0 ; accumulate in mm7 + + movq mm0, mm7 ; + psrlq mm7, 
32 + + paddd mm0, mm7 + movd rax, mm0 + + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%define mmx_filter_shift 7 + +;void vp8_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil4x4_var_mmx) +sym(vp8_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + + paddw mm1, [mmx_bi_rd GLOBAL] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz filter_block2d_bil4x4_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + + +;void vp8_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil_var_mmx) +sym(vp8_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, 
mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [mmx_bi_rd GLOBAL] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [mmx_bi_rd GLOBAL] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [mmx_bi_rd GLOBAL] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm2, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz filter_block2d_bil_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_get16x16pred_error_mmx +;( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride +;) +global sym(vp8_get16x16pred_error_mmx) +sym(vp8_get16x16pred_error_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;DWORD PTR [src_ptr] + mov rdi, arg(2) ;DWORD PTR [ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + + pxor mm0, mm0 ; clear xmm0 for unpack + pxor mm7, mm7 ; clear xmm7 for accumulating diffs + + pxor mm6, mm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16loop: + + movq mm1, [rsi] + movq mm2, [rdi] + + movq mm3, mm1 + movq mm4, mm2 + + punpcklbw mm1, mm0 + punpckhbw mm3, mm0 + + punpcklbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm2 + psubw mm3, mm4 + + paddw mm7, mm1 + pmaddwd mm1, mm1 + + paddw mm7, mm3 + pmaddwd mm3, mm3 + + paddd mm6, mm1 + paddd mm6, mm3 + + + movq mm1, [rsi+8] + movq mm2, [rdi+8] + + movq mm3, mm1 + movq mm4, mm2 + + punpcklbw mm1, mm0 + punpckhbw mm3, mm0 + + punpcklbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm2 + psubw mm3, mm4 + + paddw mm7, mm1 + pmaddwd mm1, mm1 + + paddw mm7, mm3 + pmaddwd mm3, mm3 + + paddd mm6, 
mm1 + paddd mm6, mm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16loop + + + movq mm1, mm6 + pxor mm6, mm6 + + pxor mm5, mm5 + punpcklwd mm6, mm7 + + punpckhwd mm5, mm7 + psrad mm5, 16 + + psrad mm6, 16 + paddd mm6, mm5 + + movq mm2, mm1 + psrlq mm1, 32 + + paddd mm2, mm1 + movq mm7, mm6 + + psrlq mm6, 32 + paddd mm6, mm7 + + movd DWORD PTR [rsp], mm6 ;Sum + movd DWORD PTR [rsp+4], mm2 ;SSE + + ; return (SSE-((Sum*Sum)>>8)); + movsxd rdx, dword ptr [rsp] + imul rdx, rdx + sar rdx, 8 + movsxd rax, dword ptr [rsp + 4] + sub rax, rdx + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm new file mode 100644 index 000000000..7e5ee284b --- /dev/null +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -0,0 +1,975 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + +;unsigned int vp8_get_mb_ss_sse2 +;( +; short *src_ptr +;) +global sym(vp8_get_mb_ss_sse2) +sym(vp8_get_mb_ss_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 1 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + mov rax, arg(0) ;[src_ptr] + mov rcx, 8 + pxor xmm4, xmm4 + +NEXTROW: + movdqa xmm0, [rax] + movdqa xmm1, [rax+16] + movdqa xmm2, [rax+32] + movdqa xmm3, [rax+48] + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + + paddd xmm0, xmm1 + paddd xmm2, xmm3 + paddd xmm4, xmm0 + paddd xmm4, xmm2 + + add rax, 0x40 + dec rcx + ja NEXTROW + + movdqa xmm3,xmm4 + psrldq xmm4,8 + paddd xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,4 + paddd xmm4,xmm3 + movd rax,xmm4 + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp8_get16x16var_sse2) +sym(vp8_get16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + + + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + + punpcklbw xmm2, xmm0 + punpckhbw xmm4, xmm0 + + + psubw xmm1, xmm2 + psubw xmm3, xmm4 + + paddw xmm7, xmm1 + pmaddwd xmm1, xmm1 + + paddw xmm7, xmm3 + pmaddwd xmm3, xmm3 + + paddd xmm6, xmm1 + paddd xmm6, xmm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16loop + + + movdqa xmm1, xmm6 + pxor xmm6, xmm6 + + pxor xmm5, xmm5 + punpcklwd xmm6, xmm7 + + punpckhwd xmm5, xmm7 + psrad xmm5, 16 + + psrad xmm6, 16 + paddd xmm6, xmm5 + + movdqa xmm2, xmm1 + punpckldq xmm1, 
xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddd xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddd xmm7, xmm6 + paddd xmm1, xmm2 + + mov rax, arg(5) ;[Sum] + mov rdi, arg(4) ;[SSE] + + movd DWORD PTR [rax], xmm7 + movd DWORD PTR [rdi], xmm1 + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get16x16pred_error_sse2 +;( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride +;) +global sym(vp8_get16x16pred_error_sse2) +sym(vp8_get16x16pred_error_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16peloop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + + punpcklbw xmm2, xmm0 + punpckhbw xmm4, xmm0 + + psubw xmm1, xmm2 + psubw xmm3, xmm4 + + paddw xmm7, xmm1 + pmaddwd xmm1, xmm1 + + paddw xmm7, xmm3 + pmaddwd xmm3, xmm3 + + paddd xmm6, xmm1 + paddd xmm6, xmm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16peloop + + + movdqa xmm1, xmm6 + pxor xmm6, xmm6 + + pxor xmm5, xmm5 + punpcklwd xmm6, xmm7 + + punpckhwd xmm5, xmm7 + psrad xmm5, 16 + + psrad xmm6, 16 + paddd xmm6, xmm5 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddd xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddd xmm7, xmm6 + paddd xmm1, xmm2 + + movd DWORD PTR [rsp], xmm7 ;Sum + movd DWORD PTR [rsp+4], xmm1 ;SSE + + ; return (SSE-((Sum*Sum)>>8)); + movsxd rdx, dword ptr [rsp] + imul rdx, rdx + sar rdx, 8 + movsxd rax, dword ptr [rsp + 4] + sub rax, rdx + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int vp8_get8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp8_get8x8var_sse2) +sym(vp8_get8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + movq xmm1, QWORD PTR [rsi] + movq xmm2, QWORD PTR [rdi] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + psubsw xmm1, xmm2 + paddw xmm7, xmm1 + + pmaddwd xmm1, xmm1 + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + movq xmm2, QWORD PTR[rsi + rax * 2] + movq xmm3, QWORD PTR[rdi + rdx * 2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea 
rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + movq xmm2, QWORD PTR[rsi + rax *2] + movq xmm3, QWORD PTR[rdi + rdx *2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + movq xmm2, QWORD PTR[rsi + rax *2] + movq xmm3, QWORD PTR[rdi + rdx *2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + movdqa xmm6, xmm7 + punpcklwd xmm6, xmm0 + + punpckhwd xmm7, xmm0 + movdqa xmm2, xmm1 + + paddw xmm6, xmm7 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddw xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddw xmm7, xmm6 + paddd xmm1, xmm2 + + mov rax, arg(5) ;[Sum] + mov rdi, arg(4) ;[SSE] + + movd rdx, xmm7 + movsx rcx, dx + + mov dword ptr [rax], ecx + movd DWORD PTR [rdi], xmm1 + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block2d_bil_var_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared;; +; +;) +global sym(vp8_filter_block2d_bil_var_sse2) +sym(vp8_filter_block2d_bil_var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor xmm6, xmm6 ; + pxor xmm7, xmm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + + movq xmm3, QWORD PTR [rsi+1] ; + punpcklbw xmm1, xmm0 ; + + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 + ; + pmullw xmm3, [rax+16] ; + paddw xmm1, xmm3 ; + + paddw xmm1, [xmm_bi_rd GLOBAL] ; + psraw xmm1, xmm_filter_shift ; + + movdqa xmm5, xmm1 +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif +filter_block2d_bil_var_sse2_loop: + + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, [xmm_bi_rd GLOBAL] ; + + psraw xmm1, xmm_filter_shift ; + movdqa xmm3, xmm5 ; + + movdqa xmm5, xmm1 ; + pmullw xmm3, [rdx] ; + + pmullw xmm1, [rdx+16] ; + paddw xmm1, xmm3 ; + + paddw xmm1, [xmm_bi_rd GLOBAL] ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd 
xmm1, xmm1 ; + paddd xmm7, xmm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz filter_block2d_bil_var_sse2_loop ; + + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(7) ; sum + mov rdi, arg(8) ; sumsquared + + movd [rsi], mm2 ; xsum + movd [rdi], mm4 ; xxsum + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance16x_h_sse2) +sym(vp8_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source +%else + add rsi, r8 +%endif + +vp8_half_horiz_vert_variance16x_h_1: + + movq xmm1, QWORD PTR [rsi] ; + movq xmm2, QWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance16x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + 
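
The vp8_half_horiz_vert_variance16x_h_sse2 routine that ends above, like its half-horizontal and half-vertical siblings that follow, averages the reference block at a half-pixel offset with pavgb, subtracts the source eight pixels at a time, and accumulates a signed Sum and an SSE. A minimal scalar sketch of the horizontal+vertical case is given here for reference only; the function name, the fixed eight-pixel row width, and the plain int accumulators are assumptions made for this sketch and are not code from the patch. The staged (a + b + 1) >> 1 rounding mirrors pavgb rather than an exact (sum + 2) >> 2 bilinear average.

/* Scalar model (sketch only) of the Sum/SSE accumulation performed by
 * vp8_half_horiz_vert_variance16x_h_sse2; the name and the 8-pixel row
 * width are assumptions, not code taken from this patch. */
static void half_horiz_vert_sum_sse_sketch(const unsigned char *ref, int ref_stride,
                                           const unsigned char *src, int src_stride,
                                           unsigned int height,
                                           int *sum, unsigned int *sse)
{
    int s = 0;
    unsigned int ss = 0;
    unsigned int i, j;

    for (i = 0; i < height; i++)
    {
        for (j = 0; j < 8; j++)
        {
            /* horizontal half-pel average on the current and the next row,
             * rounded like pavgb: (a + b + 1) >> 1 */
            int a = (ref[j] + ref[j + 1] + 1) >> 1;
            int b = (ref[ref_stride + j] + ref[ref_stride + j + 1] + 1) >> 1;
            /* vertical average of the two horizontal averages */
            int p = (a + b + 1) >> 1;
            int d = p - src[j];

            s  += d;        /* accumulated in xmm6 by the assembly */
            ss += d * d;    /* accumulated in xmm7 by the assembly */
        }

        ref += ref_stride;
        src += src_stride;
    }

    *sum = s;
    *sse = ss;
}

The C wrappers later in this patch (see variance_mmx.c below) turn such a Sum/SSE pair into a variance as SSE - Sum*Sum/N, for example (var - ((avg * avg) >> 8)) for a 16x16 block.
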
+;void vp8_half_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance16x_h_sse2) +sym(vp8_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; +vp8_half_vert_variance16x_h_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_vert_variance16x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance16x_h_sse2) +sym(vp8_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor xmm0, xmm0 ; +vp8_half_horiz_variance16x16_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) 
;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x16_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c new file mode 100644 index 000000000..4a5b25b0d --- /dev/null +++ b/vp8/encoder/x86/variance_mmx.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "variance.h" +#include "pragmas.h" +#include "vpx_ports/mem.h" + +extern void filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *vp7_filter +); +extern void filter_block1d_v6_mmx +( + short *src_ptr, + unsigned char *output_ptr, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *vp7_filter +); + +extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr); +extern unsigned int vp8_get8x8var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern unsigned int vp8_get4x4var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern unsigned int vp8_get4x4sse_cs_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride +); +extern void vp8_filter_block2d_bil4x4_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern unsigned int vp8_get16x16pred_error_mmx +( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride +); + + +void vp8_test_get_mb_ss(void) +{ + short zz[] = + { + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, 
-4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + }; + int s = 0, x = vp8_get_mb_ss_mmx(zz); + { + int y; + + for (y = 0; y < 256; y++) + s += (zz[y] * zz[y]); + } + + x += 0; +} + + +unsigned int vp8_get16x16var_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned *SSE, + unsigned *SUM +) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + + *SSE = var; + *SUM = avg; + return (var - ((avg * avg) >> 8)); + +} + + + + + +unsigned int vp8_variance4x4_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + *sse = var; + return (var - ((avg * avg) >> 4)); + +} + +unsigned int vp8_variance8x8_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + *sse = var; + + return (var - ((avg * avg) >> 6)); + +} + +unsigned int vp8_mse16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + + +unsigned int vp8_variance16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + 
sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - ((avg * avg) >> 8)); +} + +unsigned int vp8_variance16x8_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + + +unsigned int vp8_variance8x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + + return (var - ((avg * avg) >> 7)); + +} + + + + +/////////////////////////////////////////////////////////////////////////// +// the mmx function that does the bilinear filtering and var calculation // +// int one pass // +/////////////////////////////////////////////////////////////////////////// +DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +unsigned int vp8_sub_pixel_variance4x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) + +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - 
((xsum0 * xsum0) >> 8)); + + +} + +unsigned int vp8_sub_pixel_mse16x16_mmx( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} + +unsigned int vp8_sub_pixel_variance16x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 7)); +} + +unsigned int vp8_i_variance16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - ((avg * avg) >> 8)); + +} + +unsigned int vp8_i_variance8x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + +unsigned int vp8_i_sub_pixel_variance16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + int f2soffset = (src_pixels_per_line >> 1); + int f2doffset = (dst_pixels_per_line >> 1); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, 
&xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset, src_pixels_per_line, + dst_ptr + f2doffset, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset + 8, src_pixels_per_line, + dst_ptr + f2doffset + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_i_sub_pixel_variance8x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + int f2soffset = (src_pixels_per_line >> 1); + int f2doffset = (dst_pixels_per_line >> 1); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset, src_pixels_per_line, + dst_ptr + f2doffset, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c new file mode 100644 index 000000000..ea80753bd --- /dev/null +++ b/vp8/encoder/x86/variance_sse2.c @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "variance.h" +#include "pragmas.h" +#include "vpx_ports/mem.h" + +extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +extern unsigned int vp8_get4x4var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); + +unsigned int vp8_get_mb_ss_sse2 +( + short *src_ptr +); +unsigned int vp8_get16x16var_sse2 +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +unsigned int vp8_get16x16pred_error_sse2 +( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride +); +unsigned int vp8_get8x8var_sse2 +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +void vp8_filter_block2d_bil_var_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); + +DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); + +unsigned int vp8_variance4x4_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride) +{ + unsigned int var; + int avg; + + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + return (var - ((avg * avg) >> 4)); + +} + + + +unsigned int vp8_variance8x8_wmt +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride) +{ + unsigned int var; + int avg; + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + + return (var - ((avg * avg) >> 6)); + +} + + +unsigned int vp8_variance16x16_wmt +( + unsigned char *src_ptr, + int 
source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0; + int sum0; + + + vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + *sse = sse0; + return (sse0 - ((sum0 * sum0) >> 8)); +} +unsigned int vp8_mse16x16_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + + unsigned int sse0; + int sum0; + vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + *sse = sse0; + return sse0; + +} + + +unsigned int vp8_variance16x8_wmt +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + +unsigned int vp8_variance8x16_wmt +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + +/////////////////////////////////////////////////////////////////////////// +// the mmx function that does the bilinear filtering and var calculation // +// int one pass // +/////////////////////////////////////////////////////////////////////////// +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; +unsigned int vp8_sub_pixel_variance4x4_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum, &xxsum + ); + + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_wmt +( + 
unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + // note we could avoid these if statements if the calling function + // just called the appropriate functions inside. + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum1, &xxsum1 + ); + } + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_mse16x16_wmt( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} + +unsigned int vp8_sub_pixel_variance16x8_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum, &xxsum + ); + + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 7)); +} + +unsigned int vp8_i_variance16x16_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + 
unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + + *sse = var; + return (var - ((avg * avg) >> 8)); + +} + +unsigned int vp8_i_variance8x16_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + + +unsigned int vp8_i_sub_pixel_variance16x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); +} + + +unsigned int vp8_i_sub_pixel_variance8x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h new file mode 100644 index 000000000..35fc90c48 --- /dev/null +++ b/vp8/encoder/x86/variance_x86.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef VARIANCE_X86_H +#define VARIANCE_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_sad(vp8_sad4x4_mmx); +extern prototype_sad(vp8_sad8x8_mmx); +extern prototype_sad(vp8_sad8x16_mmx); +extern prototype_sad(vp8_sad16x8_mmx); +extern prototype_sad(vp8_sad16x16_mmx); +extern prototype_variance(vp8_variance4x4_mmx); +extern prototype_variance(vp8_variance8x8_mmx); +extern prototype_variance(vp8_variance8x16_mmx); +extern prototype_variance(vp8_variance16x8_mmx); +extern prototype_variance(vp8_variance16x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx); +extern prototype_getmbss(vp8_get_mb_ss_mmx); +extern prototype_variance(vp8_mse16x16_mmx); +extern prototype_sad(vp8_get16x16pred_error_mmx); +extern prototype_variance2(vp8_get8x8var_mmx); +extern prototype_variance2(vp8_get16x16var_mmx); +extern prototype_sad(vp8_get4x4sse_cs_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_mmx + +#undef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_mmx + +#undef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_mmx + +#undef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_mmx + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_mmx + +#undef vp8_variance_var4x4 +#define vp8_variance_var4x4 vp8_variance4x4_mmx + +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_mmx + +#undef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_mmx + +#undef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_mmx + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_mmx + +#undef vp8_variance_subpixvar4x4 +#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx + +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx + +#undef vp8_variance_subpixvar8x16 +#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx + +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx + +#undef vp8_variance_subpixmse16x16 +#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx + +#undef vp8_variance_getmbss +#define vp8_variance_getmbss vp8_get_mb_ss_mmx + +#undef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_mmx + +#undef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx + +#undef vp8_variance_get8x8var +#define vp8_variance_get8x8var vp8_get8x8var_mmx + +#undef vp8_variance_get16x16var +#define vp8_variance_get16x16var vp8_get16x16var_mmx + +#undef vp8_variance_get4x4sse_cs +#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_sad(vp8_sad4x4_wmt); +extern prototype_sad(vp8_sad8x8_wmt); +extern prototype_sad(vp8_sad8x16_wmt); +extern prototype_sad(vp8_sad16x8_wmt); +extern prototype_sad(vp8_sad16x16_wmt); +extern prototype_variance(vp8_variance4x4_wmt); +extern 
prototype_variance(vp8_variance8x8_wmt); +extern prototype_variance(vp8_variance8x16_wmt); +extern prototype_variance(vp8_variance16x8_wmt); +extern prototype_variance(vp8_variance16x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt); +extern prototype_getmbss(vp8_get_mb_ss_sse2); +extern prototype_variance(vp8_mse16x16_wmt); +extern prototype_sad(vp8_get16x16pred_error_sse2); +extern prototype_variance2(vp8_get8x8var_sse2); +extern prototype_variance2(vp8_get16x16var_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_wmt + +#undef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_wmt + +#undef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_wmt + +#undef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_wmt + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_wmt + +#undef vp8_variance_var4x4 +#define vp8_variance_var4x4 vp8_variance4x4_wmt + +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_wmt + +#undef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_wmt + +#undef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_wmt + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_wmt + +#undef vp8_variance_subpixvar4x4 +#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt + +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt + +#undef vp8_variance_subpixvar8x16 +#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt + +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt + +#undef vp8_variance_subpixmse16x16 +#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt + +#undef vp8_variance_getmbss +#define vp8_variance_getmbss vp8_get_mb_ss_sse2 + +#undef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_wmt + +#undef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2 + +#undef vp8_variance_get8x8var +#define vp8_variance_get8x8var vp8_get8x8var_sse2 + +#undef vp8_variance_get16x16var +#define vp8_variance_get16x16var vp8_get16x16var_sse2 + +#endif +#endif + + +#if HAVE_SSE3 +extern prototype_sad(vp8_sad16x16_sse3); +extern prototype_sad(vp8_sad16x8_sse3); +extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3); + +extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 
vp8_sad16x16_sse3 + +#undef vp8_variance_sad16x16x3 +#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3 + +#undef vp8_variance_sad16x8x3 +#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3 + +#undef vp8_variance_sad8x16x3 +#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3 + +#undef vp8_variance_sad8x8x3 +#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3 + +#undef vp8_variance_sad4x4x3 +#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3 + +#undef vp8_variance_sad16x16x4d +#define vp8_variance_sad16x16x4 vp8_sad16x16x4d_sse3 + +#undef vp8_variance_sad16x8x4d +#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3 + +#undef vp8_variance_sad8x16x4d +#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3 + +#undef vp8_variance_sad8x8x4d +#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3 + +#undef vp8_variance_sad4x4x4d +#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3 + +#endif +#endif + + +#if HAVE_SSSE3 +extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); +extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad16x16x3 +#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3 + +#undef vp8_variance_sad16x8x3 +#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 + +#endif +#endif + +#endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c new file mode 100644 index 000000000..f1391ba8c --- /dev/null +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "vpx_ports/x86.h" +#include "variance.h" +#include "onyx_int.h" + + +#if HAVE_MMX +void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_mmx(input, output, pitch); + vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); +} + +void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch) +{ + vp8_fast_fdct4x4_mmx(input, output , pitch); + vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch); +} + +int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *scan_mask, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) +{ + short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + + d->eob = vp8_fast_quantize_b_impl_mmx( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + scan_mask, + + round_ptr, + quant_ptr, + dqcoeff_ptr + ); +} + +int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_mmx(MACROBLOCK *mb) +{ + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); +} + +void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + unsigned int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); +} + +#endif + +#if HAVE_SSE2 +void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_wmt(input, output, pitch); + vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch); +} + +int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *scan_mask, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d) +{ + short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + + d->eob = vp8_fast_quantize_b_impl_sse( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + scan_mask, + + round_ptr, + quant_ptr, + dqcoeff_ptr + ); +} + +int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_xmm(MACROBLOCK *mb) +{ + short *s_ptr = &mb->coeff[256]; + short *d_ptr = 
&mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); +} + +#endif + +void vp8_arch_x86_encoder_init(VP8_COMP *cpi) +{ +#if CONFIG_RUNTIME_CPU_DETECT + int flags = x86_simd_caps(); + int mmx_enabled = flags & HAS_MMX; + int xmm_enabled = flags & HAS_SSE; + int wmt_enabled = flags & HAS_SSE2; + int SSE3Enabled = flags & HAS_SSE3; + int SSSE3Enabled = flags & HAS_SSSE3; + + /* Note: + * + * This platform can be built without runtime CPU detection as well. If + * you modify any of the function mappings present in this file, be sure + * to also update them in static mapings (<arch>/filename_<arch>.h) + */ + + /* Override default functions with fastest ones for this CPU. */ +#if HAVE_MMX + + if (mmx_enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx; + cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; + + cpi->rtcd.encodemb.berr = vp8_block_error_mmx; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx; + cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx; + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx; + } + +#endif +#if HAVE_SSE2 + + if (wmt_enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt; + cpi->rtcd.variance.subpixmse16x16 = 
vp8_sub_pixel_mse16x16_wmt; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; + +#if 0 + /* short SSE2 DCT currently disabled, does not match the MMX version */ + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt; +#endif + /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2; + + cpi->rtcd.encodemb.berr = vp8_block_error_xmm; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; + /* cpi->rtcd.encodemb.sub* not implemented for wmt */ + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse; + } + +#endif +#if HAVE_SSE3 + + if (SSE3Enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3; + cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3; + cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3; + cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3; + cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3; + cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3; + cpi->rtcd.search.full_search = vp8_full_search_sadx3; + + cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3; + cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3; + cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; + cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3; + cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; + cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; + } + +#endif +#if HAVE_SSSE3 + + if (SSSE3Enabled) + { + cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; + cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + } + +#endif +#endif +} |
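
Every vp8_variance* and vp8_sub_pixel_variance* wrapper in the variance_mmx.c and variance_sse2.c files above follows the same pattern: an assembly helper returns a block's sum of differences and sum of squared differences, and the wrapper forms variance as SSE minus Sum*Sum/N, where N is the pixel count and the division by a power of two becomes the >>4 (4x4), >>6 (8x8), >>7 (8x16/16x8) or >>8 (16x16) shift seen in the code. The scalar sketch below only illustrates that identity; the function and parameter names (ref_block_variance, W, H) are illustrative and not part of the libvpx API, and the 64-bit intermediate is simply a guard against sum*sum exceeding 32 bits for large blocks.

/*
 * Scalar sketch (not part of libvpx) of the sum/SSE -> variance identity
 * used by the SIMD wrappers above:
 *     variance = SSE - Sum*Sum / N,   N = W*H
 */
#include <stdint.h>

static unsigned int ref_block_variance(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int W, int H, unsigned int *sse)
{
    int sum = 0;            /* signed sum of pixel differences      */
    unsigned int ssq = 0;   /* sum of squared differences (the SSE) */
    int r, c, n, lg2;

    for (r = 0; r < H; r++)
    {
        for (c = 0; c < W; c++)
        {
            int d = src[c] - ref[c];
            sum += d;
            ssq += (unsigned int)(d * d);
        }

        src += src_stride;
        ref += ref_stride;
    }

    *sse = ssq;

    /* N is a power of two, so Sum*Sum/N is a right shift by log2(N):
     * 4 for 4x4, 6 for 8x8, 7 for 8x16/16x8, 8 for 16x16. */
    for (n = W * H, lg2 = 0; n > 1; n >>= 1)
        lg2++;

    return ssq - (unsigned int)(((int64_t)sum * sum) >> lg2);
}

Under that reading, vp8_variance16x16_mmx is equivalent to calling this with W = H = 16 (a shift of 8), except that it builds the totals from four 8x8 helper calls and accumulates their sums and SSEs before the final subtraction.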