diff options
Diffstat (limited to 'vp8/decoder/arm')
-rw-r--r-- | vp8/decoder/arm/armv5/dequantize_v5.asm | 51 | ||||
-rw-r--r-- | vp8/decoder/arm/armv6/dboolhuff_v6.asm | 162 | ||||
-rw-r--r-- | vp8/decoder/arm/armv6/dequantdcidct_v6.asm | 202 | ||||
-rw-r--r-- | vp8/decoder/arm/armv6/dequantidct_v6.asm | 183 | ||||
-rw-r--r-- | vp8/decoder/arm/armv6/dequantize_v6.asm | 68 | ||||
-rw-r--r-- | vp8/decoder/arm/dboolhuff_arm.h | 49 | ||||
-rw-r--r-- | vp8/decoder/arm/dequantize_arm.c | 48 | ||||
-rw-r--r-- | vp8/decoder/arm/dequantize_arm.h | 44 | ||||
-rw-r--r-- | vp8/decoder/arm/detokenizearm_sjl.c | 730 | ||||
-rw-r--r-- | vp8/decoder/arm/detokenizearm_v6.asm | 364 | ||||
-rw-r--r-- | vp8/decoder/arm/dsystemdependent.c | 44 | ||||
-rw-r--r-- | vp8/decoder/arm/neon/dboolhuff_neon.asm | 159 | ||||
-rw-r--r-- | vp8/decoder/arm/neon/dequantdcidct_neon.asm | 133 | ||||
-rw-r--r-- | vp8/decoder/arm/neon/dequantidct_neon.asm | 128 | ||||
-rw-r--r-- | vp8/decoder/arm/neon/dequantizeb_neon.asm | 33 |
15 files changed, 2398 insertions, 0 deletions
diff --git a/vp8/decoder/arm/armv5/dequantize_v5.asm b/vp8/decoder/arm/armv5/dequantize_v5.asm new file mode 100644 index 000000000..eb3f0307c --- /dev/null +++ b/vp8/decoder/arm/armv5/dequantize_v5.asm @@ -0,0 +1,51 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_dequantize_b_armv5| + + AREA |.text|, CODE, READONLY ; name this block of code + +q RN r0 +dqc RN r1 +cnt RN r2 + +;void vp8_dequantize_b_armv5(short *Q, short *DQC) +|vp8_dequantize_b_armv5| PROC + stmdb sp!, {r4, lr} + ldr r3, [q] + ldr r4, [dqc], #8 + + mov cnt, #4 +dequant_loop + smulbb lr, r3, r4 + smultt r12, r3, r4 + + ldr r3, [q, #4] + ldr r4, [dqc, #-4] + + strh lr, [q], #2 + strh r12, [q], #2 + + smulbb lr, r3, r4 + smultt r12, r3, r4 + + subs cnt, cnt, #1 + ldrne r3, [q, #4] + ldrne r4, [dqc], #8 + + strh lr, [q], #2 + strh r12, [q], #2 + + bne dequant_loop + + ldmia sp!, {r4, pc} + ENDP ;|vp8_dequantize_b_armv5| + + END diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm new file mode 100644 index 000000000..143e33e46 --- /dev/null +++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm @@ -0,0 +1,162 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_decode_value_v6| + EXPORT |vp8dx_start_decode_v6| + EXPORT |vp8dx_stop_decode_v6| + EXPORT |vp8dx_decode_bool_v6| + + ARM + REQUIRE8 + PRESERVE8 + + INCLUDE vpx_asm_offsets.asm + +br RN r0 +prob RN r1 +bits RN r1 + AREA |.text|, CODE, READONLY ; name this block of code + +; int z = 0; +; int bit; +; for ( bit=bits-1; bit>=0; bit-- ) +; { +; z |= (vp8dx_decode_bool(br, 0x80)<<bit); +; } +; return z; + +;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits ) +|vp8_decode_value_v6| PROC + stmdb sp!, {r4 - r6, lr} + mov r4, br + mov r5, bits + mov r6, #0 + + subs r5, r5, #1 + bmi decode_value_exit + +decode_value_loop + mov prob, #0x80 + mov br, r4 + bl vp8dx_decode_bool_v6_internal ; needed for conversion to s file + orr r6, r6, r0, lsl r5 + subs r5, r5, #1 + bpl decode_value_loop + +decode_value_exit + mov r0, r6 + ldmia sp!, {r4 - r6, pc} + ENDP ; |vp8_decode_value_v6| + + +;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source ) +|vp8dx_start_decode_v6| PROC + stmdb sp!, {r4 - r5, lr} + mov r2, #0 + mov r3, #255 + + str r2, [br, #bool_decoder_lowvalue] + str r3, [br, #bool_decoder_range] + str r1, [br, #bool_decoder_buffer] + + mov r3, #8 + mov r2, #4 + str r3, [br, #bool_decoder_count] + str r2, [br, #bool_decoder_pos] + + ldrb r2, [r1, #3] + ldrb r3, [r1, #2] + ldrb r4, [r1, #1] + ldrb r5, [r1] + + orr r1, r2, r3, lsl #8 + orr r1, r1, r4, lsl #16 + orr r1, r1, r5, lsl #24 + + str r1, [br, #bool_decoder_value] + + ldmia sp!, {r4 - r5, pc} + ENDP ; |vp8dx_start_decode_v6| + + +;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc ); +|vp8dx_stop_decode_v6| PROC + mov pc, lr + ENDP ; |vp8dx_stop_decode_v6| + + +; bigsplit RN r1 +; buffer_v RN r1 +; count_v RN r4 +; range_v RN r2 +; value_v RN r3 +; pos_v RN r5 +; split RN r6 +; bit RN lr +;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability ) +|vp8dx_decode_bool_v6| PROC +vp8dx_decode_bool_v6_internal + stmdb sp!, {r4 - r6, lr} + + ldr r2, [br, #bool_decoder_range] + ldr r3, [br, 
#bool_decoder_value] + + mov r6, r2, lsl #8 + sub r6, r6, #256 ; split = 1 + (((range-1) * probability) >> 8) + mov r12, #1 + smlawb r6, r6, prob, r12 + + mov lr, #0 + subs r5, r3, r6, lsl #24 + + ;cmp r3, r1 + movhs lr, #1 + movhs r3, r5 + subhs r2, r2, r6 + movlo r2, r6 + + cmp r2, #0x80 + blt range_less_0x80 + ;strd r2, r3, [br, #bool_decoder_range] + str r2, [br, #bool_decoder_range] + str r3, [br, #bool_decoder_value] + mov r0, lr + ldmia sp!, {r4 - r6, pc} + +range_less_0x80 + ldr r5, [br, #bool_decoder_pos] + ldr r1, [br, #bool_decoder_buffer] + ldr r4, [br, #bool_decoder_count] + add r1, r1, r5 + + clz r12, r2 + sub r12, r12, #24 + subs r4, r4, r12 + ldrleb r6, [r1], #1 + mov r2, r2, lsl r12 + mov r3, r3, lsl r12 + addle r4, r4, #8 + rsble r12, r4, #8 + addle r5, r5, #1 + orrle r3, r3, r6, lsl r12 + + ;strd r2, r3, [br, #bool_decoder_range] + ;strd r4, r5, [br, #bool_decoder_count] + str r2, [br, #bool_decoder_range] + str r3, [br, #bool_decoder_value] + str r4, [br, #bool_decoder_count] + str r5, [br, #bool_decoder_pos] + + mov r0, lr + + ldmia sp!, {r4 - r6, pc} + ENDP ; |vp8dx_decode_bool_v6| + + END diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm new file mode 100644 index 000000000..3daa9b34f --- /dev/null +++ b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_dequant_dc_idct_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code +;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc) +|vp8_dequant_dc_idct_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r6, [sp, #36] ;load Dc + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r0, [sp] + + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + mov r12, #3 + +dequant_dc_idct_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne dequant_dc_idct_loop + + sub r0, r0, #32 + mov r1, r2 + mov r2, r3 + +; short_idct4x4llm_v6_dual + + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + mov r5, #0x2 ; i=2 i +loop1_dual_11 + ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 + ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 + ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 + + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s + pkhbt r7, r7, r9, lsl #16 ; 5c | 4c + smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + pkhbt r8, r8, r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s + smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s + subs r5, r5, #0x1 ; i-- -- + pkhbt r9, r9, r11, lsl 
#16 ; 13c | 12c + ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + usub16 r7, r8, r7 ; c c + uadd16 r6, r6, r10 ; d d + uadd16 r10, r11, r14 ; a a + usub16 r8, r11, r14 ; b b + uadd16 r9, r10, r6 ; a+d a+d + usub16 r10, r10, r6 ; a-d a-d + uadd16 r6, r8, r7 ; b+c b+c + usub16 r7, r8, r7 ; b-c b-c + str r6, [r1, r2] ; o5 | o4 + add r6, r2, r2 ; pitch * 2 p2 + str r7, [r1, r6] ; o9 | o8 + add r6, r6, r2 ; pitch * 3 p3 + str r10, [r1, r6] ; o13 | o12 + str r9, [r1], #0x4 ; o1 | o0 ++ + bne loop1_dual_11 ; + mov r5, #0x2 ; i=2 i + sub r0, r1, #8 ; reset input/output i/o +loop2_dual_22 + ldr r6, [r0, r2] ; i5 | i4 5|4 + ldr r1, [r0] ; i1 | i0 1|0 + ldr r12, [r0, #0x4] ; i3 | i2 3|2 + add r14, r2, #0x4 ; pitch + 2 p+2 + ldr r14, [r0, r14] ; i7 | i6 7|6 + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s + pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 + pkhbt r7, r9, r7, lsl #16 ; 1c | 5c + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1 + pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 + uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 + uadd16 r10, r11, r9 ; a a + usub16 r9, r11, r9 ; b b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 + subs r5, r5, #0x1 ; i-- -- + smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s + smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s + + pkhbt r7, r12, r7, lsl #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 + usub16 r12, r8, r6 ; c (o1 | o5) c + uadd16 r6, r11, r1 ; d (o3 | o7) d + uadd16 r7, r10, r6 ; a+d a+d + mov r8, #0x4 ; set up 4's 4 + orr r8, r8, #0x40000 ; 4|4 + usub16 
r6, r10, r6 ; a-d a-d + uadd16 r6, r6, r8 ; a-d+4 3|7 + uadd16 r7, r7, r8 ; a+d+4 0|4 + uadd16 r10, r9, r12 ; b+c b+c + usub16 r1, r9, r12 ; b-c b-c + uadd16 r10, r10, r8 ; b+c+4 1|5 + uadd16 r1, r1, r8 ; b-c+4 2|6 + mov r8, r10, asr #19 ; o1 >> 3 + strh r8, [r0, #2] ; o1 + mov r8, r1, asr #19 ; o2 >> 3 + strh r8, [r0, #4] ; o2 + mov r8, r6, asr #19 ; o3 >> 3 + strh r8, [r0, #6] ; o3 + mov r8, r7, asr #19 ; o0 >> 3 + strh r8, [r0], r2 ; o0 +p + sxth r10, r10 ; + mov r8, r10, asr #3 ; o5 >> 3 + strh r8, [r0, #2] ; o5 + sxth r1, r1 ; + mov r8, r1, asr #3 ; o6 >> 3 + strh r8, [r0, #4] ; o6 + sxth r6, r6 ; + mov r8, r6, asr #3 ; o7 >> 3 + strh r8, [r0, #6] ; o7 + sxth r7, r7 ; + mov r8, r7, asr #3 ; o4 >> 3 + strh r8, [r0], r2 ; o4 +p +;;;;; subs r5, r5, #0x1 ; i-- -- + bne loop2_dual_22 ; + + +;vpx_memset + ldr r0, [sp] + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + + ENDP ;|vp8_dequant_dc_idct_v6| + + END diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm new file mode 100644 index 000000000..61bb48d04 --- /dev/null +++ b/vp8/decoder/arm/armv6/dequantidct_v6.asm @@ -0,0 +1,183 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_dequant_idct_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code +;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch) +|vp8_dequant_idct_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r0, [sp] + + mov r12, #4 + +dequant_idct_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne dequant_idct_loop + + sub r0, r0, #32 + mov r1, r2 + mov r2, r3 + +; short_idct4x4llm_v6_dual + + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + mov r5, #0x2 ; i=2 i +loop1_dual_1 + ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 + ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 + ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 + + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s + pkhbt r7, r7, r9, lsl #16 ; 5c | 4c + smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + pkhbt r8, r8, r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s + smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s + subs r5, r5, #0x1 ; i-- -- + pkhbt r9, r9, r11, lsl #16 ; 13c | 12c + ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + usub16 r7, r8, r7 ; c c + uadd16 r6, r6, r10 ; d d + uadd16 r10, r11, r14 ; a a + usub16 r8, r11, r14 ; b b + uadd16 r9, r10, r6 ; a+d a+d + usub16 r10, r10, r6 ; 
a-d a-d + uadd16 r6, r8, r7 ; b+c b+c + usub16 r7, r8, r7 ; b-c b-c + str r6, [r1, r2] ; o5 | o4 + add r6, r2, r2 ; pitch * 2 p2 + str r7, [r1, r6] ; o9 | o8 + add r6, r6, r2 ; pitch * 3 p3 + str r10, [r1, r6] ; o13 | o12 + str r9, [r1], #0x4 ; o1 | o0 ++ + bne loop1_dual_1 ; + mov r5, #0x2 ; i=2 i + sub r0, r1, #8 ; reset input/output i/o +loop2_dual_2 + ldr r6, [r0, r2] ; i5 | i4 5|4 + ldr r1, [r0] ; i1 | i0 1|0 + ldr r12, [r0, #0x4] ; i3 | i2 3|2 + add r14, r2, #0x4 ; pitch + 2 p+2 + ldr r14, [r0, r14] ; i7 | i6 7|6 + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s + pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 + pkhbt r7, r9, r7, lsl #16 ; 1c | 5c + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1 + pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 + uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 + uadd16 r10, r11, r9 ; a a + usub16 r9, r11, r9 ; b b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 + subs r5, r5, #0x1 ; i-- -- + smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s + smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s + + pkhbt r7, r12, r7, lsl #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 + usub16 r12, r8, r6 ; c (o1 | o5) c + uadd16 r6, r11, r1 ; d (o3 | o7) d + uadd16 r7, r10, r6 ; a+d a+d + mov r8, #0x4 ; set up 4's 4 + orr r8, r8, #0x40000 ; 4|4 + usub16 r6, r10, r6 ; a-d a-d + uadd16 r6, r6, r8 ; a-d+4 3|7 + uadd16 r7, r7, r8 ; a+d+4 0|4 + uadd16 r10, r9, r12 ; b+c b+c + usub16 r1, r9, r12 ; b-c b-c + uadd16 r10, r10, r8 ; b+c+4 1|5 + uadd16 r1, r1, r8 ; b-c+4 2|6 + mov r8, r10, asr #19 ; o1 >> 3 + strh r8, [r0, #2] ; o1 + mov r8, r1, asr #19 ; o2 >> 
3 + strh r8, [r0, #4] ; o2 + mov r8, r6, asr #19 ; o3 >> 3 + strh r8, [r0, #6] ; o3 + mov r8, r7, asr #19 ; o0 >> 3 + strh r8, [r0], r2 ; o0 +p + sxth r10, r10 ; + mov r8, r10, asr #3 ; o5 >> 3 + strh r8, [r0, #2] ; o5 + sxth r1, r1 ; + mov r8, r1, asr #3 ; o6 >> 3 + strh r8, [r0, #4] ; o6 + sxth r6, r6 ; + mov r8, r6, asr #3 ; o7 >> 3 + strh r8, [r0, #6] ; o7 + sxth r7, r7 ; + mov r8, r7, asr #3 ; o4 >> 3 + strh r8, [r0], r2 ; o4 +p +;;;;; subs r5, r5, #0x1 ; i-- -- + bne loop2_dual_2 ; + ; + +;vpx_memset + ldr r0, [sp] + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + + ENDP ;|vp8_dequant_idct_v6| + + END diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/decoder/arm/armv6/dequantize_v6.asm new file mode 100644 index 000000000..95e38594f --- /dev/null +++ b/vp8/decoder/arm/armv6/dequantize_v6.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_dequantize_b_loop_v6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------- +;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +|vp8_dequantize_b_loop_v6| PROC + stmdb sp!, {r4-r9, lr} + + ldr r3, [r0] ;load Q + ldr r4, [r1] ;load DQC + ldr r5, [r0, #4] + ldr r6, [r1, #4] + + mov r12, #2 ;loop counter + +dequant_loop + smulbb r7, r3, r4 ;multiply + smultt r8, r3, r4 + smulbb r9, r5, r6 + smultt lr, r5, r6 + + ldr r3, [r0, #8] + ldr r4, [r1, #8] + ldr r5, [r0, #12] + ldr r6, [r1, #12] + + strh r7, [r2], #2 ;store result + smulbb r7, r3, r4 ;multiply + strh r8, [r2], #2 + smultt r8, r3, r4 + strh r9, [r2], #2 + smulbb r9, r5, r6 + strh lr, [r2], #2 + smultt lr, r5, r6 + + subs r12, r12, #1 + + add r0, r0, #16 + add r1, r1, #16 + + ldrne r3, [r0] + strh r7, [r2], #2 ;store result + ldrne r4, [r1] + strh r8, [r2], #2 + ldrne r5, [r0, #4] + strh r9, [r2], #2 + ldrne r6, [r1, #4] + strh lr, [r2], #2 + + bne dequant_loop + + ldmia sp!, {r4-r9, pc} + ENDP ;|vp8_dequantize_b_loop_v6| + + END diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h new file mode 100644 index 000000000..495004f9c --- /dev/null +++ b/vp8/decoder/arm/dboolhuff_arm.h @@ -0,0 +1,49 @@ +#ifndef DBOOLHUFF_ARM_H +#define DBOOLHUFF_ARM_H + +/* JLK + * There are currently no arm-optimized versions of + * these functions. As they are implemented, they + * can be uncommented below and added to + * arm/dsystemdependent.c + * + * The existing asm code is likely so different as + * to be useless. However, it's been left (for now) + * for reference. 
+ */ +/* +#if HAVE_ARMV6 +#undef vp8_dbool_start +#define vp8_dbool_start vp8dx_start_decode_v6 + +#undef vp8_dbool_stop +#define vp8_dbool_stop vp8dx_stop_decode_v6 + +#undef vp8_dbool_fill +#define vp8_dbool_fill vp8_bool_decoder_fill_v6 + +#undef vp8_dbool_debool +#define vp8_dbool_debool vp8_decode_bool_v6 + +#undef vp8_dbool_devalue +#define vp8_dbool_devalue vp8_decode_value_v6 +#endif // HAVE_ARMV6 + +#if HAVE_ARMV7 +#undef vp8_dbool_start +#define vp8_dbool_start vp8dx_start_decode_neon + +#undef vp8_dbool_stop +#define vp8_dbool_stop vp8dx_stop_decode_neon + +#undef vp8_dbool_fill +#define vp8_dbool_fill vp8_bool_decoder_fill_neon + +#undef vp8_dbool_debool +#define vp8_dbool_debool vp8_decode_bool_neon + +#undef vp8_dbool_devalue +#define vp8_dbool_devalue vp8_decode_value_neon +#endif // HAVE_ARMV7 +*/ +#endif // DBOOLHUFF_ARM_H diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c new file mode 100644 index 000000000..54006a921 --- /dev/null +++ b/vp8/decoder/arm/dequantize_arm.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "dequantize.h" +#include "predictdc.h" +#include "idct.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_ARMV7 +extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ); +#endif + +#if HAVE_ARMV6 +extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); +#endif + +#if HAVE_ARMV7 + +void vp8_dequantize_b_neon(BLOCKD *d) +{ + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + short *DQC = &d->dequant[0][0]; + + vp8_dequantize_b_loop_neon(Q, DQC, DQ); +} +#endif + +#if HAVE_ARMV6 +void vp8_dequantize_b_v6(BLOCKD *d) +{ + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + short *DQC = &d->dequant[0][0]; + + vp8_dequantize_b_loop_v6(Q, DQC, DQ); +} +#endif diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h new file mode 100644 index 000000000..c8a61a4a7 --- /dev/null +++ b/vp8/decoder/arm/dequantize_arm.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef DEQUANTIZE_ARM_H +#define DEQUANTIZE_ARM_H + +#if HAVE_ARMV6 +extern prototype_dequant_block(vp8_dequantize_b_v6); +extern prototype_dequant_idct(vp8_dequant_idct_v6); +extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6); + +#undef vp8_dequant_block +#define vp8_dequant_block vp8_dequantize_b_v6 + +#undef vp8_dequant_idct +#define vp8_dequant_idct vp8_dequant_idct_v6 + +#undef vp8_dequant_idct_dc +#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6 +#endif + +#if HAVE_ARMV7 +extern prototype_dequant_block(vp8_dequantize_b_neon); +extern prototype_dequant_idct(vp8_dequant_idct_neon); +extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon); + +#undef vp8_dequant_block +#define vp8_dequant_block vp8_dequantize_b_neon + +#undef vp8_dequant_idct +#define vp8_dequant_idct vp8_dequant_idct_neon + +#undef vp8_dequant_idct_dc +#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon +#endif + +#endif diff --git a/vp8/decoder/arm/detokenizearm_sjl.c b/vp8/decoder/arm/detokenizearm_sjl.c new file mode 100644 index 000000000..c714452a6 --- /dev/null +++ b/vp8/decoder/arm/detokenizearm_sjl.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "type_aliases.h" +#include "blockd.h" +#include "onyxd_int.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#define BR_COUNT 8 +#define BOOL_DATA UINT8 + +#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES +//ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X}; +DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X}; + +#define EOB_CONTEXT_NODE 0 +#define ZERO_CONTEXT_NODE 1 +#define ONE_CONTEXT_NODE 2 +#define LOW_VAL_CONTEXT_NODE 3 +#define TWO_CONTEXT_NODE 4 +#define THREE_CONTEXT_NODE 5 +#define HIGH_LOW_CONTEXT_NODE 6 +#define CAT_ONE_CONTEXT_NODE 7 +#define CAT_THREEFOUR_CONTEXT_NODE 8 +#define CAT_THREE_CONTEXT_NODE 9 +#define CAT_FIVE_CONTEXT_NODE 10 + + + + +DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) = +{ + { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ZERO_TOKEN + { 1, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ONE_TOKEN + { 2, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //TWO_TOKEN + { 3, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //THREE_TOKEN + { 4, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //FOUR_TOKEN + { 5, 0, { 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY1 + { 7, 1, { 145, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY2 + { 11, 2, { 140, 148, 173, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY3 + { 19, 3, { 135, 140, 155, 176, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY4 + { 35, 4, { 130, 134, 141, 157, 180, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY5 + { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 } }, //DCT_VAL_CATEGORY6 + { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, // EOB TOKEN +}; + +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above +}; + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +void vp8_reset_mb_tokens_context(MACROBLOCKD *x) +{ + ENTROPY_CONTEXT **const A = x->above_context; + ENTROPY_CONTEXT(* const L)[4] = x->left_context; + + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int i; + + for (i = 0; i < 24; i++) + { + + a = A[ vp8_block2context[i] ] + vp8_block2above[i]; + l = L[ vp8_block2context[i] ] + vp8_block2left[i]; + + *a = *l = 0; + } + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + { + a = A[Y2CONTEXT] + vp8_block2above[24]; + l = L[Y2CONTEXT] + vp8_block2left[24]; + *a = *l = 0; + } + + +} + +#define ONYXBLOCK2CONTEXT_OFFSET 0 +#define ONYXBLOCK2LEFT_OFFSET 25 +#define ONYXBLOCK2ABOVE_OFFSET 50 + +DECLARE_ALIGNED(16, const static unsigned char, norm[128]) = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +void init_detokenizer(VP8D_COMP *dx) +{ + const 
VP8_COMMON *const oc = & dx->common; + MACROBLOCKD *x = & dx->mb; + + dx->detoken.norm_ptr = (unsigned char *)norm; + dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree; + dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove; + dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x; + dx->detoken.scan = (int *)vp8_default_zig_zag1d; + dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2; + + dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; + + + dx->detoken.coef_probs[0] = (unsigned char *)(oc->fc.coef_probs [0] [ 0 ] [0]); + dx->detoken.coef_probs[1] = (unsigned char *)(oc->fc.coef_probs [1] [ 0 ] [0]); + dx->detoken.coef_probs[2] = (unsigned char *)(oc->fc.coef_probs [2] [ 0 ] [0]); + dx->detoken.coef_probs[3] = (unsigned char *)(oc->fc.coef_probs [3] [ 0 ] [0]); + +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + + +//shift = norm[range]; \ +// shift = norm_ptr[range]; \ + +#define NORMALIZE \ + /*if(range < 0x80)*/ \ + { \ + shift = detoken->norm_ptr[range]; \ + range <<= shift; \ + value <<= shift; \ + count -= shift; \ + if(count <= 0) \ + { \ + count += BR_COUNT ; \ + value |= (*bufptr) << (BR_COUNT-count); \ + bufptr++; \ + } \ + } +#if 1 +#define DECODE_AND_APPLYSIGN(value_to_sign) \ + split = (range + 1) >> 1; \ + if ( (value >> 24) < split ) \ + { \ + range = split; \ + v= value_to_sign; \ + } \ + else \ + { \ + range = range-split; \ + value = value-(split<<24); \ + v = -value_to_sign; \ + } \ + range +=range; \ + value +=value; \ + if (!--count) \ + { \ + count = BR_COUNT; \ + value |= *bufptr; \ + bufptr++; \ + } + +#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \ + { \ + split = 1 + ((( probability*(range-1) ) )>> 8); \ + if ( (value >> 24) < split ) \ + { \ + range = split; \ + NORMALIZE \ + goto branch; \ + } \ + value -= (split<<24); \ + range = range - split; \ + NORMALIZE \ + } + 
+#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \ + { \ + split = 1 + ((( probability*(range-1) ) ) >> 8); \ + if ( (value >> 24) < split ) \ + { \ + range = split; \ + NORMALIZE \ + Prob = coef_probs; \ + ++c; \ + Prob += vp8_coef_bands_x[c]; \ + goto branch; \ + } \ + value -= (split<<24); \ + range = range - split; \ + NORMALIZE \ + } + +#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \ + DECODE_AND_APPLYSIGN(val) \ + Prob = coef_probs + (ENTROPY_NODES*2); \ + if(c < 15){\ + qcoeff_ptr [ scan[c] ] = (INT16) v; \ + ++c; \ + goto DO_WHILE; }\ + qcoeff_ptr [ scan[15] ] = (INT16) v; \ + goto BLOCK_FINISHED; + + +#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\ + split = 1 + (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \ + if(value >= (split<<24))\ + {\ + range = range-split;\ + value = value-(split<<24);\ + val += ((UINT16)1<<bits_count);\ + }\ + else\ + {\ + range = split;\ + }\ + NORMALIZE +#endif + +#if 0 +int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) +{ + ENTROPY_CONTEXT **const A = x->above_context; + ENTROPY_CONTEXT(* const L)[4] = x->left_context; + const VP8_COMMON *const oc = & dx->common; + + BOOL_DECODER *bc = x->current_bc; + + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int i; + + int eobtotal = 0; + + register int count; + + BOOL_DATA *bufptr; + register unsigned int range; + register unsigned int value; + const int *scan; + register unsigned int shift; + UINT32 split; + INT16 *qcoeff_ptr; + + UINT8 *coef_probs; + int type; + int stop; + INT16 val, bits_count; + INT16 c; + INT16 t; + INT16 v; + vp8_prob *Prob; + + //int *scan; + type = 3; + i = 0; + stop = 16; + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + { + i = 24; + stop = 24; + type = 1; + qcoeff_ptr = &x->qcoeff[24*16]; + scan = vp8_default_zig_zag1d; + eobtotal -= 16; + } + else + { + scan = vp8_default_zig_zag1d; + qcoeff_ptr = &x->qcoeff[0]; + } + + count = bc->count; + range = bc->range; + value = bc->value; + bufptr = 
&bc->buffer[bc->pos]; + + + coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); + +BLOCK_LOOP: + a = A[ vp8_block2context[i] ] + vp8_block2above[i]; + l = L[ vp8_block2context[i] ] + vp8_block2left[i]; + c = (INT16)(!type); + + VP8_COMBINEENTROPYCONTEXTS(t, *a, *l); + Prob = coef_probs; + Prob += t * ENTROPY_NODES; + +DO_WHILE: + Prob += vp8_coef_bands_x[c]; + DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED); + +CHECK_0_: + DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_); + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val; + bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length; + + do + { + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count); + bits_count -- ; + } + while (bits_count >= 0); + + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +CAT_FIVE_CONTEXT_NODE_0_: + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val; + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +CAT_THREEFOUR_CONTEXT_NODE_0_: + DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_); + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val; + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1); 
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +CAT_THREE_CONTEXT_NODE_0_: + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val; + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +HIGH_LOW_CONTEXT_NODE_0_: + DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_); + + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val; + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1); + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +CAT_ONE_CONTEXT_NODE_0_: + val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val; + DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); + +LOW_VAL_CONTEXT_NODE_0_: + DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_); + DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_); + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4); + +THREE_CONTEXT_NODE_0_: + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3); + +TWO_CONTEXT_NODE_0_: + DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2); + +ONE_CONTEXT_NODE_0_: + DECODE_AND_APPLYSIGN(1); + Prob = coef_probs + ENTROPY_NODES; + + if (c < 15) + { + qcoeff_ptr [ scan[c] ] = (INT16) v; + ++c; + goto DO_WHILE; + } + + qcoeff_ptr [ scan[15] ] = (INT16) v; +BLOCK_FINISHED: + t = ((x->Block[i].eob = c) != !type); // any nonzero data? 
+ eobtotal += x->Block[i].eob; + *a = *l = t; + qcoeff_ptr += 16; + + i++; + + if (i < stop) + goto BLOCK_LOOP; + + if (i == 25) + { + scan = vp8_default_zig_zag1d;//x->scan_order1d; + type = 0; + i = 0; + stop = 16; + coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); + qcoeff_ptr = &x->qcoeff[0]; + goto BLOCK_LOOP; + } + + if (i == 16) + { + type = 2; + coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); + stop = 24; + goto BLOCK_LOOP; + } + + bc->count = count; + bc->value = value; + bc->range = range; + bc->pos = bufptr - bc->buffer; + return eobtotal; + +} +//#endif +#else +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +#if 0 +//uses relative offsets + +const vp8_tree_index vp8_coef_tree_x[ 22] = /* corresponding _CONTEXT_NODEs */ +{ + -DCT_EOB_TOKEN, 1, /* 0 = EOB */ + -ZERO_TOKEN, 1, /* 1 = ZERO */ + -ONE_TOKEN, 1, /* 2 = ONE */ + 2, 5, /* 3 = LOW_VAL */ + -TWO_TOKEN, 1, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 2, 3, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 2, 3, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; +#endif + +#define _SCALEDOWN 8 //16 //8 + +int vp8_decode_mb_tokens_v5(DETOK *detoken, int type); + +int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type) +{ + BOOL_DECODER *bc = detoken->current_bc; + + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int i; + + register int count; + + BOOL_DATA *bufptr; + register unsigned int range; + register unsigned int value; + register unsigned int shift; + UINT32 split; + INT16 *qcoeff_ptr; + + UINT8 *coef_probs; +// int type; + int stop; + INT16 c; + INT16 t; + INT16 v; + 
vp8_prob *Prob; + + + +// type = 3; + i = 0; + stop = 16; + qcoeff_ptr = detoken->qcoeff_start_ptr; + +// if( detoken->mode != B_PRED && detoken->mode != SPLITMV) + if (type == 1) + { + i += 24; + stop += 8; //24; +// type = 1; + qcoeff_ptr += 24 * 16; +// eobtotal-=16; + } + + count = bc->count; + range = bc->range; + value = bc->value; + bufptr = &bc->buffer[bc->pos]; + + + coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); + +BLOCK_LOOP: + a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ]; + l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ]; + c = !type; + a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET]; + l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET]; + + //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + //Dest = ((A)!=0) + ((B)!=0); + + VP8_COMBINEENTROPYCONTEXTS(t, *a, *l); + + Prob = coef_probs; + Prob += t * ENTROPY_NODES; + t = 0; + + do + { + + { +// onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x; + + Prob += detoken->ptr_onyx_coef_bands_x[c]; + + GET_TOKEN_START: + + do + { + split = 1 + (((range - 1) * (Prob[t>>1])) >> 8); + + if (value >> 24 >= split) + { + range = range - split; + value = value - (split << 24); + t += 1; + + //used to eliminate else branch + split = range; + } + + range = split; + + t = detoken->vp8_coef_tree_ptr[ t ]; + + NORMALIZE + + } + while (t > 0) ; + } + GET_TOKEN_STOP: + + if (t == -DCT_EOB_TOKEN) + { + break; + } + + v = -t; + + if (v > FOUR_TOKEN) + { + INT16 bits_count; + TOKENEXTRABITS *teb_ptr; + +// teb_ptr = &onyxd_token_extra_bits2[t]; +// teb_ptr = &onyxd_token_extra_bits2[v]; + teb_ptr = &detoken->teb_base_ptr[v]; + + + v = teb_ptr->min_val; + bits_count = teb_ptr->Length; + + do + { + split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN); + + if ((value >> 24) >= split) + { + range = range - split; + value = value - (split << 24); + v += ((UINT16)1 << 
bits_count); + + //used to eliminate else branch + split = range; + } + + range = split; + + NORMALIZE + + bits_count -- ; + } + while (bits_count >= 0); + } + + Prob = coef_probs; + + if (t) + { + split = 1 + (((range - 1) * vp8_prob_half) >> 8); + + if ((value >> 24) >= split) + { + range = range - split; + value = value - (split << 24); + v = (v ^ -1) + 1; /* negate w/out conditionals */ + + //used to eliminate else branch + split = range; + } + + range = split; + + NORMALIZE + Prob += ENTROPY_NODES; + + if (t < -ONE_TOKEN) + Prob += ENTROPY_NODES; + + t = -2; + } + + //if t is zero, we will skip the eob table check + t += 2; + qcoeff_ptr [detoken->scan [c] ] = (INT16) v; + + } + while (++c < 16); + + if (t != -DCT_EOB_TOKEN) + { + --c; + } + + t = ((detoken->eob[i] = c) != !type); // any nonzero data? +// eobtotal += detoken->eob[i]; + *a = *l = t; + qcoeff_ptr += 16; + + i++; + + if (i < stop) + goto BLOCK_LOOP; + + if (i == 25) + { + type = 0; + i = 0; + stop = 16; +// coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); + coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); + qcoeff_ptr = detoken->qcoeff_start_ptr; + goto BLOCK_LOOP; + } + + if (i == 16) + { + type = 2; +// coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); + coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); + stop = 24; + goto BLOCK_LOOP; + } + + bc->count = count; + bc->value = value; + bc->range = range; + bc->pos = bufptr - bc->buffer; + return 0; +} +//#if 0 +int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) +{ +// const ONYX_COMMON * const oc = & dx->common; + int eobtotal = 0; + int i, type; + /* + dx->detoken.norm_ptr = norm; + dx->detoken.onyx_coef_tree_ptr = onyx_coef_tree; + dx->detoken.ptr_onyxblock2context_leftabove = ONYXBLOCK2CONTEXT_LEFTABOVE; + dx->detoken.ptr_onyx_coef_bands_x = onyx_coef_bands_x; + dx->detoken.scan = default_zig_zag1d; + 
dx->detoken.teb_base_ptr = onyxd_token_extra_bits2; + + dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; + + dx->detoken.A = x->above_context; + dx->detoken.L = x->left_context; + + dx->detoken.coef_probs[0] = (unsigned char *)( oc->fc.coef_probs [0] [ 0 ] [0]); + dx->detoken.coef_probs[1] = (unsigned char *)( oc->fc.coef_probs [1] [ 0 ] [0]); + dx->detoken.coef_probs[2] = (unsigned char *)( oc->fc.coef_probs [2] [ 0 ] [0]); + dx->detoken.coef_probs[3] = (unsigned char *)( oc->fc.coef_probs [3] [ 0 ] [0]); + */ + + dx->detoken.current_bc = x->current_bc; + dx->detoken.A = x->above_context; + dx->detoken.L = x->left_context; + + type = 3; + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + { + type = 1; + eobtotal -= 16; + } + + vp8_decode_mb_tokens_v5(&dx->detoken, type); + + for (i = 0; i < 25; i++) + { + x->Block[i].eob = dx->detoken.eob[i]; + eobtotal += dx->detoken.eob[i]; + } + + return eobtotal; +} +#endif diff --git a/vp8/decoder/arm/detokenizearm_v6.asm b/vp8/decoder/arm/detokenizearm_v6.asm new file mode 100644 index 000000000..4d87ee5bd --- /dev/null +++ b/vp8/decoder/arm/detokenizearm_v6.asm @@ -0,0 +1,364 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_decode_mb_tokens_v5| + + AREA |.text|, CODE, READONLY ; name this block of code + + INCLUDE vpx_asm_offsets.asm + +;; byte offsets of the locals kept in the l_stacksize-byte frame reserved in the prologue +l_qcoeff EQU 0 +l_i EQU 4 +l_type EQU 8 +l_stop EQU 12 +l_c EQU 16 +l_l_ptr EQU 20 +l_a_ptr EQU 24 +l_bc EQU 28 +l_coef_ptr EQU 32 +l_stacksize EQU 64 + + +;; constant offsets -- these should be created at build time +c_onyxblock2left_offset EQU 25 +c_onyxblock2above_offset EQU 50 +c_entropy_nodes EQU 11 +c_dct_eob_token EQU 11 + +;int vp8_decode_mb_tokens_v5(DETOK *detoken, int type) +; r0 DETOK *detoken +; r1 int type +; Walks the blocks selected by type, decodes each block's coefficient tokens into the qcoeff buffer, +; stores the block's eob at detok_eob and always returns 0. +; Bool decoder state lives in r4 (value), r5 (count), r6 (range) and r8 (read pointer); it is written back at ln1_decode_mb_to. +|vp8_decode_mb_tokens_v5| PROC + stmdb sp!, {r4 - r11, lr} + sub sp, sp, #l_stacksize + mov r7, r1 ;int type + mov r9, r0 ;DETOK *detoken + + ldr r1, [r9, #detok_current_bc] + ldr r0, [r9, #detok_qcoeff_start_ptr] + mov r11, #0 ;i = 0 + mov r3, #0x10 ;stop = 16 + + cmp r7, #1 ;if (type == 1) + addeq r11, r11, #24 ;i = 24 + addeq r3, r3, #8 ;stop = 24 + addeq r0, r0, #3, 24 ;qcoeff_ptr advanced by 0x300 bytes (immediate 3 rotated right by 24) + + str r0, [sp, #l_qcoeff] + str r11, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + str r1, [sp, #l_bc] + + add lr, r9, r7, lsl #2 + + ldr r2, [r1, #bool_decoder_buffer] + ldr r3, [r1, #bool_decoder_pos] + + ldr r10, [lr, #detok_coef_probs] ;coef_probs = detoken->coef_probs[type] + ldr r5, [r1, #bool_decoder_count] + ldr r6, [r1, #bool_decoder_range] + ldr r4, [r1, #bool_decoder_value] + add r8, r2, r3 ;bufptr = &buffer[pos] + + str r10, [sp, #l_coef_ptr] + + + ;align 4 +;; on entry r11 must hold i and r10 the current coef_probs pointer; l_i is not reloaded here +BLOCK_LOOP + ldr r3, [r9, #detok_ptr_onyxblock2context_leftabove] + ldr r2, [r9, #DETOK_A] + ldr r1, [r9, #DETOK_L] + ldrb r12, [r3, +r11] ; detoken->ptr_onyxblock2context_leftabove[i] + + cmp r7, #0 ; check type + moveq r7, #1 + movne r7, #0 + + ldr r0, [r2, +r12, lsl #2] ; a + add r1, r1, r12, lsl #4 + add r3, r3, r11 + + ldrb r2, [r3, #c_onyxblock2above_offset] + ldrb r3, [r3, #c_onyxblock2left_offset] + mov lr, #c_entropy_nodes +;; ;++ + + ldr r2, [r0, +r2, lsl #2]!
+ add r3, r1, r3, lsl #2 + str r3, [sp, #l_l_ptr] + ldr r3, [r3] + + cmp r2, #0 + movne r2, #1 + cmp r3, #0 + addne r2, r2, #1 ;r2 = (*a != 0) plus (*l != 0) + + str r0, [sp, #l_a_ptr] + smlabb r0, r2, lr, r10 ;Prob = coef_probs advanced by t*ENTROPY_NODES + mov r1, #0 ; t = 0 + str r7, [sp, #l_c] ;c = !type + + ;align 4 +COEFF_LOOP + ldr r3, [r9, #detok_ptr_onyx_coef_bands_x] + ldr lr, [r9, #detok_onyx_coef_tree_ptr] + +;;the following two lines are used if onyx_coef_bands_x is UINT16 +;; add r3, r3, r7, lsl #1 +;; ldrh r3, [r3] + +;;the following line is used if onyx_coef_bands_x is UINT8 + ldrb r3, [r7, +r3] + + +;; ;++ +;; pld [r8] + ;++ + add r0, r0, r3 + + ;align 4 +get_token_loop + ldrb r2, [r0, +r1, asr #1] + mov r3, r6, lsl #8 + sub r3, r3, #256 ;split = 1 + (((range-1) * probability) >> 8) + mov r10, #1 + + smlawb r2, r3, r2, r10 + ldrb r12, [r8] ;load cx data byte in stall slot + ;++ + + subs r3, r4, r2, lsl #24 ;x = value-(split<<24) + addhs r1, r1, #1 ;t += 1 + movhs r4, r3 ;update value + subhs r2, r6, r2 ;range = range - split + movlo r6, r2 ;range = split + +;;; ldrsbhs r1, [r1, +lr] + ldrsb r1, [r1, +lr] ;t = tree[t], tree pointer loaded from detok_onyx_coef_tree_ptr + + +;; use branch for short pipelines ??? +;; cmp r2, #0x80 +;; bcs |$LN22@decode_mb_to| + + clz r3, r2 + sub r3, r3, #24 + subs r5, r5, r3 + mov r6, r2, lsl r3 + mov r4, r4, lsl r3 + +;; use branch for short pipelines ???
+;; bgt |$LN22@decode_mb_to| + + addle r5, r5, #8 + rsble r3, r5, #8 + addle r8, r8, #1 + orrle r4, r4, r12, lsl r3 + +;;|$LN22@decode_mb_to| + + cmp r1, #0 + bgt get_token_loop + + cmn r1, #c_dct_eob_token ;if(t == -DCT_EOB_TOKEN) + beq END_OF_BLOCK + + rsb lr, r1, #0 ;v = -t; + + cmp lr, #4 ;if(v > FOUR_TOKEN) + ble SKIP_EXTRABITS + + ldr r3, [r9, #detok_teb_base_ptr] + mov r11, #1 + add r7, r3, lr, lsl #4 ;teb_ptr = teb_base_ptr entry v, 16-byte stride + + ldrsh lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val + ldrsh r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length + +extrabits_loop + add r3, r0, r7 + + ldrb r2, [r3, #4] ;probability for this extra bit; NOTE(review): hard-coded offset 4, presumably teb_ptr->Probs -- confirm against TOKENEXTRABITS + mov r3, r6, lsl #8 + sub r3, r3, #256 ;split = 1 + (((range-1) * probability) >> 8) + mov r10, #1 + + smlawb r2, r3, r2, r10 + ldrb r12, [r8] + ;++ + + subs r10, r4, r2, lsl #24 ;x = value-(split<<24) + movhs r4, r10 ;update value + subhs r2, r6, r2 ;range = range - split + addhs lr, lr, r11, lsl r0 ;v += ((UINT16)1<<bits_count) + movlo r6, r2 ;range = split + + +;; use branch for short pipelines ??? +;; cmp r2, #0x80 +;; bcs |$LN10@decode_mb_to| + + clz r3, r2 + sub r3, r3, #24 + subs r5, r5, r3 + mov r6, r2, lsl r3 ;range + mov r4, r4, lsl r3 ;value + + addle r5, r5, #8 + addle r8, r8, #1 + rsble r3, r5, #8 + orrle r4, r4, r12, lsl r3 + +;;|$LN10@decode_mb_to| + subs r0, r0, #1 + bpl extrabits_loop + + +SKIP_EXTRABITS + ldr r11, [sp, #l_qcoeff] + ldr r0, [sp, #l_coef_ptr] + + cmp r1, #0 ;check for nonzero token + beq SKIP_EOB_CHECK ;if t is zero, we will skip the eob table check + + sub r3, r6, #1 ;range - 1 + ;++ + mov r3, r3, lsl #7 ; *= onyx_prob_half (128) + ;++ + mov r3, r3, lsr #8 + add r2, r3, #1 ;split + + subs r3, r4, r2, lsl #24 ;x = value-(split<<24) + movhs r4, r3 ;update value + subhs r2, r6, r2 ;range = range - split + mvnhs r3, lr + addhs lr, r3, #1 ;v = (v ^ -1) + 1 + movlo r6, r2 ;range = split + +;; use branch for short pipelines ???
+;; cmp r2, #0x80 +;; bcs |$LN6@decode_mb_to| + + clz r3, r2 + sub r3, r3, #24 + subs r5, r5, r3 + mov r6, r2, lsl r3 + mov r4, r4, lsl r3 + ldrleb r2, [r8], #1 + addle r5, r5, #8 + rsble r3, r5, #8 + orrle r4, r4, r2, lsl r3 + +;;|$LN6@decode_mb_to| + add r0, r0, #0xB ;Prob = coef_probs advanced by ENTROPY_NODES (0xB equals c_entropy_nodes) + + cmn r1, #1 ;t < -1 ? + + addlt r0, r0, #0xB ;if so advance Prob by another ENTROPY_NODES + + mvn r1, #1 ;t = -2 + +SKIP_EOB_CHECK + ldr r7, [sp, #l_c] + ldr r3, [r9, #detok_scan] + add r1, r1, #2 ;t += 2 + cmp r7, #(0x10 - 1) ;assume one less for now.... increment below + + ldr r3, [r3, +r7, lsl #2] + add r7, r7, #1 + add r3, r11, r3, lsl #1 + + str r7, [sp, #l_c] + strh lr, [r3] ;qcoeff_ptr[scan[c]] = v + + blt COEFF_LOOP ;flags are still those of the cmp above + + sub r7, r7, #1 ;if(t != -DCT_EOB_TOKEN) --c + +END_OF_BLOCK + ldr r3, [sp, #l_type] + ldr r10, [sp, #l_coef_ptr] + ldr r0, [sp, #l_qcoeff] + ldr r11, [sp, #l_i] + ldr r12, [sp, #l_stop] + + cmp r3, #0 + moveq r1, #1 + movne r1, #0 ;r1 = !type + add r3, r11, r9 + + cmp r7, r1 + strb r7, [r3, #detok_eob] ;eob for block i = c, stored as a byte + + ldr r7, [sp, #l_l_ptr] + ldr r2, [sp, #l_a_ptr] + movne r3, #1 + moveq r3, #0 ;t = (c != !type) + + add r0, r0, #0x20 ;qcoeff_ptr advanced by 16 shorts + add r11, r11, #1 ;i++ + str r3, [r7] ;*l = t + str r3, [r2] ;*a = t + str r0, [sp, #l_qcoeff] + str r11, [sp, #l_i] + + cmp r11, r12 ;i >= stop ?
+ ldr r7, [sp, #l_type] + mov lr, #0xB + + blt BLOCK_LOOP + + cmp r11, #0x19 ;i == 25 ? + bne ln2_decode_mb_to + + ldr r12, [r9, #detok_qcoeff_start_ptr] + ldr r10, [r9, #detok_coef_probs] ;coef_probs for type 0 + mov r7, #0 + mov r3, #0x10 + mov r11, #0 ;i = 0 in the register too: BLOCK_LOOP indexes with r11 and never reloads l_i + str r12, [sp, #l_qcoeff] + str r7, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] + + b BLOCK_LOOP + +ln2_decode_mb_to + cmp r11, #0x10 ;i == 16 ? + bne ln1_decode_mb_to + + ldr r10, [r9, #0x30] ;coef_probs for type 2; NOTE(review): hard-coded offset, presumably detok_coef_probs displaced by 8 bytes -- confirm against vpx_asm_offsets.asm + + mov r7, #2 + mov r3, #0x18 + + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] + b BLOCK_LOOP + +ln1_decode_mb_to + ldr r2, [sp, #l_bc] + mov r0, #0 ;return 0 + nop + + ldr r3, [r2, #bool_decoder_buffer] + str r5, [r2, #bool_decoder_count] + str r4, [r2, #bool_decoder_value] + sub r3, r8, r3 ;pos = bufptr - buffer + str r3, [r2, #bool_decoder_pos] + str r6, [r2, #bool_decoder_range] + + add sp, sp, #l_stacksize + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_decode_mb_tokens_v5| + + END diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c new file mode 100644 index 000000000..455c83a9c --- /dev/null +++ b/vp8/decoder/arm/dsystemdependent.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree.
+ */ + + +#include "vpx_ports/config.h" +#include "blockd.h" +#include "pragmas.h" +#include "postproc.h" +#include "dboolhuff.h" +#include "dequantize.h" +#include "onyxd_int.h" + +/* Installs the ARM decoder entry points on pbi. The body is compiled only when CONFIG_RUNTIME_CPU_DETECT is set; otherwise the function does nothing. */ +void vp8_dmachine_specific_config(VP8D_COMP *pbi) +{ +#if CONFIG_RUNTIME_CPU_DETECT + pbi->mb.rtcd = &pbi->common.rtcd; +#if HAVE_ARMV7 + /* ARMv7: NEON dequantize and IDCT routines; the bool decoder entries stay on the C versions. */ + pbi->dequant.block = vp8_dequantize_b_neon; + pbi->dequant.idct = vp8_dequant_idct_neon; + pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon; + pbi->dboolhuff.start = vp8dx_start_decode_c; + pbi->dboolhuff.stop = vp8dx_stop_decode_c; + pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; + pbi->dboolhuff.debool = vp8dx_decode_bool_c; + pbi->dboolhuff.devalue = vp8dx_decode_value_c; + +#elif HAVE_ARMV6 + /* ARMv6: v6 dequantize and IDCT routines; the bool decoder entries are the same C versions as above. */ + pbi->dequant.block = vp8_dequantize_b_v6; + pbi->dequant.idct = vp8_dequant_idct_v6; + pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6; + pbi->dboolhuff.start = vp8dx_start_decode_c; + pbi->dboolhuff.stop = vp8dx_stop_decode_c; + pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; + pbi->dboolhuff.debool = vp8dx_decode_bool_c; + pbi->dboolhuff.devalue = vp8dx_decode_value_c; +#endif +#endif +} diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm new file mode 100644 index 000000000..7ec62a3d8 --- /dev/null +++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm @@ -0,0 +1,159 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree.
+; + + + EXPORT |vp8_decode_value_neon| + EXPORT |vp8dx_start_decode_neon| + EXPORT |vp8dx_stop_decode_neon| + EXPORT |vp8dx_decode_bool_neon| + + ARM + REQUIRE8 + PRESERVE8 + + INCLUDE vpx_asm_offsets.asm + + AREA |.text|, CODE, READONLY ; name this block of code + +; C equivalent of vp8_decode_value_neon: +; int z = 0; +; int bit; +; for ( bit=bits-1; bit>=0; bit-- ) +; { +; z |= (vp8dx_decode_bool(br, 0x80)<<bit); +; } +; return z; + +;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits ) +|vp8_decode_value_neon| PROC + stmdb sp!, {r4 - r6, lr} + mov r4, r0 ;br + mov r5, r1 ;bits + mov r6, #0 ;z = 0 + + subs r5, r5, #1 ;bit = bits-1 + bmi decode_value_exit ;nothing to decode, return 0 + +decode_value_loop + mov r1, #0x80 + mov r0, r4 + bl vp8dx_decode_bool_neon_internal ; needed for conversion to s file + orr r6, r6, r0, lsl r5 ;z |= decoded bit << bit + subs r5, r5, #1 + bpl decode_value_loop + +decode_value_exit + mov r0, r6 + ldmia sp!, {r4 - r6, pc} + ENDP ; |vp8_decode_value_neon| + + +;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source ) +; Initialises br: lowvalue = 0, range = 255, buffer = source, count = 8, pos = 4, +; value = the first four bytes of source with source[0] in the top byte. +|vp8dx_start_decode_neon| PROC + stmdb sp!, {r4 - r5, lr} + mov r2, #0 + mov r3, #255 + + str r2, [r0, #bool_decoder_lowvalue] + str r3, [r0, #bool_decoder_range] + str r1, [r0, #bool_decoder_buffer] + + mov r3, #8 + mov r2, #4 + str r3, [r0, #bool_decoder_count] + str r2, [r0, #bool_decoder_pos] + + ldrb r2, [r1, #3] + ldrb r3, [r1, #2] + ldrb r4, [r1, #1] + ldrb r5, [r1] + + orr r1, r2, r3, lsl #8 + orr r1, r1, r4, lsl #16 + orr r1, r1, r5, lsl #24 + + str r1, [r0, #bool_decoder_value] + + ldmia sp!, {r4 - r5, pc} + ENDP ; |vp8dx_start_decode_neon| + + +;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc ); +; Returns immediately without touching bc. +|vp8dx_stop_decode_neon| PROC + mov pc, lr + ENDP ; |vp8dx_stop_decode_neon| + + +; bigsplit RN r1 +; buffer_v RN r1 +; count_v RN r4 +; range_v RN r2 +; value_v RN r3 +; pos_v RN r5 +; split RN r6 +; bit RN lr +;NOTE: the commented RN list above does not match the body below, which keeps split in r4 (reused for count on the +;range_less_0x80 path), the returned bit in r0 and the br pointer in lr. +;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability ) +|vp8dx_decode_bool_neon| PROC +vp8dx_decode_bool_neon_internal +;LDRD and STRD doubleword data transfers must be eight-byte aligned.
Use ALIGN 8 +;before memory allocation +;The strd/ldrd pairs below transfer two consecutive words, so they rely on value directly following range and on pos +;directly following count inside BOOL_DECODER -- NOTE(review): confirm against the structure definition. + stmdb sp!, {r4 - r5, lr} + + ldr r2, [r0, #bool_decoder_range] ;load range (r2), value(r3) + ldr r3, [r0, #bool_decoder_value] + ;ldrd r2, r3, [r0, #bool_decoder_range] ;ldrd costs 2 cycles + ; + + mov r4, r2, lsl #8 + sub r4, r4, #256 + mov r12, #1 + + smlawb r4, r4, r1, r12 ;split = 1 + (((range-1) * probability) >> 8) + + mov lr, r0 ;keep br in lr, r0 becomes the return value + mov r0, #0 ;bit = 0 + ; + subs r5, r3, r4, lsl #24 ;value - bigsplit, sets the flags used by the next four instructions + + subhs r2, r2, r4 ;range = br->range-split + movlo r2, r4 ;range = split + movhs r0, #1 ;bit = 1 + movhs r3, r5 ;value = value-bigsplit + + cmp r2, #0x80 + blt range_less_0x80 ;range below 0x80 needs renormalising + strd r2, r3, [lr, #bool_decoder_range] ;store result + + ldmia sp!, {r4 - r5, pc} + +range_less_0x80 + + ldrd r4, r5, [lr, #bool_decoder_count] ;load count, pos, buffer + ldr r1, [lr, #bool_decoder_buffer] + + clz r12, r2 + add r1, r1, r5 + + sub r12, r12, #24 ;shift = leading zeros of range minus 24 + subs r4, r4, r12 ;count -= shift + mov r2, r2, lsl r12 ;range <<= shift + mov r3, r3, lsl r12 ;value <<= shift + addle r4, r4, #8 ;count += 8 + ldrleb r12, [r1], #1 ;br->buffer[br->pos] + + rsble r1, r4, #8 ;-count + addle r5, r5, #1 ;br->pos++ + orrle r3, r3, r12, lsl r1 ;value |= (br->buffer[br->pos]) << (-count) + + strd r2, r3, [lr, #bool_decoder_range] ;store result + strd r4, r5, [lr, #bool_decoder_count] + + ldmia sp!, {r4 - r5, pc} + ENDP ; |vp8dx_decode_bool_neon| + + END diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequantdcidct_neon.asm new file mode 100644 index 000000000..3392f2c2b --- /dev/null +++ b/vp8/decoder/arm/neon/dequantdcidct_neon.asm @@ -0,0 +1,133 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree.
+; + + + EXPORT |vp8_dequant_dc_idct_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc); +; r0 short *input, +; r1 short *dq, +; r2 short *output, +; r3 int pitch, +; (stack) int Dc +; Multiplies the 16 input coefficients by dq, replaces coefficient 0 with Dc, runs the two-pass 4x4 inverse DCT, +; zeroes the 32 bytes at input and stores four rows of four 16-bit results at output, each row pitch bytes after the previous. +|vp8_dequant_dc_idct_neon| PROC + vpush {d8-d13} ;q4-q6 are used below and are callee-saved under the AAPCS + vld1.16 {q3, q4}, [r0] + vld1.16 {q5, q6}, [r1] + + ldr r1, [sp, #48] ;load Dc from stack (above the 48 bytes pushed by vpush) + + ldr r12, _dcidct_coeff_ + + vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon + vmul.i16 q2, q4, q6 + + vmov.16 d2[0], r1 ;coefficient 0 = Dc + +;|short_idct4x4llm_neon| PROC + vld1.16 {d0}, [r12] + vswp d3, d4 ;q2(vp[4] vp[12]) + + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + ;d6 - c1:temp1 + ;d7 - d1:temp2 + ;d8 - d1:temp1 + ;d9 - c1:temp2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + +; memset(input, 0, 32) -- 32bytes + vmov.i16 q14, #0 + + vswp d3, d4 + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vmov q15, q14 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vst1.16 {q14, q15}, [r0] + + vrshr.s16 d2, d2, #3 + vrshr.s16 d3, d3, #3 + vrshr.s16 d4, d4, #3 + vrshr.s16 d5, d5, #3 + + add r1, r2, r3 + add r12, r1, r3 + add r0, r12, r3 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vst1.16 {d2}, [r2] + vst1.16 {d3}, [r1]
+ vst1.16 {d4}, [r12] + vst1.16 {d5}, [r0] + + vpop {d8-d13} ;restore the callee-saved registers pushed on entry + bx lr + + ENDP + +;----------------- + AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default +;Data section dcidct4x4_dat. _dcidct_coeff_ is one word holding the address of dcidct_coeff. +;dcidct_coeff is two words packing the four 16-bit constants listed under the DCD. +;The routine reads the address with ldr r12, _dcidct_coeff_ and the constants with vld1.16 {d0}, [r12]. +_dcidct_coeff_ + DCD dcidct_coeff +dcidct_coeff + DCD 0x4e7b4e7b, 0x8a8c8a8c + +;20091, 20091, 35468, 35468 + + END diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequantidct_neon.asm new file mode 100644 index 000000000..bba4d5dfb --- /dev/null +++ b/vp8/decoder/arm/neon/dequantidct_neon.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree.
+; + + + EXPORT |vp8_dequant_idct_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch); +; r0 short *input, +; r1 short *dq, +; r2 short *output, +; r3 int pitch, +; Multiplies the 16 input coefficients by dq, runs the two-pass 4x4 inverse DCT, zeroes the 32 bytes at input +; and stores four rows of four 16-bit results at output, each row pitch bytes after the previous. +|vp8_dequant_idct_neon| PROC + vpush {d8-d13} ;q4-q6 are used below and are callee-saved under the AAPCS + vld1.16 {q3, q4}, [r0] + vld1.16 {q5, q6}, [r1] + + ldr r12, _didct_coeff_ + + vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon + vmul.i16 q2, q4, q6 + +;|short_idct4x4llm_neon| PROC + vld1.16 {d0}, [r12] + vswp d3, d4 ;q2(vp[4] vp[12]) + + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + ;d6 - c1:temp1 + ;d7 - d1:temp2 + ;d8 - d1:temp1 + ;d9 - c1:temp2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + +; memset(input, 0, 32) -- 32bytes + vmov.i16 q14, #0 + + vswp d3, d4 + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vmov q15, q14 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vst1.16 {q14, q15}, [r0] + + vrshr.s16 d2, d2, #3 + vrshr.s16 d3, d3, #3 + vrshr.s16 d4, d4, #3 + vrshr.s16 d5, d5, #3 + + add r1, r2, r3 + add r12, r1, r3 + add r0, r12, r3 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vst1.16 {d2}, [r2] + vst1.16 {d3}, [r1] + vst1.16 {d4}, [r12] + vst1.16 {d5}, [r0] + + vpop {d8-d13} ;restore the callee-saved registers pushed on entry + bx lr + + ENDP + +;----------------- + AREA
didct4x4_dat, DATA, READWRITE ;read/write by default +;Data section didct4x4_dat. _didct_coeff_ is one word holding the address of didct_coeff. +;didct_coeff is two words packing the four 16-bit constants listed under the DCD. +;The routine reads the address with ldr r12, _didct_coeff_ and the constants with vld1.16 {d0}, [r12]. +_didct_coeff_ + DCD didct_coeff +didct_coeff + DCD 0x4e7b4e7b, 0x8a8c8a8c + +;20091, 20091, 35468, 35468 + + END diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/decoder/arm/neon/dequantizeb_neon.asm new file mode 100644 index 000000000..1bde94607 --- /dev/null +++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm @@ -0,0 +1,33 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_dequantize_b_loop_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +; Stores the 16 element-wise 16-bit products Q[i] * DQC[i] to DQ. +|vp8_dequantize_b_loop_neon| PROC + vld1.16 {q0, q1}, [r0] + vld1.16 {q2, q3}, [r1] + + vmul.i16 q0, q0, q2 ;products written in place: q4/q5 are callee-saved under the AAPCS and were not preserved + vmul.i16 q1, q1, q3 + + vst1.16 {q0, q1}, [r2] + + bx lr + + ENDP + + END |