summaryrefslogtreecommitdiff
path: root/vp8/decoder/arm
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/decoder/arm')
-rw-r--r--vp8/decoder/arm/armv5/dequantize_v5.asm51
-rw-r--r--vp8/decoder/arm/armv6/dboolhuff_v6.asm162
-rw-r--r--vp8/decoder/arm/armv6/dequantdcidct_v6.asm202
-rw-r--r--vp8/decoder/arm/armv6/dequantidct_v6.asm183
-rw-r--r--vp8/decoder/arm/armv6/dequantize_v6.asm68
-rw-r--r--vp8/decoder/arm/dboolhuff_arm.h49
-rw-r--r--vp8/decoder/arm/dequantize_arm.c48
-rw-r--r--vp8/decoder/arm/dequantize_arm.h44
-rw-r--r--vp8/decoder/arm/detokenizearm_sjl.c730
-rw-r--r--vp8/decoder/arm/detokenizearm_v6.asm364
-rw-r--r--vp8/decoder/arm/dsystemdependent.c44
-rw-r--r--vp8/decoder/arm/neon/dboolhuff_neon.asm159
-rw-r--r--vp8/decoder/arm/neon/dequantdcidct_neon.asm133
-rw-r--r--vp8/decoder/arm/neon/dequantidct_neon.asm128
-rw-r--r--vp8/decoder/arm/neon/dequantizeb_neon.asm33
15 files changed, 2398 insertions, 0 deletions
diff --git a/vp8/decoder/arm/armv5/dequantize_v5.asm b/vp8/decoder/arm/armv5/dequantize_v5.asm
new file mode 100644
index 000000000..eb3f0307c
--- /dev/null
+++ b/vp8/decoder/arm/armv5/dequantize_v5.asm
@@ -0,0 +1,51 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_armv5|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+q RN r0
+dqc RN r1
+cnt RN r2
+
+;-----------------------------------------------------------------------
+;void vp8_dequantize_b_armv5(short *Q, short *DQC)
+;
+; Multiplies the 16 halfwords at Q by the matching halfwords at DQC
+; (signed 16x16 multiplies) and stores the low 16 bits of each product
+; back over Q, i.e. the block is dequantized in place.
+;   In:    q (r0) = Q, dqc (r1) = DQC
+;   Uses:  cnt (r2), r3, r12 as scratch; r4 and lr are saved and restored
+; The loop is software pipelined: the next word pair is loaded before the
+; previous products are stored, so the instruction order matters.
+; NOTE(review): two halfwords are fetched per ldr and stored bottom half
+; first, which presumably assumes a little-endian target - confirm.
+;-----------------------------------------------------------------------
+|vp8_dequantize_b_armv5| PROC
+ stmdb sp!, {r4, lr} ; save r4 and the return address
+ ldr r3, [q] ; first pair of Q halfwords
+ ldr r4, [dqc], #8 ; first pair of DQC halfwords; dqc now two pairs ahead
+
+ mov cnt, #4 ; 4 iterations x 4 halfwords = 16 coefficients
+dequant_loop
+ smulbb lr, r3, r4 ; bottom x bottom
+ smultt r12, r3, r4 ; top x top
+
+ ldr r3, [q, #4] ; next Q pair (q has not been advanced yet)
+ ldr r4, [dqc, #-4] ; the DQC pair skipped by the #8 post-increment
+
+ strh lr, [q], #2 ; store both products, q += 4 in total
+ strh r12, [q], #2
+
+ smulbb lr, r3, r4
+ smultt r12, r3, r4
+
+ subs cnt, cnt, #1 ; flags stay valid until the bne below (ldr/strh do not touch them)
+ ldrne r3, [q, #4] ; preload for the next iteration; skipped on the last pass
+ ldrne r4, [dqc], #8
+
+ strh lr, [q], #2
+ strh r12, [q], #2
+
+ bne dequant_loop
+
+ ldmia sp!, {r4, pc} ; restore r4 and return
+ ENDP ;|vp8_dequantize_b_armv5|
+
+ END
diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
new file mode 100644
index 000000000..143e33e46
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
@@ -0,0 +1,162 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_decode_value_v6|
+ EXPORT |vp8dx_start_decode_v6|
+ EXPORT |vp8dx_stop_decode_v6|
+ EXPORT |vp8dx_decode_bool_v6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+br RN r0
+prob RN r1
+bits RN r1
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+; int z = 0;
+; int bit;
+; for ( bit=bits-1; bit>=0; bit-- )
+; {
+; z |= (vp8dx_decode_bool(br, 0x80)<<bit);
+; }
+; return z;
+
+;-----------------------------------------------------------------------
+;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
+;
+; Reads `bits` bools, each with probability 0x80, most significant bit
+; first, and returns them packed into one integer. Returns 0 without
+; reading anything when bits - 1 is negative.
+;   In:   r0 = br, r1 = bits
+;   Out:  r0 = decoded value
+;   r4 = value being built, r5 = current bit position, r6 = saved br
+;-----------------------------------------------------------------------
+|vp8_decode_value_v6| PROC
+    stmdb   sp!, {r4 - r6, lr}
+    mov     r6, br                          ; decoder pointer must survive the calls
+    subs    r5, bits, #1                    ; r5 = bits - 1, position of the top bit
+    mov     r4, #0                          ; z = 0 (plain mov leaves the flags alone)
+    bmi     decode_value_exit               ; no bits requested
+
+decode_value_loop
+    mov     br, r6
+    mov     prob, #0x80
+    bl      vp8dx_decode_bool_v6_internal   ; needed for conversion to s file
+    orr     r4, r4, r0, lsl r5              ; z |= bit << position
+    subs    r5, r5, #1
+    bpl     decode_value_loop               ; continue down to position 0
+
+decode_value_exit
+    mov     r0, r4
+    ldmia   sp!, {r4 - r6, pc}
+    ENDP    ; |vp8_decode_value_v6|
+
+
+;-----------------------------------------------------------------------
+;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
+;
+; Initialises the decoder state at br:
+;   lowvalue = 0, range = 255, buffer = source, count = 8, pos = 4,
+;   value = the first four bytes of source, source[0] in the top byte.
+;   In:   r0 = br, r1 = source
+;   r2-r5 are scratch; r4 and r5 are saved and restored.
+;-----------------------------------------------------------------------
+|vp8dx_start_decode_v6| PROC
+    stmdb   sp!, {r4 - r5, lr}
+
+    mov     r4, #0
+    mov     r5, #255
+    str     r1, [br, #bool_decoder_buffer]
+    str     r5, [br, #bool_decoder_range]
+    str     r4, [br, #bool_decoder_lowvalue]
+
+    mov     r4, #4                          ; four bytes are consumed below
+    mov     r5, #8
+    str     r4, [br, #bool_decoder_pos]
+    str     r5, [br, #bool_decoder_count]
+
+    ldrb    r2, [r1]                        ; source[0] -> bits 31..24
+    ldrb    r3, [r1, #1]                    ; source[1] -> bits 23..16
+    ldrb    r4, [r1, #2]                    ; source[2] -> bits 15..8
+    ldrb    r5, [r1, #3]                    ; source[3] -> bits 7..0
+
+    mov     r1, r2, lsl #24
+    orr     r1, r1, r3, lsl #16
+    orr     r1, r1, r4, lsl #8
+    orr     r1, r1, r5
+
+    str     r1, [br, #bool_decoder_value]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ; |vp8dx_start_decode_v6|
+
+
+;-----------------------------------------------------------------------
+;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
+;
+; Nothing to tear down: returns immediately and modifies no registers.
+; Returns with bx rather than "mov pc, lr" so that the return also works
+; for a Thumb caller on ARMv6, where a data-processing write to pc does
+; not switch instruction set (the ldmia {..., pc} returns used by the
+; other routines in this file already interwork).
+;-----------------------------------------------------------------------
+|vp8dx_stop_decode_v6| PROC
+    bx      lr
+    ENDP    ; |vp8dx_stop_decode_v6|
+
+
+; bigsplit RN r1
+; buffer_v RN r1
+; count_v RN r4
+; range_v RN r2
+; value_v RN r3
+; pos_v RN r5
+; split RN r6
+; bit RN lr
+;-----------------------------------------------------------------------
+;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
+;
+; Decodes one bool from the decoder state at br and returns it in r0.
+;   In:   r0 = br, r1 = probability (smlawb uses its low 16 bits)
+;   Out:  r0 = decoded bit (0 or 1)
+;   r2 = range, r3 = value, r6 = split, lr = bit. In the renormalisation
+;   path: r1 = read pointer, r4 = count, r5 = pos, r12 = shift.
+;   r4-r6 and lr are saved on entry; r1-r3 and r12 are clobbered.
+; vp8dx_decode_bool_v6_internal is a second label on the same entry point;
+; it is the target of the bl in vp8_decode_value_v6.
+;-----------------------------------------------------------------------
+|vp8dx_decode_bool_v6| PROC
+vp8dx_decode_bool_v6_internal
+ stmdb sp!, {r4 - r6, lr}
+
+ ldr r2, [br, #bool_decoder_range]
+ ldr r3, [br, #bool_decoder_value]
+
+ mov r6, r2, lsl #8
+ sub r6, r6, #256 ; split = 1 + (((range-1) * probability) >> 8)
+ mov r12, #1 ; the "+ 1" accumulator for smlawb
+ smlawb r6, r6, prob, r12 ; r6 = (((range-1) << 8) * prob >> 16) + 1 = split
+
+ mov lr, #0 ; bit = 0 unless value >= split << 24
+ subs r5, r3, r6, lsl #24 ; r5 = value - (split << 24); carry set when no borrow
+
+ ;cmp r3, r1
+ movhs lr, #1 ; bit = 1
+ movhs r3, r5 ; value -= split << 24
+ subhs r2, r2, r6 ; range -= split
+ movlo r2, r6 ; bit = 0: range = split
+
+ cmp r2, #0x80
+ blt range_less_0x80 ; range < 0x80 needs renormalising
+ ;strd r2, r3, [br, #bool_decoder_range]
+ str r2, [br, #bool_decoder_range]
+ str r3, [br, #bool_decoder_value]
+ mov r0, lr ; return the bit
+ ldmia sp!, {r4 - r6, pc}
+
+range_less_0x80
+ ldr r5, [br, #bool_decoder_pos]
+ ldr r1, [br, #bool_decoder_buffer]
+ ldr r4, [br, #bool_decoder_count]
+ add r1, r1, r5 ; r1 = &buffer[pos]
+
+ clz r12, r2
+ sub r12, r12, #24 ; shift = clz(range) - 24, moves the top set bit of range to bit 7
+ subs r4, r4, r12 ; count -= shift; the "le" instructions below run when count <= 0
+ ldrleb r6, [r1], #1 ; fetch the next input byte
+ mov r2, r2, lsl r12 ; range <<= shift
+ mov r3, r3, lsl r12 ; value <<= shift
+ addle r4, r4, #8 ; count += 8
+ rsble r12, r4, #8 ; r12 = 8 - count
+ addle r5, r5, #1 ; pos++
+ orrle r3, r3, r6, lsl r12 ; value |= byte << (8 - count)
+
+ ;strd r2, r3, [br, #bool_decoder_range]
+ ;strd r4, r5, [br, #bool_decoder_count]
+ str r2, [br, #bool_decoder_range]
+ str r3, [br, #bool_decoder_value]
+ str r4, [br, #bool_decoder_count]
+ str r5, [br, #bool_decoder_pos]
+
+ mov r0, lr ; return the bit
+
+ ldmia sp!, {r4 - r6, pc}
+ ENDP ; |vp8dx_decode_bool_v6|
+
+ END
diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
new file mode 100644
index 000000000..3daa9b34f
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
@@ -0,0 +1,202 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_dc_idct_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-----------------------------------------------------------------------
+;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
+;
+; In:  r0 = input, r1 = dq, r2 = output, r3 = pitch, [sp] = Dc
+; 1. Dequantizes the 16 halfwords at input in place (input[i] * dq[i],
+;    low 16 bits kept), except that the halfword at input itself is
+;    overwritten with Dc instead of a product.
+; 2. First transform pass: reads input and stores 32-bit halfword pairs
+;    to output at byte offsets 0, pitch, 2*pitch and 3*pitch.
+; 3. Second transform pass: works in place on output, two rows per
+;    iteration, adding 4 and shifting right by 3 before each store.
+; 4. Clears the 32 bytes at input.
+; r4-r11 and lr are saved on entry and restored on return; r12 is scratch.
+; NOTE(review): halfword pairs are moved as 32-bit words, which presumably
+; relies on a little-endian target and 4-byte aligned buffers/pitch - confirm.
+;-----------------------------------------------------------------------
+|vp8_dequant_dc_idct_v6| PROC
+ stmdb sp!, {r4-r11, lr} ; 9 registers = 36 bytes, so the stacked Dc is at sp + 36
+
+ ldr r6, [sp, #36] ;load Dc
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r0, [sp] ; keep the input pointer for the final clear
+
+ smultt r7, r4, r5 ; top-half product only; Dc (r6) takes the bottom slot
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2 ; halfword at input = Dc (low 16 bits)
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ mov r12, #3 ; 4 halfwords done above; 3 iterations x 4 remain
+
+dequant_dc_idct_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4] ; preload for the next iteration; skipped on the last pass
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne dequant_dc_idct_loop
+
+ sub r0, r0, #32 ; rewind to the start of input
+ mov r1, r2 ; r1 = output
+ mov r2, r3 ; r2 = pitch
+
+; short_idct4x4llm_v6_dual
+
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ mov r5, #0x2 ; i=2 i
+loop1_dual_11
+ ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
+ ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
+ ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
+
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
+ pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
+ smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
+ smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
+ subs r5, r5, #0x1 ; i-- --
+ pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
+ ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+ usub16 r7, r8, r7 ; c c
+ uadd16 r6, r6, r10 ; d d
+ uadd16 r10, r11, r14 ; a a
+ usub16 r8, r11, r14 ; b b
+ uadd16 r9, r10, r6 ; a+d a+d
+ usub16 r10, r10, r6 ; a-d a-d
+ uadd16 r6, r8, r7 ; b+c b+c
+ usub16 r7, r8, r7 ; b-c b-c
+ str r6, [r1, r2] ; o5 | o4
+ add r6, r2, r2 ; pitch * 2 p2
+ str r7, [r1, r6] ; o9 | o8
+ add r6, r6, r2 ; pitch * 3 p3
+ str r10, [r1, r6] ; o13 | o12
+ str r9, [r1], #0x4 ; o1 | o0 ++
+ bne loop1_dual_11 ;
+ mov r5, #0x2 ; i=2 i
+ sub r0, r1, #8 ; reset input/output i/o
+loop2_dual_22
+ ldr r6, [r0, r2] ; i5 | i4 5|4
+ ldr r1, [r0] ; i1 | i0 1|0
+ ldr r12, [r0, #0x4] ; i3 | i2 3|2
+ add r14, r2, #0x4 ; pitch + 4 bytes (2 halfwords) p+2
+ ldr r14, [r0, r14] ; i7 | i6 7|6
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
+ pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
+ pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
+ pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
+ uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
+ uadd16 r10, r11, r9 ; a a
+ usub16 r9, r11, r9 ; b b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
+ subs r5, r5, #0x1 ; i-- --
+ smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
+ smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
+
+ pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
+ usub16 r12, r8, r6 ; c (o1 | o5) c
+ uadd16 r6, r11, r1 ; d (o3 | o7) d
+ uadd16 r7, r10, r6 ; a+d a+d
+ mov r8, #0x4 ; set up 4's 4
+ orr r8, r8, #0x40000 ; 4|4
+ usub16 r6, r10, r6 ; a-d a-d
+ uadd16 r6, r6, r8 ; a-d+4 3|7
+ uadd16 r7, r7, r8 ; a+d+4 0|4
+ uadd16 r10, r9, r12 ; b+c b+c
+ usub16 r1, r9, r12 ; b-c b-c
+ uadd16 r10, r10, r8 ; b+c+4 1|5
+ uadd16 r1, r1, r8 ; b-c+4 2|6
+ mov r8, r10, asr #19 ; o1 >> 3 (asr #19 = take the top halfword, then >> 3)
+ strh r8, [r0, #2] ; o1
+ mov r8, r1, asr #19 ; o2 >> 3
+ strh r8, [r0, #4] ; o2
+ mov r8, r6, asr #19 ; o3 >> 3
+ strh r8, [r0, #6] ; o3
+ mov r8, r7, asr #19 ; o0 >> 3
+ strh r8, [r0], r2 ; o0 +p
+ sxth r10, r10 ; sign-extend the bottom halfword (next row)
+ mov r8, r10, asr #3 ; o5 >> 3
+ strh r8, [r0, #2] ; o5
+ sxth r1, r1 ;
+ mov r8, r1, asr #3 ; o6 >> 3
+ strh r8, [r0, #4] ; o6
+ sxth r6, r6 ;
+ mov r8, r6, asr #3 ; o7 >> 3
+ strh r8, [r0, #6] ; o7
+ sxth r7, r7 ;
+ mov r8, r7, asr #3 ; o4 >> 3
+ strh r8, [r0], r2 ; o4 +p
+;;;;; subs r5, r5, #0x1 ; i-- --
+ bne loop2_dual_22 ; flags still come from the subs earlier in the loop
+
+
+;vpx_memset
+ ldr r0, [sp] ; input pointer saved at entry
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0] ; clear the 32 bytes at input
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+
+ ENDP ;|vp8_dequant_dc_idct_v6|
+
+ END
diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm
new file mode 100644
index 000000000..61bb48d04
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantidct_v6.asm
@@ -0,0 +1,183 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_idct_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-----------------------------------------------------------------------
+;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
+;
+; In:  r0 = input, r1 = dq, r2 = output, r3 = pitch
+; 1. Dequantizes the 16 halfwords at input in place (input[i] * dq[i],
+;    low 16 bits kept).
+; 2. First transform pass: reads input and stores 32-bit halfword pairs
+;    to output at byte offsets 0, pitch, 2*pitch and 3*pitch.
+; 3. Second transform pass: works in place on output, two rows per
+;    iteration, adding 4 and shifting right by 3 before each store.
+; 4. Clears the 32 bytes at input.
+; r4-r11 and lr are saved on entry and restored on return; r12 is scratch.
+; NOTE(review): halfword pairs are moved as 32-bit words, which presumably
+; relies on a little-endian target and 4-byte aligned buffers/pitch - confirm.
+;-----------------------------------------------------------------------
+|vp8_dequant_idct_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r0, [sp] ; keep the input pointer for the final clear
+
+ mov r12, #4 ; 4 iterations x 4 halfwords = 16 coefficients
+
+dequant_idct_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4] ; preload for the next iteration; skipped on the last pass
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne dequant_idct_loop
+
+ sub r0, r0, #32 ; rewind to the start of input
+ mov r1, r2 ; r1 = output
+ mov r2, r3 ; r2 = pitch
+
+; short_idct4x4llm_v6_dual
+
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ mov r5, #0x2 ; i=2 i
+loop1_dual_1
+ ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
+ ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
+ ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
+
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
+ pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
+ smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
+ smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
+ subs r5, r5, #0x1 ; i-- --
+ pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
+ ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+ usub16 r7, r8, r7 ; c c
+ uadd16 r6, r6, r10 ; d d
+ uadd16 r10, r11, r14 ; a a
+ usub16 r8, r11, r14 ; b b
+ uadd16 r9, r10, r6 ; a+d a+d
+ usub16 r10, r10, r6 ; a-d a-d
+ uadd16 r6, r8, r7 ; b+c b+c
+ usub16 r7, r8, r7 ; b-c b-c
+ str r6, [r1, r2] ; o5 | o4
+ add r6, r2, r2 ; pitch * 2 p2
+ str r7, [r1, r6] ; o9 | o8
+ add r6, r6, r2 ; pitch * 3 p3
+ str r10, [r1, r6] ; o13 | o12
+ str r9, [r1], #0x4 ; o1 | o0 ++
+ bne loop1_dual_1 ;
+ mov r5, #0x2 ; i=2 i
+ sub r0, r1, #8 ; reset input/output i/o
+loop2_dual_2
+ ldr r6, [r0, r2] ; i5 | i4 5|4
+ ldr r1, [r0] ; i1 | i0 1|0
+ ldr r12, [r0, #0x4] ; i3 | i2 3|2
+ add r14, r2, #0x4 ; pitch + 4 bytes (2 halfwords) p+2
+ ldr r14, [r0, r14] ; i7 | i6 7|6
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
+ pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
+ pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
+ pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
+ uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
+ uadd16 r10, r11, r9 ; a a
+ usub16 r9, r11, r9 ; b b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
+ subs r5, r5, #0x1 ; i-- --
+ smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
+ smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
+
+ pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
+ usub16 r12, r8, r6 ; c (o1 | o5) c
+ uadd16 r6, r11, r1 ; d (o3 | o7) d
+ uadd16 r7, r10, r6 ; a+d a+d
+ mov r8, #0x4 ; set up 4's 4
+ orr r8, r8, #0x40000 ; 4|4
+ usub16 r6, r10, r6 ; a-d a-d
+ uadd16 r6, r6, r8 ; a-d+4 3|7
+ uadd16 r7, r7, r8 ; a+d+4 0|4
+ uadd16 r10, r9, r12 ; b+c b+c
+ usub16 r1, r9, r12 ; b-c b-c
+ uadd16 r10, r10, r8 ; b+c+4 1|5
+ uadd16 r1, r1, r8 ; b-c+4 2|6
+ mov r8, r10, asr #19 ; o1 >> 3 (asr #19 = take the top halfword, then >> 3)
+ strh r8, [r0, #2] ; o1
+ mov r8, r1, asr #19 ; o2 >> 3
+ strh r8, [r0, #4] ; o2
+ mov r8, r6, asr #19 ; o3 >> 3
+ strh r8, [r0, #6] ; o3
+ mov r8, r7, asr #19 ; o0 >> 3
+ strh r8, [r0], r2 ; o0 +p
+ sxth r10, r10 ; sign-extend the bottom halfword (next row)
+ mov r8, r10, asr #3 ; o5 >> 3
+ strh r8, [r0, #2] ; o5
+ sxth r1, r1 ;
+ mov r8, r1, asr #3 ; o6 >> 3
+ strh r8, [r0, #4] ; o6
+ sxth r6, r6 ;
+ mov r8, r6, asr #3 ; o7 >> 3
+ strh r8, [r0, #6] ; o7
+ sxth r7, r7 ;
+ mov r8, r7, asr #3 ; o4 >> 3
+ strh r8, [r0], r2 ; o4 +p
+;;;;; subs r5, r5, #0x1 ; i-- --
+ bne loop2_dual_2 ; flags still come from the subs earlier in the loop
+ ;
+
+;vpx_memset
+ ldr r0, [sp] ; input pointer saved at entry
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0] ; clear the 32 bytes at input
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+
+ ENDP ;|vp8_dequant_idct_v6|
+
+ END
diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/decoder/arm/armv6/dequantize_v6.asm
new file mode 100644
index 000000000..95e38594f
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantize_v6.asm
@@ -0,0 +1,68 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_v6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------
+;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+;
+; Writes DQ[i] = low 16 bits of Q[i] * DQC[i] (signed 16x16 multiply)
+; for 16 halfwords. Q and DQC are only read.
+; r3-r6 hold operand pairs, r7-r9 and lr hold products, r12 counts the
+; two loop iterations (8 halfwords each). r4-r9 and lr are saved and
+; restored. The loop is software pipelined: loads for the next group are
+; interleaved with the stores of the current one.
+; NOTE(review): halfword pairs are fetched as words and stored bottom half
+; first, which presumably assumes a little-endian target - confirm.
+|vp8_dequantize_b_loop_v6| PROC
+ stmdb sp!, {r4-r9, lr}
+
+ ldr r3, [r0] ;load Q
+ ldr r4, [r1] ;load DQC
+ ldr r5, [r0, #4]
+ ldr r6, [r1, #4]
+
+ mov r12, #2 ;loop counter
+
+dequant_loop
+ smulbb r7, r3, r4 ;multiply
+ smultt r8, r3, r4
+ smulbb r9, r5, r6
+ smultt lr, r5, r6
+
+ ldr r3, [r0, #8] ; second group of four for this iteration
+ ldr r4, [r1, #8]
+ ldr r5, [r0, #12]
+ ldr r6, [r1, #12]
+
+ strh r7, [r2], #2 ;store result
+ smulbb r7, r3, r4 ;multiply
+ strh r8, [r2], #2
+ smultt r8, r3, r4
+ strh r9, [r2], #2
+ smulbb r9, r5, r6
+ strh lr, [r2], #2
+ smultt lr, r5, r6
+
+ subs r12, r12, #1 ; flags are consumed by the ldrne/bne below
+
+ add r0, r0, #16 ; advance Q and DQC by 8 halfwords
+ add r1, r1, #16
+
+ ldrne r3, [r0] ; preload for the next iteration; skipped on the last pass
+ strh r7, [r2], #2 ;store result
+ ldrne r4, [r1]
+ strh r8, [r2], #2
+ ldrne r5, [r0, #4]
+ strh r9, [r2], #2
+ ldrne r6, [r1, #4]
+ strh lr, [r2], #2
+
+ bne dequant_loop
+
+ ldmia sp!, {r4-r9, pc}
+ ENDP ;|vp8_dequantize_b_loop_v6|
+
+ END
diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h
new file mode 100644
index 000000000..495004f9c
--- /dev/null
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -0,0 +1,49 @@
+#ifndef DBOOLHUFF_ARM_H
+#define DBOOLHUFF_ARM_H
+
+/* JLK
+ * There are currently no ARM-optimized versions of
+ * these functions. As they are implemented, they
+ * can be uncommented below and added to
+ * arm/dsystemdependent.c
+ *
+ * The existing asm code is likely so different as
+ * to be useless. However, it's been left (for now)
+ * for reference.
+ *
+ * NOTE(review): two of the ARMv6 names in the disabled block below
+ * (vp8_bool_decoder_fill_v6 and vp8_decode_bool_v6) do not match any
+ * symbol exported by armv6/dboolhuff_v6.asm, which exports
+ * vp8dx_decode_bool_v6 and has no fill routine. Check the names before
+ * re-enabling the block.
+ */
+/*
+#if HAVE_ARMV6
+#undef vp8_dbool_start
+#define vp8_dbool_start vp8dx_start_decode_v6
+
+#undef vp8_dbool_stop
+#define vp8_dbool_stop vp8dx_stop_decode_v6
+
+#undef vp8_dbool_fill
+#define vp8_dbool_fill vp8_bool_decoder_fill_v6
+
+#undef vp8_dbool_debool
+#define vp8_dbool_debool vp8_decode_bool_v6
+
+#undef vp8_dbool_devalue
+#define vp8_dbool_devalue vp8_decode_value_v6
+#endif // HAVE_ARMV6
+
+#if HAVE_ARMV7
+#undef vp8_dbool_start
+#define vp8_dbool_start vp8dx_start_decode_neon
+
+#undef vp8_dbool_stop
+#define vp8_dbool_stop vp8dx_stop_decode_neon
+
+#undef vp8_dbool_fill
+#define vp8_dbool_fill vp8_bool_decoder_fill_neon
+
+#undef vp8_dbool_debool
+#define vp8_dbool_debool vp8_decode_bool_neon
+
+#undef vp8_dbool_devalue
+#define vp8_dbool_devalue vp8_decode_value_neon
+#endif // HAVE_ARMV7
+*/
+#endif // DBOOLHUFF_ARM_H
diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c
new file mode 100644
index 000000000..54006a921
--- /dev/null
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "predictdc.h"
+#include "idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Inner loops implemented outside this file. Each takes the quantized
+ * coefficients (Q), the dequantization factors (DQC) and the destination
+ * buffer (DQ).
+ */
+#if HAVE_ARMV7
+/* presumably provided by neon/dequantizeb_neon.asm - confirm */
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV6
+/* exported by armv6/dequantize_v6.asm */
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV7
+
+/* Dequantizes one block: hands the block's quantized coefficients
+ * (d->qcoeff), its dequantization factors (starting at d->dequant[0][0])
+ * and its output buffer (d->dqcoeff) to the NEON loop.
+ */
+void vp8_dequantize_b_neon(BLOCKD *d)
+{
+    short *DQ = d->dqcoeff;
+    short *Q = d->qcoeff;
+    short *DQC = &d->dequant[0][0];
+
+    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_ARMV6
+/* Dequantizes one block: hands the block's quantized coefficients
+ * (d->qcoeff), its dequantization factors (starting at d->dequant[0][0])
+ * and its output buffer (d->dqcoeff) to the ARMv6 loop.
+ */
+void vp8_dequantize_b_v6(BLOCKD *d)
+{
+    short *DQ = d->dqcoeff;
+    short *Q = d->qcoeff;
+    short *DQC = &d->dequant[0][0];
+
+    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
new file mode 100644
index 000000000..c8a61a4a7
--- /dev/null
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/* Selects the ARM implementations of the dequantization entry points by
+ * redefining the vp8_dequant_* names. The ARMv7 section comes after the
+ * ARMv6 one, so when both HAVE_ARMV6 and HAVE_ARMV7 are set the NEON
+ * definitions are the ones left in effect.
+ */
+#ifndef DEQUANTIZE_ARM_H
+#define DEQUANTIZE_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_dequant_block(vp8_dequantize_b_v6);
+extern prototype_dequant_idct(vp8_dequant_idct_v6);
+extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
+
+#undef vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_v6
+
+#undef vp8_dequant_idct
+#define vp8_dequant_idct vp8_dequant_idct_v6
+
+#undef vp8_dequant_idct_dc
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_dequant_block(vp8_dequantize_b_neon);
+extern prototype_dequant_idct(vp8_dequant_idct_neon);
+extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
+
+#undef vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_neon
+
+#undef vp8_dequant_idct
+#define vp8_dequant_idct vp8_dequant_idct_neon
+
+#undef vp8_dequant_idct_dc
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
+#endif
+
+#endif
diff --git a/vp8/decoder/arm/detokenizearm_sjl.c b/vp8/decoder/arm/detokenizearm_sjl.c
new file mode 100644
index 000000000..c714452a6
--- /dev/null
+++ b/vp8/decoder/arm/detokenizearm_sjl.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "type_aliases.h"
+#include "blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define BR_COUNT 8
+#define BOOL_DATA UINT8
+
+/* Size of one coefficient band in probability entries. Parenthesized so
+ * the macro expands as a single value in any expression it is used in.
+ */
+#define OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES)
+//ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X};
+/* vp8_coef_bands_x[c] is the offset added to the Prob pointer for
+ * coefficient position c (band number times OCB_X).
+ * NOTE(review): the element type is UINT8, so 7 * OCB_X must fit in 8 bits
+ * - confirm against PREV_COEF_CONTEXTS and ENTROPY_NODES.
+ */
+DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+
+/* Indices into the per-context probability array (Prob[...]) used by the
+ * DECODE_AND_* macros in this file, one per decision node.
+ */
+#define EOB_CONTEXT_NODE 0
+#define ZERO_CONTEXT_NODE 1
+#define ONE_CONTEXT_NODE 2
+#define LOW_VAL_CONTEXT_NODE 3
+#define TWO_CONTEXT_NODE 4
+#define THREE_CONTEXT_NODE 5
+#define HIGH_LOW_CONTEXT_NODE 6
+#define CAT_ONE_CONTEXT_NODE 7
+#define CAT_THREEFOUR_CONTEXT_NODE 8
+#define CAT_THREE_CONTEXT_NODE 9
+#define CAT_FIVE_CONTEXT_NODE 10
+
+
+
+
+/* Per-token extra-bit description, indexed by token number. The decoder
+ * reads the members min_val, Length and Probs[] from these entries.
+ * NOTE(review): the initializers presumably map to { min_val, Length,
+ * Probs[] } in that order - the member order is fixed by TOKENEXTRABITS,
+ * which is declared elsewhere; confirm. The decoding code walks Probs[]
+ * from index Length down to 0.
+ */
+DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
+{
+ { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ZERO_TOKEN
+ { 1, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ONE_TOKEN
+ { 2, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //TWO_TOKEN
+ { 3, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //THREE_TOKEN
+ { 4, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //FOUR_TOKEN
+ { 5, 0, { 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY1
+ { 7, 1, { 145, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY2
+ { 11, 2, { 140, 148, 173, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY3
+ { 19, 3, { 135, 140, 155, 176, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY4
+ { 35, 4, { 130, 134, 141, 157, 180, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY5
+ { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 } }, //DCT_VAL_CATEGORY6
+ { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, // EOB TOKEN
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+/* Three 25-entry lookup tables stored back to back, one entry per block
+ * index 0..24: the context table first, then the left table, then the
+ * above table. The ONYXBLOCK2*_OFFSET macros in this file (0, 25, 50)
+ * give the start of each section.
+ */
+DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+/* Zeroes the above and left entropy-context entries of blocks 0..23 of
+ * macroblock x, and also those of block 24 (looked up under Y2CONTEXT)
+ * unless the macroblock mode is B_PRED or SPLITMV.
+ */
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT **const above = x->above_context;
+    ENTROPY_CONTEXT(* const left)[4] = x->left_context;
+    int blk = 0;
+
+    while (blk < 24)
+    {
+        const int ctx = vp8_block2context[blk];
+
+        left[ctx][vp8_block2left[blk]] = 0;
+        above[ctx][vp8_block2above[blk]] = 0;
+        blk++;
+    }
+
+    /* B_PRED and SPLITMV leave the block-24 entries untouched */
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+        return;
+
+    left[Y2CONTEXT][vp8_block2left[24]] = 0;
+    above[Y2CONTEXT][vp8_block2above[24]] = 0;
+}
+
+#define ONYXBLOCK2CONTEXT_OFFSET 0
+#define ONYXBLOCK2LEFT_OFFSET 25
+#define ONYXBLOCK2ABOVE_OFFSET 50
+
+/* Renormalisation shifts for range values below 128: for 1 <= r <= 127,
+ * norm[r] is the left shift that brings r into 128..255 (norm[0] is 0).
+ * The table has 128 entries only, so it must not be indexed with a range
+ * of 128 or more.
+ * The storage-class specifier is written first ("static const"), matching
+ * vp8d_token_extra_bits2 in this file.
+ */
+DECLARE_ALIGNED(16, static const unsigned char, norm[128]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+/* Fills dx->detoken with pointers to the lookup tables and buffers used
+ * for token decoding: the norm table, the coefficient tree, the combined
+ * block-to-context table, the band offsets, the zig-zag scan order, the
+ * extra-bits table, the start of the macroblock's qcoeff buffer, and the
+ * coefficient probabilities for each of the four block types.
+ */
+void init_detokenizer(VP8D_COMP *dx)
+{
+    const VP8_COMMON *const cm = &dx->common;
+    MACROBLOCKD *const mb = &dx->mb;
+    int type;
+
+    /* constant lookup tables */
+    dx->detoken.norm_ptr = (unsigned char *)norm;
+    dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree;
+    dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove;
+    dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x;
+    dx->detoken.scan = (int *)vp8_default_zig_zag1d;
+    dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2;
+
+    /* coefficient storage of this decoder's macroblock */
+    dx->detoken.qcoeff_start_ptr = &mb->qcoeff[0];
+
+    /* one probability base pointer per block type */
+    for (type = 0; type < 4; type++)
+        dx->detoken.coef_probs[type] = (unsigned char *)(cm->fc.coef_probs[type][0][0]);
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+
+// Commented-out alternatives to the table lookup used in NORMALIZE below:
+//   shift = norm[range];
+//   shift = norm_ptr[range];
+
+/* Renormalises the decoder after a bool has been decoded: when range has
+ * dropped below 0x80, shifts range and value left by the table-supplied
+ * amount, and pulls in the next input byte once count runs out.
+ * Expects the locals shift, range, value, count, bufptr and detoken.
+ * The range check is required: the table installed by init_detokenizer
+ * (norm) has only 128 entries, while range can be 128 or more here, in
+ * which case no shift is needed.
+ */
+#define NORMALIZE \
+    if(range < 0x80) \
+    { \
+        shift = detoken->norm_ptr[range]; \
+        range <<= shift; \
+        value <<= shift; \
+        count -= shift; \
+        if(count <= 0) \
+        { \
+            count += BR_COUNT ; \
+            value |= (*bufptr) << (BR_COUNT-count); \
+            bufptr++; \
+        } \
+    }
+#if 1
+/* Decodes one bool with split = (range + 1) >> 1 and sets v to
+ * value_to_sign (bit 0) or -value_to_sign (bit 1). range and value are
+ * then doubled, and one input byte is ORed into value when count reaches
+ * zero. Expects the locals split, range, value, v, count and bufptr.
+ */
+#define DECODE_AND_APPLYSIGN(value_to_sign) \
+ split = (range + 1) >> 1; \
+ if ( (value >> 24) < split ) \
+ { \
+ range = split; \
+ v= value_to_sign; \
+ } \
+ else \
+ { \
+ range = range-split; \
+ value = value-(split<<24); \
+ v = -value_to_sign; \
+ } \
+ range +=range; \
+ value +=value; \
+ if (!--count) \
+ { \
+ count = BR_COUNT; \
+ value |= *bufptr; \
+ bufptr++; \
+ }
+
+/* Decodes one bool with the given probability. On a zero bit: range =
+ * split, NORMALIZE, then jump to the label `branch`. On a one bit: value
+ * and range are reduced by the split, NORMALIZE, and execution falls
+ * through.
+ */
+#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
+ { \
+ split = 1 + ((( probability*(range-1) ) )>> 8); \
+ if ( (value >> 24) < split ) \
+ { \
+ range = split; \
+ NORMALIZE \
+ goto branch; \
+ } \
+ value -= (split<<24); \
+ range = range - split; \
+ NORMALIZE \
+ }
+
+/* Same decode as DECODE_AND_BRANCH_IF_ZERO, but on a zero bit it also
+ * resets Prob to coef_probs, increments c and adds vp8_coef_bands_x[c]
+ * to Prob before jumping to `branch`.
+ */
+#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
+ { \
+ split = 1 + ((( probability*(range-1) ) ) >> 8); \
+ if ( (value >> 24) < split ) \
+ { \
+ range = split; \
+ NORMALIZE \
+ Prob = coef_probs; \
+ ++c; \
+ Prob += vp8_coef_bands_x[c]; \
+ goto branch; \
+ } \
+ value -= (split<<24); \
+ range = range - split; \
+ NORMALIZE \
+ }
+
+/* Decodes the sign for val, points Prob at coef_probs + ENTROPY_NODES*2
+ * and stores the signed value at qcoeff_ptr[scan[c]]. While c < 15 it
+ * increments c and jumps to DO_WHILE; otherwise it stores at scan[15] and
+ * jumps to BLOCK_FINISHED. Both labels must exist in the enclosing
+ * function.
+ */
+#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
+ DECODE_AND_APPLYSIGN(val) \
+ Prob = coef_probs + (ENTROPY_NODES*2); \
+ if(c < 15){\
+ qcoeff_ptr [ scan[c] ] = (INT16) v; \
+ ++c; \
+ goto DO_WHILE; }\
+ qcoeff_ptr [ scan[15] ] = (INT16) v; \
+ goto BLOCK_FINISHED;
+
+
+/* Decodes one extra bit for token t using probability
+ * vp8d_token_extra_bits2[t].Probs[bits_count]; a one bit adds
+ * 1 << bits_count to the local val. Ends with NORMALIZE.
+ */
+#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
+ split = 1 + (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \
+ if(value >= (split<<24))\
+ {\
+ range = range-split;\
+ value = value-(split<<24);\
+ val += ((UINT16)1<<bits_count);\
+ }\
+ else\
+ {\
+ range = split;\
+ }\
+ NORMALIZE
+#endif
+
+#if 0
+/*
+ * Goto/macro based C detokenizer for one macroblock (compiled out by the
+ * enclosing "#if 0").
+ *
+ * Loads the boolean-decoder state from x->current_bc, decodes the coefficient
+ * tokens of each block into x->qcoeff, records each block's end position in
+ * x->Block[i].eob, updates the above/left entropy contexts and finally writes
+ * the decoder state back to the BOOL_DECODER.
+ *
+ * Block order: when the mode is neither B_PRED nor SPLITMV, block 24 is
+ * decoded first (type 1), then blocks 0..15 (type 0), then 16..23 (type 2).
+ * Otherwise blocks 0..15 are decoded with type 3, then 16..23 with type 2.
+ *
+ * Returns the sum of the eob values of the decoded blocks; the sum starts at
+ * -16 in the "block 24 first" case.
+ */
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ ENTROPY_CONTEXT **const A = x->above_context;
+ ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+ const VP8_COMMON *const oc = & dx->common;
+
+ BOOL_DECODER *bc = x->current_bc;
+
+ ENTROPY_CONTEXT *a;
+ ENTROPY_CONTEXT *l;
+ int i;
+
+ int eobtotal = 0;
+
+ register int count;
+
+ BOOL_DATA *bufptr;
+ register unsigned int range;
+ register unsigned int value;
+ const int *scan;
+ register unsigned int shift;
+ UINT32 split;
+ INT16 *qcoeff_ptr;
+
+ UINT8 *coef_probs;
+ int type;
+ int stop;
+ INT16 val, bits_count;
+ INT16 c;
+ INT16 t;
+ INT16 v;
+ vp8_prob *Prob;
+
+ //int *scan;
+ /* default: blocks 0..15 are decoded first, with type 3 */
+ type = 3;
+ i = 0;
+ stop = 16;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ /* block 24 is decoded first, with type 1 */
+ i = 24;
+ stop = 24;
+ type = 1;
+ qcoeff_ptr = &x->qcoeff[24*16];
+ scan = vp8_default_zig_zag1d;
+ eobtotal -= 16;
+ }
+ else
+ {
+ scan = vp8_default_zig_zag1d;
+ qcoeff_ptr = &x->qcoeff[0];
+ }
+
+ /* work on local copies of the boolean decoder state */
+ count = bc->count;
+ range = bc->range;
+ value = bc->value;
+ bufptr = &bc->buffer[bc->pos];
+
+
+ coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+
+BLOCK_LOOP:
+ /* locate the above/left context entries for block i */
+ a = A[ vp8_block2context[i] ] + vp8_block2above[i];
+ l = L[ vp8_block2context[i] ] + vp8_block2left[i];
+ /* first coefficient index: 1 when type == 0, otherwise 0 */
+ c = (INT16)(!type);
+
+ VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+ Prob = coef_probs;
+ Prob += t * ENTROPY_NODES;
+
+DO_WHILE:
+ Prob += vp8_coef_bands_x[c];
+ DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
+
+CHECK_0_:
+ DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_);
+ /* category 6: the number of extra bits comes from the table */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
+ bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
+
+ do
+ {
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
+ bits_count -- ;
+ }
+ while (bits_count >= 0);
+
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_FIVE_CONTEXT_NODE_0_:
+ /* category 5: extra bits 4..0 */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREEFOUR_CONTEXT_NODE_0_:
+ DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_);
+ /* category 4: extra bits 3..0 */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREE_CONTEXT_NODE_0_:
+ /* category 3: extra bits 2..0 */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+HIGH_LOW_CONTEXT_NODE_0_:
+ DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_);
+
+ /* category 2: extra bits 1..0 */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_ONE_CONTEXT_NODE_0_:
+ /* category 1: a single extra bit */
+ val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
+ DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+LOW_VAL_CONTEXT_NODE_0_:
+ DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
+ DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
+
+THREE_CONTEXT_NODE_0_:
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
+
+TWO_CONTEXT_NODE_0_:
+ DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
+
+ONE_CONTEXT_NODE_0_:
+ DECODE_AND_APPLYSIGN(1);
+ Prob = coef_probs + ENTROPY_NODES;
+
+ if (c < 15)
+ {
+ qcoeff_ptr [ scan[c] ] = (INT16) v;
+ ++c;
+ goto DO_WHILE;
+ }
+
+ /* last coefficient position: store it and fall through with c == 15 */
+ qcoeff_ptr [ scan[15] ] = (INT16) v;
+BLOCK_FINISHED:
+ t = ((x->Block[i].eob = c) != !type); // any nonzero data?
+ eobtotal += x->Block[i].eob;
+ *a = *l = t;
+ qcoeff_ptr += 16;
+
+ i++;
+
+ if (i < stop)
+ goto BLOCK_LOOP;
+
+ if (i == 25)
+ {
+ /* block 24 done: restart at block 0 with type 0 */
+ scan = vp8_default_zig_zag1d;//x->scan_order1d;
+ type = 0;
+ i = 0;
+ stop = 16;
+ coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+ qcoeff_ptr = &x->qcoeff[0];
+ goto BLOCK_LOOP;
+ }
+
+ if (i == 16)
+ {
+ /* blocks 0..15 done: continue with blocks 16..23 using type 2 */
+ type = 2;
+ coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+ stop = 24;
+ goto BLOCK_LOOP;
+ }
+
+ /* write the boolean decoder state back */
+ bc->count = count;
+ bc->value = value;
+ bc->range = range;
+ bc->pos = bufptr - bc->buffer;
+ return eobtotal;
+
+}
+//#endif
+#else
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+#if 0
+//uses relative offsets
+
+/*
+ * Disabled (#if 0) variant of the coefficient token tree. Non-positive
+ * entries are negated token values (leaves); positive entries are
+ * presumably offsets relative to the current node rather than absolute
+ * indices -- NOTE(review): confirm before re-enabling. The decoders in this
+ * file index the tree through detoken->vp8_coef_tree_ptr, not this table.
+ */
+const vp8_tree_index vp8_coef_tree_x[ 22] = /* corresponding _CONTEXT_NODEs */
+{
+ -DCT_EOB_TOKEN, 1, /* 0 = EOB */
+ -ZERO_TOKEN, 1, /* 1 = ZERO */
+ -ONE_TOKEN, 1, /* 2 = ONE */
+ 2, 5, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 1, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 2, 3, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 2, 3, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
+#endif
+
+/* Right shift applied to (range - 1) * probability when computing the
+ * split for extra bits in vp8_decode_mb_tokens_v5_c. */
+#define _SCALEDOWN 8 //16 //8
+
+/* Detokenizer entry point called by vp8_decode_mb_tokens; an assembly
+ * routine of this name is exported by detokenizearm_v6.asm. */
+int vp8_decode_mb_tokens_v5(DETOK *detoken, int type);
+
+/*
+ * Table-driven C detokenizer for one macroblock.
+ *
+ * detoken : decoding context; supplies the boolean decoder (current_bc), the
+ *           coefficient output pointer, the above/left context arrays and the
+ *           lookup tables (context map, band table, token tree, scan order,
+ *           extra-bit table, per-type probabilities).
+ * type    : initial block type. type == 1 starts with block 24; any other
+ *           value starts with block 0.
+ *
+ * For every decoded block the end position is stored in detoken->eob[i] and
+ * the above/left context entries are set to "block has data" (0 or 1).
+ * Coefficients are written to qcoeff in detoken->scan order. The boolean
+ * decoder state is written back to the BOOL_DECODER before returning.
+ *
+ * Always returns 0.
+ */
+int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type)
+{
+ BOOL_DECODER *bc = detoken->current_bc;
+
+ ENTROPY_CONTEXT *a;
+ ENTROPY_CONTEXT *l;
+ int i;
+
+ register int count;
+
+ BOOL_DATA *bufptr;
+ register unsigned int range;
+ register unsigned int value;
+ register unsigned int shift;
+ UINT32 split;
+ INT16 *qcoeff_ptr;
+
+ UINT8 *coef_probs;
+// int type;
+ int stop;
+ INT16 c;
+ INT16 t;
+ INT16 v;
+ vp8_prob *Prob;
+
+
+
+// type = 3;
+ i = 0;
+ stop = 16;
+ qcoeff_ptr = detoken->qcoeff_start_ptr;
+
+// if( detoken->mode != B_PRED && detoken->mode != SPLITMV)
+ if (type == 1)
+ {
+ /* start with block 24 only; its coefficients live at index 24 * 16 */
+ i += 24;
+ stop += 8; //24;
+// type = 1;
+ qcoeff_ptr += 24 * 16;
+// eobtotal-=16;
+ }
+
+ /* work on local copies of the boolean decoder state */
+ count = bc->count;
+ range = bc->range;
+ value = bc->value;
+ bufptr = &bc->buffer[bc->pos];
+
+
+ coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+
+BLOCK_LOOP:
+ /* one combined table: entry i selects the context plane, entries at
+ * i + ONYXBLOCK2ABOVE_OFFSET / i + ONYXBLOCK2LEFT_OFFSET give the position
+ * inside the above / left context */
+ a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ];
+ l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ];
+ /* first coefficient index: 1 when type == 0, otherwise 0 */
+ c = !type;
+ a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET];
+ l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET];
+
+ //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ //Dest = ((A)!=0) + ((B)!=0);
+
+ VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+
+ Prob = coef_probs;
+ Prob += t * ENTROPY_NODES;
+ /* t doubles as the tree position; 0 starts the walk at the EOB node */
+ t = 0;
+
+ do
+ {
+
+ {
+// onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x;
+
+ Prob += detoken->ptr_onyx_coef_bands_x[c];
+
+ GET_TOKEN_START:
+
+ /* walk the token tree: one boolean decode per node until a leaf
+ * (t <= 0) is reached */
+ do
+ {
+ split = 1 + (((range - 1) * (Prob[t>>1])) >> 8);
+
+ if (value >> 24 >= split)
+ {
+ range = range - split;
+ value = value - (split << 24);
+ t += 1;
+
+ //used to eliminate else branch
+ split = range;
+ }
+
+ range = split;
+
+ t = detoken->vp8_coef_tree_ptr[ t ];
+
+ NORMALIZE
+
+ }
+ while (t > 0) ;
+ }
+ GET_TOKEN_STOP:
+
+ if (t == -DCT_EOB_TOKEN)
+ {
+ break;
+ }
+
+ /* leaf values are negated token numbers */
+ v = -t;
+
+ if (v > FOUR_TOKEN)
+ {
+ /* category token: start from min_val and add the decoded extra
+ * bits, from bit index Length down to 0 */
+ INT16 bits_count;
+ TOKENEXTRABITS *teb_ptr;
+
+// teb_ptr = &onyxd_token_extra_bits2[t];
+// teb_ptr = &onyxd_token_extra_bits2[v];
+ teb_ptr = &detoken->teb_base_ptr[v];
+
+
+ v = teb_ptr->min_val;
+ bits_count = teb_ptr->Length;
+
+ do
+ {
+ split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN);
+
+ if ((value >> 24) >= split)
+ {
+ range = range - split;
+ value = value - (split << 24);
+ v += ((UINT16)1 << bits_count);
+
+ //used to eliminate else branch
+ split = range;
+ }
+
+ range = split;
+
+ NORMALIZE
+
+ bits_count -- ;
+ }
+ while (bits_count >= 0);
+ }
+
+ Prob = coef_probs;
+
+ if (t)
+ {
+ /* nonzero token: decode the sign with probability vp8_prob_half */
+ split = 1 + (((range - 1) * vp8_prob_half) >> 8);
+
+ if ((value >> 24) >= split)
+ {
+ range = range - split;
+ value = value - (split << 24);
+ v = (v ^ -1) + 1; /* negate w/out conditionals */
+
+ //used to eliminate else branch
+ split = range;
+ }
+
+ range = split;
+
+ NORMALIZE
+ /* probability row for the next token: +1 row after a ONE token,
+ * +2 rows after any larger token */
+ Prob += ENTROPY_NODES;
+
+ if (t < -ONE_TOKEN)
+ Prob += ENTROPY_NODES;
+
+ t = -2;
+ }
+
+ //if t is zero, we will skip the eob table check
+ t += 2;
+ qcoeff_ptr [detoken->scan [c] ] = (INT16) v;
+
+ }
+ while (++c < 16);
+
+ /* the loop ran to completion (no EOB token): report position 15 */
+ if (t != -DCT_EOB_TOKEN)
+ {
+ --c;
+ }
+
+ t = ((detoken->eob[i] = c) != !type); // any nonzero data?
+// eobtotal += detoken->eob[i];
+ *a = *l = t;
+ qcoeff_ptr += 16;
+
+ i++;
+
+ if (i < stop)
+ goto BLOCK_LOOP;
+
+ if (i == 25)
+ {
+ /* block 24 done: restart at block 0 with type 0 */
+ type = 0;
+ i = 0;
+ stop = 16;
+// coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+ coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+ qcoeff_ptr = detoken->qcoeff_start_ptr;
+ goto BLOCK_LOOP;
+ }
+
+ if (i == 16)
+ {
+ /* blocks 0..15 done: continue with blocks 16..23 using type 2 */
+ type = 2;
+// coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+ coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+ stop = 24;
+ goto BLOCK_LOOP;
+ }
+
+ /* write the boolean decoder state back */
+ bc->count = count;
+ bc->value = value;
+ bc->range = range;
+ bc->pos = bufptr - bc->buffer;
+ return 0;
+}
+//#if 0
+/*
+ * Decode the coefficient tokens of one macroblock through the DETOK based
+ * detokenizer (vp8_decode_mb_tokens_v5).
+ *
+ * dx : decoder instance; dx->detoken is the detokenizer context.
+ * x  : macroblock being decoded; supplies the boolean decoder, the entropy
+ *      contexts and the prediction mode, and receives the per-block eobs.
+ *
+ * Returns the sum of the eob values of the blocks that were decoded, starting
+ * from -16 when the mode is neither B_PRED nor SPLITMV.
+ *
+ * Only current_bc, A and L are refreshed here. The remaining DETOK fields are
+ * not set by this function; the commented-out assignments below list them:
+ *
+ * dx->detoken.norm_ptr = norm;
+ * dx->detoken.onyx_coef_tree_ptr = onyx_coef_tree;
+ * dx->detoken.ptr_onyxblock2context_leftabove = ONYXBLOCK2CONTEXT_LEFTABOVE;
+ * dx->detoken.ptr_onyx_coef_bands_x = onyx_coef_bands_x;
+ * dx->detoken.scan = default_zig_zag1d;
+ * dx->detoken.teb_base_ptr = onyxd_token_extra_bits2;
+ * dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
+ * dx->detoken.coef_probs[0..3] = (unsigned char *)(oc->fc.coef_probs [0..3] [ 0 ] [0]);
+ */
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ int eobtotal = 0;
+ int i, type;
+ /* Blocks 0..23 are always decoded; block 24 only when type == 1. */
+ int decoded_blocks = 24;
+
+ dx->detoken.current_bc = x->current_bc;
+ dx->detoken.A = x->above_context;
+ dx->detoken.L = x->left_context;
+
+ type = 3;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ type = 1;
+ eobtotal -= 16;
+ decoded_blocks = 25;
+ }
+
+ vp8_decode_mb_tokens_v5(&dx->detoken, type);
+
+ /* Copy back only the eobs the detokenizer wrote for this macroblock.
+ * With type == 3 block 24 is skipped, so detoken.eob[24] still holds the
+ * value from an earlier macroblock and must not be copied or summed. */
+ for (i = 0; i < decoded_blocks; i++)
+ {
+ x->Block[i].eob = dx->detoken.eob[i];
+ eobtotal += dx->detoken.eob[i];
+ }
+
+ return eobtotal;
+}
+#endif
diff --git a/vp8/decoder/arm/detokenizearm_v6.asm b/vp8/decoder/arm/detokenizearm_v6.asm
new file mode 100644
index 000000000..4d87ee5bd
--- /dev/null
+++ b/vp8/decoder/arm/detokenizearm_v6.asm
@@ -0,0 +1,364 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_decode_mb_tokens_v5|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ INCLUDE vpx_asm_offsets.asm
+
+; Offsets (from sp) of the local variables spilled to the stack frame by
+; vp8_decode_mb_tokens_v5. l_stacksize is the number of bytes reserved for
+; the frame; the slots below occupy the first 36 bytes of it.
+l_qcoeff EQU 0
+l_i EQU 4
+l_type EQU 8
+l_stop EQU 12
+l_c EQU 16
+l_l_ptr EQU 20
+l_a_ptr EQU 24
+l_bc EQU 28
+l_coef_ptr EQU 32
+l_stacksize EQU 64
+
+
+;; constant offsets -- these should be created at build time
+;; c_onyxblock2left_offset / c_onyxblock2above_offset: byte offsets added to
+;; &ptr_onyxblock2context_leftabove[i] to reach the left / above entry.
+;; c_entropy_nodes: multiplier for the combined context when forming Prob.
+;; c_dct_eob_token: compared (negated) against the decoded tree leaf.
+c_onyxblock2left_offset EQU 25
+c_onyxblock2above_offset EQU 50
+c_entropy_nodes EQU 11
+c_dct_eob_token EQU 11
+
+;-----------------------------------------------------------------------
+; int vp8_decode_mb_tokens_v5(DETOK *detoken, int type)
+;
+; Detokenizes one macroblock: for every block it walks the coefficient
+; token tree with the boolean decoder, stores the coefficients in scan
+; order, records the end position in detoken->eob[i] and updates the
+; above/left entropy contexts.
+;
+; In:   r0 = detoken, r1 = type (1 = start with block 24, else block 0)
+; Out:  r0 = 0
+;
+; Live for the whole routine:
+;   r4 = value, r5 = count, r6 = range, r8 = bufptr, r9 = detoken
+; On entry to BLOCK_LOOP:
+;   r7 = type, r10 = coef_probs, r11 = i
+; Inside the coefficient loops:
+;   r0 = Prob, r1 = t, r7 = c, lr = tree pointer / v,
+;   r10, r11, r12, r2, r3 = scratch
+;-----------------------------------------------------------------------
+|vp8_decode_mb_tokens_v5| PROC
+ stmdb sp!, {r4 - r11, lr}
+ sub sp, sp, #l_stacksize
+ mov r7, r1 ;int type
+ mov r9, r0 ;DETOK *detoken
+
+ ldr r1, [r9, #detok_current_bc] ;bc
+ ldr r0, [r9, #detok_qcoeff_start_ptr] ;qcoeff_ptr
+ mov r11, #0 ;i = 0
+ mov r3, #0x10 ;stop = 16
+
+ cmp r7, #1 ;if (type == 1)
+ addeq r11, r11, #24 ; i += 24
+ addeq r3, r3, #8 ; stop += 8
+ addeq r0, r0, #3, 24 ; qcoeff_ptr += 24 * 16 shorts (0x300 bytes)
+
+ str r0, [sp, #l_qcoeff]
+ str r11, [sp, #l_i]
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+ str r1, [sp, #l_bc]
+
+ add lr, r9, r7, lsl #2 ;detoken + type * 4
+
+ ldr r2, [r1, #bool_decoder_buffer]
+ ldr r3, [r1, #bool_decoder_pos]
+
+ ldr r10, [lr, #detok_coef_probs] ;coef_probs = detoken->coef_probs[type]
+ ldr r5, [r1, #bool_decoder_count] ;count
+ ldr r6, [r1, #bool_decoder_range] ;range
+ ldr r4, [r1, #bool_decoder_value] ;value
+ add r8, r2, r3 ;bufptr = &bc->buffer[bc->pos]
+
+ str r10, [sp, #l_coef_ptr]
+
+
+ ;align 4
+BLOCK_LOOP
+ ldr r3, [r9, #detok_ptr_onyxblock2context_leftabove]
+ ldr r2, [r9, #DETOK_A]
+ ldr r1, [r9, #DETOK_L]
+ ldrb r12, [r3, +r11] ; detoken->ptr_onyxblock2context_leftabove[i]
+
+ cmp r7, #0 ; c = !type
+ moveq r7, #1
+ movne r7, #0
+
+ ldr r0, [r2, +r12, lsl #2] ; a = A[context]
+ add r1, r1, r12, lsl #4 ; l = L[context] (16 bytes per entry)
+ add r3, r3, r11 ; &table[i]
+
+ ldrb r2, [r3, #c_onyxblock2above_offset]
+ ldrb r3, [r3, #c_onyxblock2left_offset]
+ mov lr, #c_entropy_nodes
+
+ ldr r2, [r0, +r2, lsl #2]! ; a += above offset; r2 = *a
+ add r3, r1, r3, lsl #2 ; l += left offset
+ str r3, [sp, #l_l_ptr]
+ ldr r3, [r3] ; r3 = *l
+
+ cmp r2, #0 ; t = (*a != 0) + (*l != 0)
+ movne r2, #1
+ cmp r3, #0
+ addne r2, r2, #1
+
+ str r0, [sp, #l_a_ptr]
+ smlabb r0, r2, lr, r10 ; Prob = coef_probs + t * ENTROPY_NODES
+ mov r1, #0 ; t = 0
+ str r7, [sp, #l_c]
+
+ ;align 4
+COEFF_LOOP
+ ldr r3, [r9, #detok_ptr_onyx_coef_bands_x]
+ ldr lr, [r9, #detok_onyx_coef_tree_ptr]
+
+;;the following two lines are used if onyx_coef_bands_x is UINT16
+;; add r3, r3, r7, lsl #1
+;; ldrh r3, [r3]
+
+;;the following line is used if onyx_coef_bands_x is UINT8
+ ldrb r3, [r7, +r3]
+
+;; pld [r8]
+ add r0, r0, r3 ; Prob += coef_bands_x[c]
+
+ ;align 4
+get_token_loop
+ ldrb r2, [r0, +r1, asr #1] ;probability = Prob[t >> 1]
+ mov r3, r6, lsl #8
+ sub r3, r3, #256 ;(range - 1) << 8
+ mov r10, #1
+
+ smlawb r2, r3, r2, r10 ;split = 1 + (((range-1) * probability) >> 8)
+ ldrb r12, [r8] ;load cx data byte in stall slot
+
+ subs r3, r4, r2, lsl #24 ;x = value-(split<<24)
+ addhs r1, r1, #1 ;t += 1
+ movhs r4, r3 ;update value
+ subhs r2, r6, r2 ;range = range - split
+ movlo r6, r2 ;range = split
+
+ ldrsb r1, [r1, +lr] ;t = tree[t]
+
+;; use branch for short pipelines ???
+;; cmp r2, #0x80
+;; bcs |$LN22@decode_mb_to|
+
+ ;normalize: shift range back up to 8 significant bits, refill value
+ clz r3, r2
+ sub r3, r3, #24 ;shift
+ subs r5, r5, r3 ;count -= shift
+ mov r6, r2, lsl r3 ;range <<= shift
+ mov r4, r4, lsl r3 ;value <<= shift
+
+;; use branch for short pipelines ???
+;; bgt |$LN22@decode_mb_to|
+
+ addle r5, r5, #8 ;count ran out: count += 8
+ rsble r3, r5, #8
+ addle r8, r8, #1 ;consume the preloaded byte
+ orrle r4, r4, r12, lsl r3 ;value |= byte << (8 - count)
+
+;;|$LN22@decode_mb_to|
+
+ cmp r1, #0
+ bgt get_token_loop ;while (t > 0)
+
+ cmn r1, #c_dct_eob_token ;if(t == -DCT_EOB_TOKEN)
+ beq END_OF_BLOCK
+
+ rsb lr, r1, #0 ;v = -t;
+
+ cmp lr, #4 ;if(v > FOUR_TOKEN)
+ ble SKIP_EXTRABITS
+
+ ldr r3, [r9, #detok_teb_base_ptr]
+ mov r11, #1
+ add r7, r3, lr, lsl #4 ;teb_ptr = &teb_base_ptr[v] (16 bytes per entry)
+
+ ldrsh lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val
+ ldrsh r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length
+
+extrabits_loop
+ add r3, r0, r7
+
+ ldrb r2, [r3, #4] ;probability byte at entry offset 4 + bits_count
+ mov r3, r6, lsl #8
+ sub r3, r3, #256 ;(range - 1) << 8
+ mov r10, #1
+
+ smlawb r2, r3, r2, r10 ;split = 1 + (((range-1) * probability) >> 8)
+ ldrb r12, [r8]
+
+ subs r10, r4, r2, lsl #24 ;x = value-(split<<24)
+ movhs r4, r10 ;update value
+ subhs r2, r6, r2 ;range = range - split
+ addhs lr, lr, r11, lsl r0 ;v += ((UINT16)1<<bits_count)
+ movlo r6, r2 ;range = split
+
+
+;; use branch for short pipelines ???
+;; cmp r2, #0x80
+;; bcs |$LN10@decode_mb_to|
+
+ clz r3, r2
+ sub r3, r3, #24
+ subs r5, r5, r3
+ mov r6, r2, lsl r3 ;range
+ mov r4, r4, lsl r3 ;value
+
+ addle r5, r5, #8
+ addle r8, r8, #1
+ rsble r3, r5, #8
+ orrle r4, r4, r12, lsl r3
+
+;;|$LN10@decode_mb_to|
+ subs r0, r0, #1 ;bits_count--
+ bpl extrabits_loop ;while (bits_count >= 0)
+
+
+SKIP_EXTRABITS
+ ldr r11, [sp, #l_qcoeff]
+ ldr r0, [sp, #l_coef_ptr] ;Prob = coef_probs
+
+ cmp r1, #0 ;check for nonzero token
+ beq SKIP_EOB_CHECK ;if t is zero, we will skip the eob table check
+
+ ;decode the sign with probability 128
+ sub r3, r6, #1 ;range - 1
+ mov r3, r3, lsl #7 ; *= onyx_prob_half (128)
+ mov r3, r3, lsr #8
+ add r2, r3, #1 ;split
+
+ subs r3, r4, r2, lsl #24 ;x = value-(split<<24)
+ movhs r4, r3 ;update value
+ subhs r2, r6, r2 ;range = range - split
+ mvnhs r3, lr
+ addhs lr, r3, #1 ;v = (v ^ -1) + 1
+ movlo r6, r2 ;range = split
+
+;; use branch for short pipelines ???
+;; cmp r2, #0x80
+;; bcs |$LN6@decode_mb_to|
+
+ clz r3, r2
+ sub r3, r3, #24
+ subs r5, r5, r3
+ mov r6, r2, lsl r3
+ mov r4, r4, lsl r3
+ ldrleb r2, [r8], #1
+ addle r5, r5, #8
+ rsble r3, r5, #8
+ orrle r4, r4, r2, lsl r3
+
+;;|$LN6@decode_mb_to|
+ add r0, r0, #c_entropy_nodes ;Prob += ENTROPY_NODES
+
+ cmn r1, #1 ;if (t < -ONE_TOKEN)
+
+ addlt r0, r0, #c_entropy_nodes ; Prob += ENTROPY_NODES
+
+ mvn r1, #1 ;t = -2
+
+SKIP_EOB_CHECK
+ ldr r7, [sp, #l_c]
+ ldr r3, [r9, #detok_scan]
+ add r1, r1, #2 ;t += 2
+ cmp r7, #(0x10 - 1) ;assume one less for now.... increment below
+
+ ;none of the following instructions up to the blt changes the flags
+ ldr r3, [r3, +r7, lsl #2] ;scan[c]
+ add r7, r7, #1 ;++c
+ add r3, r11, r3, lsl #1 ;&qcoeff_ptr[scan[c]]
+
+ str r7, [sp, #l_c]
+ strh lr, [r3] ;qcoeff_ptr[scan[c]] = v
+
+ blt COEFF_LOOP
+
+ sub r7, r7, #1 ;if(t != -DCT_EOB_TOKEN) --c
+
+END_OF_BLOCK
+ ldr r3, [sp, #l_type]
+ ldr r10, [sp, #l_coef_ptr]
+ ldr r0, [sp, #l_qcoeff]
+ ldr r11, [sp, #l_i]
+ ldr r12, [sp, #l_stop]
+
+ cmp r3, #0 ;r1 = !type
+ moveq r1, #1
+ movne r1, #0
+ add r3, r11, r9
+
+ cmp r7, r1 ;c != !type ? (strb below keeps the flags)
+ strb r7, [r3, #detok_eob] ;detoken->eob[i] = c
+
+ ldr r7, [sp, #l_l_ptr]
+ ldr r2, [sp, #l_a_ptr]
+ movne r3, #1
+ moveq r3, #0
+
+ add r0, r0, #0x20 ;qcoeff_ptr += 16 shorts
+ add r11, r11, #1 ;i++
+ str r3, [r7] ;*l = t
+ str r3, [r2] ;*a = t
+ str r0, [sp, #l_qcoeff]
+ str r11, [sp, #l_i]
+
+ cmp r11, r12 ;i >= stop ?
+ ldr r7, [sp, #l_type]
+ mov lr, #c_entropy_nodes
+
+ blt BLOCK_LOOP
+
+ cmp r11, #0x19 ;if (i == 25)
+ bne ln2_decode_mb_to
+
+ ;block 24 done: restart at block 0 with type 0
+ ldr r12, [r9, #detok_qcoeff_start_ptr]
+ ldr r10, [r9, #detok_coef_probs] ;coef_probs[0]
+ mov r7, #0 ;type = 0
+ mov r11, #0 ;i = 0; BLOCK_LOOP indexes with r11, not with [sp, #l_i]
+ mov r3, #0x10 ;stop = 16
+ str r12, [sp, #l_qcoeff]
+ str r11, [sp, #l_i]
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+
+ str r10, [sp, #l_coef_ptr]
+
+ b BLOCK_LOOP
+
+ln2_decode_mb_to
+ cmp r11, #0x10 ;if (i == 16)
+ bne ln1_decode_mb_to
+
+ ;blocks 0..15 done: continue with blocks 16..23 using type 2
+ ldr r10, [r9, #(detok_coef_probs + 8)] ;coef_probs[2] (4-byte pointers)
+
+ mov r7, #2 ;type = 2
+ mov r3, #0x18 ;stop = 24
+
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+
+ str r10, [sp, #l_coef_ptr]
+ b BLOCK_LOOP
+
+ln1_decode_mb_to
+ ;write the boolean decoder state back and return 0
+ ldr r2, [sp, #l_bc]
+ mov r0, #0
+
+ ldr r3, [r2, #bool_decoder_buffer]
+ str r5, [r2, #bool_decoder_count]
+ str r4, [r2, #bool_decoder_value]
+ sub r3, r8, r3 ;pos = bufptr - bc->buffer
+ str r3, [r2, #bool_decoder_pos]
+ str r6, [r2, #bool_decoder_range]
+
+ add sp, sp, #l_stacksize
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_decode_mb_tokens_v5|
+
+ END
diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c
new file mode 100644
index 000000000..455c83a9c
--- /dev/null
+++ b/vp8/decoder/arm/dsystemdependent.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "pragmas.h"
+#include "postproc.h"
+#include "dboolhuff.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+/*
+ * Fill in the ARM-specific decoder function pointers of pbi.
+ *
+ * Nothing is done unless CONFIG_RUNTIME_CPU_DETECT is enabled. With it, the
+ * dequantize/IDCT entries point at the NEON routines when HAVE_ARMV7 is set
+ * and at the ARMv6 routines when only HAVE_ARMV6 is set; in both cases the
+ * boolean decoder entries point at the C implementations.
+ */
+void vp8_dmachine_specific_config(VP8D_COMP *pbi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ pbi->mb.rtcd = &pbi->common.rtcd;
+
+ /* dequantization and inverse transform */
+#if HAVE_ARMV7
+ pbi->dequant.block = vp8_dequantize_b_neon;
+ pbi->dequant.idct = vp8_dequant_idct_neon;
+ pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
+#elif HAVE_ARMV6
+ pbi->dequant.block = vp8_dequantize_b_v6;
+ pbi->dequant.idct = vp8_dequant_idct_v6;
+ pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
+#endif
+
+ /* boolean decoder: identical C entry points for both ARM variants */
+#if HAVE_ARMV7 || HAVE_ARMV6
+ pbi->dboolhuff.start = vp8dx_start_decode_c;
+ pbi->dboolhuff.stop = vp8dx_stop_decode_c;
+ pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
+ pbi->dboolhuff.debool = vp8dx_decode_bool_c;
+ pbi->dboolhuff.devalue = vp8dx_decode_value_c;
+#endif
+#endif
+}
diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm
new file mode 100644
index 000000000..7ec62a3d8
--- /dev/null
+++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm
@@ -0,0 +1,159 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_decode_value_neon|
+ EXPORT |vp8dx_start_decode_neon|
+ EXPORT |vp8dx_stop_decode_neon|
+ EXPORT |vp8dx_decode_bool_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ INCLUDE vpx_asm_offsets.asm
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+; int z = 0;
+; int bit;
+; for ( bit=bits-1; bit>=0; bit-- )
+; {
+; z |= (vp8dx_decode_bool(br, 0x80)<<bit);
+; }
+; return z;
+
+;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
+; Decodes 'bits' booleans (probability 0x80 each) and packs them MSB first.
+; In:  r0 = br, r1 = bits
+; Out: r0 = decoded value (0 when bits <= 0)
+; r4 = br, r5 = current bit index, r6 = accumulated result
+|vp8_decode_value_neon| PROC
+ stmdb sp!, {r4 - r6, lr}
+ mov r4, r0
+ mov r5, r1
+ mov r6, #0
+
+ subs r5, r5, #1 ;bit = bits - 1
+ bmi decode_value_exit ;nothing to decode
+
+decode_value_loop
+ mov r1, #0x80
+ mov r0, r4
+ bl vp8dx_decode_bool_neon_internal ; needed for conversion to s file
+ orr r6, r6, r0, lsl r5 ;z |= bit value << bit
+ subs r5, r5, #1
+ bpl decode_value_loop ;while (bit >= 0)
+
+decode_value_exit
+ mov r0, r6
+ ldmia sp!, {r4 - r6, pc}
+ ENDP ; |vp8_decode_value_neon|
+
+
+;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
+; Initializes a BOOL_DECODER for the byte stream at 'source'.
+; In: r0 = br, r1 = source
+; Sets lowvalue = 0, range = 255, buffer = source, count = 8, pos = 4 and
+; value = the first four source bytes combined with source[0] in bits 31..24.
+|vp8dx_start_decode_neon| PROC
+ stmdb sp!, {r4 - r5, lr}
+ mov r2, #0
+ mov r3, #255
+
+ str r2, [r0, #bool_decoder_lowvalue]
+ str r3, [r0, #bool_decoder_range]
+ str r1, [r0, #bool_decoder_buffer]
+
+ mov r3, #8
+ mov r2, #4
+ str r3, [r0, #bool_decoder_count]
+ str r2, [r0, #bool_decoder_pos]
+
+ ldrb r2, [r1, #3]
+ ldrb r3, [r1, #2]
+ ldrb r4, [r1, #1]
+ ldrb r5, [r1]
+
+ ;value = source[0]<<24 | source[1]<<16 | source[2]<<8 | source[3]
+ orr r1, r2, r3, lsl #8
+ orr r1, r1, r4, lsl #16
+ orr r1, r1, r5, lsl #24
+
+ str r1, [r0, #bool_decoder_value]
+
+ ldmia sp!, {r4 - r5, pc}
+ ENDP ; |vp8dx_start_decode_neon|
+
+
+;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
+; Nothing to release: returns immediately. r0 (bc) is not used.
+|vp8dx_stop_decode_neon| PROC
+ bx lr ;same return form as the other NEON routines
+ ENDP ; |vp8dx_stop_decode_neon|
+
+
+; bigsplit RN r1
+; buffer_v RN r1
+; count_v RN r4
+; range_v RN r2
+; value_v RN r3
+; pos_v RN r5
+; split RN r6
+; bit RN lr
+;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
+; Decodes one boolean with the given probability and updates the decoder.
+; In:  r0 = br, r1 = probability
+; Out: r0 = decoded bit (0 or 1)
+; Register use in the body: r2 = range, r3 = value, r4 = split, lr = br.
+; The strd/ldrd accesses treat (range, value) and (count, pos) as adjacent
+; word pairs in BOOL_DECODER -- NOTE(review): confirm against the structure
+; layout and its alignment.
+|vp8dx_decode_bool_neon| PROC
+vp8dx_decode_bool_neon_internal
+;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
+;before memory allocation
+ stmdb sp!, {r4 - r5, lr}
+
+ ldr r2, [r0, #bool_decoder_range] ;load range (r2), value(r3)
+ ldr r3, [r0, #bool_decoder_value]
+ ;ldrd r2, r3, [r0, #bool_decoder_range] ;ldrd costs 2 cycles
+ ;
+
+ mov r4, r2, lsl #8
+ sub r4, r4, #256 ;(range - 1) << 8
+ mov r12, #1
+
+ smlawb r4, r4, r1, r12 ;split = 1 + (((range-1) * probability) >> 8)
+
+ mov lr, r0 ;keep br; r0 becomes the return value
+ mov r0, #0 ;bit = 0
+ ;
+ subs r5, r3, r4, lsl #24 ;value - (split << 24); hs when value >= bigsplit
+
+ subhs r2, r2, r4 ;range = br->range-split
+ movlo r2, r4 ;range = split
+ movhs r0, #1 ;bit = 1
+ movhs r3, r5 ;value = value-bigsplit
+
+ cmp r2, #0x80
+ blt range_less_0x80 ;range < 0x80: shift range and value up
+ strd r2, r3, [lr, #bool_decoder_range] ;store result
+
+ ldmia sp!, {r4 - r5, pc}
+
+range_less_0x80
+
+ ldrd r4, r5, [lr, #bool_decoder_count] ;load count (r4) and pos (r5)
+ ldr r1, [lr, #bool_decoder_buffer] ;load buffer
+
+ clz r12, r2
+ add r1, r1, r5 ;&br->buffer[br->pos]
+
+ sub r12, r12, #24 ;shift = clz(range) - 24
+ subs r4, r4, r12 ;count -= shift
+ mov r2, r2, lsl r12 ;range <<= shift
+ mov r3, r3, lsl r12 ;value <<= shift
+ addle r4, r4, #8 ;count += 8
+ ldrleb r12, [r1], #1 ;br->buffer[br->pos]
+
+ rsble r1, r4, #8 ;8 - count
+ addle r5, r5, #1 ;br->pos++
+ orrle r3, r3, r12, lsl r1 ;value |= (br->buffer[br->pos]) << (8 - count)
+
+ strd r2, r3, [lr, #bool_decoder_range] ;store result
+ strd r4, r5, [lr, #bool_decoder_count]
+
+ ldmia sp!, {r4 - r5, pc}
+ ENDP ; |vp8dx_decode_bool_neon|
+
+ END
diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequantdcidct_neon.asm
new file mode 100644
index 000000000..3392f2c2b
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantdcidct_neon.asm
@@ -0,0 +1,133 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_dc_idct_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;-----------------------------------------------------------------------
+;void vp8_dequant_dc_idct_neon(short *input, short *dq, short *output, int pitch, int Dc);
+; (NEON counterpart of vp8_dequant_dc_idct_c)
+; r0 short *input, 16 coefficients; overwritten with zeros on return
+; r1 short *dq, 16 dequantization factors
+; r2 short *output, four rows of four shorts
+; r3 int pitch, byte distance between output rows
+; (stack) int Dc, replaces the dequantized coefficient 0
+;
+; Dequantizes input (input[i] * dq[i]), substitutes Dc, and runs the two
+; passes of the 4x4 inverse DCT.
+;
+; Only caller-saved NEON registers are used (q0-q3, q8-q12, q14, q15).
+; q4-q7 (d8-d15) are callee-saved under the AAPCS and are not touched.
+;-----------------------------------------------------------------------
+|vp8_dequant_dc_idct_neon| PROC
+ vld1.16 {q1, q2}, [r0] ;coefficients
+ vld1.16 {q8, q9}, [r1] ;dequantization factors
+
+ ldr r1, [sp] ;load Dc from stack
+
+ ldr r12, _dcidct_coeff_
+
+ vmul.i16 q1, q1, q8 ;input for short_idct4x4llm_neon
+ vmul.i16 q2, q2, q9
+
+ vmov.16 d2[0], r1 ;coefficient 0 = Dc
+
+;|short_idct4x4llm_neon| PROC
+ vld1.16 {d0}, [r12] ;d0 = 20091, 20091, 35468, 35468
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q10, q2, d0[0]
+
+ vqadd.s16 d24, d2, d3 ;a1
+ vqsub.s16 d25, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q10, q10, #1
+
+ vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q10, q10, q2
+
+ ;d6 - c1:temp1
+ ;d7 - d1:temp2
+ ;d20 - d1:temp1
+ ;d21 - c1:temp2
+
+ vqsub.s16 d22, d6, d21 ;c1
+ vqadd.s16 d23, d7, d20 ;d1
+
+ vqadd.s16 d2, d24, d23
+ vqadd.s16 d3, d25, d22
+ vqsub.s16 d4, d25, d22
+ vqsub.s16 d5, d24, d23
+
+ ;transpose the 4x4 block for the second pass
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+ vmov.i16 q14, #0
+
+ vswp d3, d4
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q10, q2, d0[0]
+
+ vqadd.s16 d24, d2, d3 ;a1
+ vqsub.s16 d25, d2, d3 ;b1
+
+ vmov q15, q14
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q10, q10, #1
+
+ vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q10, q10, q2
+
+ vqsub.s16 d22, d6, d21 ;c1
+ vqadd.s16 d23, d7, d20 ;d1
+
+ vqadd.s16 d2, d24, d23
+ vqadd.s16 d3, d25, d22
+ vqsub.s16 d4, d25, d22
+ vqsub.s16 d5, d24, d23
+
+ vst1.16 {q14, q15}, [r0] ;clear the 16 input coefficients
+
+ ;rounding shift: (x + 4) >> 3
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ ;addresses of output rows 1..3
+ add r1, r2, r3
+ add r12, r1, r3
+ add r0, r12, r3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vst1.16 {d2}, [r2]
+ vst1.16 {d3}, [r1]
+ vst1.16 {d4}, [r12]
+ vst1.16 {d5}, [r0]
+
+ bx lr
+
+ ENDP
+
+;-----------------
+ AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default
+;Data section dcidct4x4_dat. _dcidct_coeff_ is one word holding the address of
+;dcidct_coeff; the code loads that word and then reads the constants through it.
+;dcidct_coeff holds two words (four 16-bit multipliers) at dcidct_coeff and dcidct_coeff+4.
+_dcidct_coeff_
+ DCD dcidct_coeff
+dcidct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;as 16-bit values: 20091, 20091, 35468, 35468
+
+ END
diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequantidct_neon.asm
new file mode 100644
index 000000000..bba4d5dfb
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantidct_neon.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_idct_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;-----------------------------------------------------------------------
+;void vp8_dequant_idct_neon(short *input, short *dq, short *output, int pitch);
+; (NEON counterpart of vp8_dequant_idct_c)
+; r0 short *input, 16 coefficients; overwritten with zeros on return
+; r1 short *dq, 16 dequantization factors
+; r2 short *output, four rows of four shorts
+; r3 int pitch, byte distance between output rows
+;
+; Dequantizes input (input[i] * dq[i]) and runs the two passes of the 4x4
+; inverse DCT.
+;
+; Only caller-saved NEON registers are used (q0-q3, q8-q12, q14, q15).
+; q4-q7 (d8-d15) are callee-saved under the AAPCS and are not touched.
+;-----------------------------------------------------------------------
+|vp8_dequant_idct_neon| PROC
+ vld1.16 {q1, q2}, [r0] ;coefficients
+ vld1.16 {q8, q9}, [r1] ;dequantization factors
+
+ ldr r12, _didct_coeff_
+
+ vmul.i16 q1, q1, q8 ;input for short_idct4x4llm_neon
+ vmul.i16 q2, q2, q9
+
+;|short_idct4x4llm_neon| PROC
+ vld1.16 {d0}, [r12] ;d0 = 20091, 20091, 35468, 35468
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q10, q2, d0[0]
+
+ vqadd.s16 d24, d2, d3 ;a1
+ vqsub.s16 d25, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q10, q10, #1
+
+ vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q10, q10, q2
+
+ ;d6 - c1:temp1
+ ;d7 - d1:temp2
+ ;d20 - d1:temp1
+ ;d21 - c1:temp2
+
+ vqsub.s16 d22, d6, d21 ;c1
+ vqadd.s16 d23, d7, d20 ;d1
+
+ vqadd.s16 d2, d24, d23
+ vqadd.s16 d3, d25, d22
+ vqsub.s16 d4, d25, d22
+ vqsub.s16 d5, d24, d23
+
+ ;transpose the 4x4 block for the second pass
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+ vmov.i16 q14, #0
+
+ vswp d3, d4
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q10, q2, d0[0]
+
+ vqadd.s16 d24, d2, d3 ;a1
+ vqsub.s16 d25, d2, d3 ;b1
+
+ vmov q15, q14
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q10, q10, #1
+
+ vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q10, q10, q2
+
+ vqsub.s16 d22, d6, d21 ;c1
+ vqadd.s16 d23, d7, d20 ;d1
+
+ vqadd.s16 d2, d24, d23
+ vqadd.s16 d3, d25, d22
+ vqsub.s16 d4, d25, d22
+ vqsub.s16 d5, d24, d23
+
+ vst1.16 {q14, q15}, [r0] ;clear the 16 input coefficients
+
+ ;rounding shift: (x + 4) >> 3
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ ;addresses of output rows 1..3
+ add r1, r2, r3
+ add r12, r1, r3
+ add r0, r12, r3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vst1.16 {d2}, [r2]
+ vst1.16 {d3}, [r1]
+ vst1.16 {d4}, [r12]
+ vst1.16 {d5}, [r0]
+
+ bx lr
+
+ ENDP
+
+;-----------------
+ AREA didct4x4_dat, DATA, READWRITE ;read/write by default
+;Data section didct4x4_dat. _didct_coeff_ is one word holding the address of
+;didct_coeff; the code loads that word and then reads the constants through it.
+;didct_coeff holds two words (four 16-bit multipliers) at didct_coeff and didct_coeff+4.
+_didct_coeff_
+ DCD didct_coeff
+didct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;as 16-bit values: 20091, 20091, 35468, 35468
+
+ END
diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/decoder/arm/neon/dequantizeb_neon.asm
new file mode 100644
index 000000000..1bde94607
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm
@@ -0,0 +1,33 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;-----------------------------------------------------------------------
+; vp8_dequantize_b_loop_neon: DQ[i] = Q[i] * DQC[i] for i = 0..15
+; r0 short *Q, 16 quantized coefficients
+; r1 short *DQC 16 dequantization factors
+; r2 short *DQ 16 results
+;
+; Only caller-saved NEON registers are used (q0-q3, q8, q9); q4-q7 are
+; callee-saved under the AAPCS and are not touched.
+;-----------------------------------------------------------------------
+|vp8_dequantize_b_loop_neon| PROC
+ vld1.16 {q0, q1}, [r0]
+ vld1.16 {q2, q3}, [r1]
+
+ vmul.i16 q8, q0, q2
+ vmul.i16 q9, q1, q3
+
+ vst1.16 {q8, q9}, [r2]
+
+ bx lr
+
+ ENDP
+
+ END