summaryrefslogtreecommitdiff
path: root/src/libFLAC
diff options
context:
space:
mode:
authorJosh Coalson <jcoalson@users.sourceforce.net>2007-03-22 03:22:27 +0000
committerJosh Coalson <jcoalson@users.sourceforce.net>2007-03-22 03:22:27 +0000
commitddddff6a5604da5c7223a075e58ca532d7ad375d (patch)
treebd4d30cce639f0d815e562b484c18e8a9e4d8fae /src/libFLAC
parente4b1706412b37e0b74ce8cc712598d5cccdf9d6e (diff)
downloadflac-ddddff6a5604da5c7223a075e58ca532d7ad375d.tar.gz
add FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap()
Diffstat (limited to 'src/libFLAC')
-rw-r--r--src/libFLAC/Makefile.lite1
-rw-r--r--src/libFLAC/ia32/Makefile.am1
-rw-r--r--src/libFLAC/ia32/bitreader_asm.nasm545
-rw-r--r--src/libFLAC/include/private/bitreader.h8
-rw-r--r--src/libFLAC/stream_decoder.c6
5 files changed, 560 insertions, 1 deletions
diff --git a/src/libFLAC/Makefile.lite b/src/libFLAC/Makefile.lite
index 2d08e710..54188458 100644
--- a/src/libFLAC/Makefile.lite
+++ b/src/libFLAC/Makefile.lite
@@ -54,6 +54,7 @@ else
ifeq ($(SOLARIS_BUILD),yes)
else
SRCS_NASM = \
+ ia32/bitreader_asm.nasm \
ia32/cpu_asm.nasm \
ia32/fixed_asm.nasm \
ia32/lpc_asm.nasm
diff --git a/src/libFLAC/ia32/Makefile.am b/src/libFLAC/ia32/Makefile.am
index 5f010f6a..476633fe 100644
--- a/src/libFLAC/ia32/Makefile.am
+++ b/src/libFLAC/ia32/Makefile.am
@@ -37,6 +37,7 @@ STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh
noinst_LTLIBRARIES = libFLAC-asm.la
libFLAC_asm_la_SOURCES = \
+ bitreader_asm.nasm \
cpu_asm.nasm \
fixed_asm.nasm \
lpc_asm.nasm \
diff --git a/src/libFLAC/ia32/bitreader_asm.nasm b/src/libFLAC/ia32/bitreader_asm.nasm
new file mode 100644
index 00000000..1f976e65
--- /dev/null
+++ b/src/libFLAC/ia32/bitreader_asm.nasm
@@ -0,0 +1,545 @@
+; vim:filetype=nasm ts=8
+
+; libFLAC - Free Lossless Audio Codec library
+; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; - Neither the name of the Xiph.org Foundation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "nasm.h"
+
+ data_section
+
+extern FLAC__crc16_table ; unsigned FLAC__crc16_table[256];
+extern bitreader_read_from_client_ ; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
+
+cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
+
+ code_section
+
+
+; **********************************************************************
+;
+; FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
+;
+; Some details like assertions and other checking are performed by the caller.
+ ALIGN 16
+cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
+ ; Decodes 'nvals' values from the bit reader 'br' into vals[]; returns 1 in eax on success, 0 if a refill via bitreader_read_from_client_() fails.
+ ;ASSERT(0 != br);
+ ;ASSERT(0 != br->buffer);
+ ; WATCHOUT: code only works if a brword is 32 bits wide (see the FLAC__BITS_PER_WORD assert below); we can make things much faster with this assertion
+ ;ASSERT(FLAC__BITS_PER_WORD == 32);
+ ;ASSERT(parameter < 32);
+ ; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it
+
+ ;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
+ ;; [esp + 16] unsigned parameter
+ ;; [esp + 12] unsigned nvals
+ ;; [esp + 8] int vals[]
+ ;; [esp + 4] FLAC__BitReader *br
+ mov eax, [esp + 12] ; if(nvals == 0)
+ test eax, eax
+ ja .nvals_gt_0 ; [test clears CF, so ja is taken exactly when nvals != 0]
+ mov eax, 1 ; return true;
+ ret
+
+.nvals_gt_0:
+ push ebp ; save the four registers this routine uses as working state (popped again at .end)
+ push ebx
+ push esi
+ push edi
+ sub esp, 4 ; reserve one dword on the stack for the local 'ucbits'
+ ;; [esp + 36] unsigned parameter
+ ;; [esp + 32] unsigned nvals
+ ;; [esp + 28] int vals[]
+ ;; [esp + 24] FLAC__BitReader *br
+ ;; [esp] ucbits
+ mov ebp, [esp + 24] ; ebp <- br (br->buffer is the dword at [ebp], i.e. offset 0, see key below)
+ mov esi, [ebp + 16] ; esi <- br->consumed_words (aka 'cwords' in the C version)
+ mov ecx, [ebp + 20] ; ecx <- br->consumed_bits (aka 'cbits' in the C version)
+ xor edi, edi ; edi <- 0 'uval'
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ ;; [ebp] br->buffer
+ ;; [ebp + 8] br->words
+ ;; [ebp + 12] br->bytes
+ ;; [ebp + 16] br->consumed_words
+ ;; [ebp + 20] br->consumed_bits
+ ;; [ebp + 24] br->read_crc
+ ;; [ebp + 28] br->crc16_align
+
+ ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; [number of buffered bits not yet consumed]
+ mov eax, [ebp + 8] ; eax <- br->words
+ sub eax, esi ; eax <- br->words-cwords
+ shl eax, 2 ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
+ add eax, [ebp + 12] ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
+ shl eax, 3 ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
+ sub eax, ecx ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
+ mov [esp], eax ; ucbits <- eax
+
+ ALIGN 16
+.val_loop: ; while(1) {
+
+ ;
+ ; read unary part
+ ;
+.unary_loop: ; while(1) {
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ cmp esi, [ebp + 8] ; while(cwords < br->words) /* if we've not consumed up to a partial tail word... */
+ jae near .c1_next1
+.c1_loop: ; {
+ mov ebx, [ebp]
+ mov eax, [ebx + 4*esi] ; b = br->buffer[cwords]
+ mov edx, eax ; edx = br->buffer[cwords] (saved for later use)
+ shl eax, cl ; b = br->buffer[cwords] << cbits
+ test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
+ jz near .c1_next2 ; if(b) {
+ bsr ebx, eax
+ not ebx
+ and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax)
+ add ecx, ebx ; cbits += i;
+ add edi, ebx ; uval += i;
+ add ecx, 1 ; cbits++; /* skip over stop bit */
+ test ecx, ~31
+ jz near .break1 ; if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
+ ; crc16_update_word_(br, br->buffer[cwords]);
+ push edi ; [need more registers]
+ push ecx ; [need more registers]
+ bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
+ mov ecx, [ebp + 28] ; ecx <- br->crc16_align
+ mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
+ xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero]
+ mov edi, FLAC__crc16_table
+ ;; eax (ax) crc a.k.a. br->read_crc
+ ;; ebx (bl) intermediate result index into FLAC__crc16_table[]
+ ;; ecx br->crc16_align
+ ;; edx byteswapped brword to CRC
+ ;; esi cwords
+ ;; edi unsigned FLAC__crc16_table[]
+ ;; ebp br
+ test ecx, ecx ; switch(br->crc16_align) ...
+ jz .c0b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
+ cmp ecx, 8
+ je .c0b1
+ shr edx, 16
+ cmp ecx, 16
+ je .c0b2
+ jmp .c0b3
+.c0b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
+.c0b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shr edx, 16
+.c0b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+.c0b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ mov [ebp + 24], eax ; br->read_crc <- crc
+ mov [ebp + 28], dword 0 ; br->crc16_align <- 0
+ pop ecx
+ pop edi
+
+ add esi, 1 ; cwords++;
+ xor ecx, ecx ; cbits = 0;
+ ; }
+ jmp near .break1 ; goto break1;
+
+.c1_next2: ; } else {
+ ;; ecx cbits
+ ;; edx current brword 'b'
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ add edi, 32
+ sub edi, ecx ; uval += FLAC__BITS_PER_WORD - cbits;
+ ; crc16_update_word_(br, br->buffer[cwords]);
+ push edi ; [need more registers]
+ push ecx ; [need more registers]
+ bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
+ mov ecx, [ebp + 28] ; ecx <- br->crc16_align
+ mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
+ xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero]
+ mov edi, FLAC__crc16_table
+ ;; eax (ax) crc a.k.a. br->read_crc
+ ;; ebx (bl) intermediate result index into FLAC__crc16_table[]
+ ;; ecx br->crc16_align
+ ;; edx byteswapped brword to CRC
+ ;; esi cwords
+ ;; edi unsigned FLAC__crc16_table[]
+ ;; ebp br
+ test ecx, ecx ; switch(br->crc16_align) ...
+ jz .c1b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
+ cmp ecx, 8
+ je .c1b1
+ shr edx, 16
+ cmp ecx, 16
+ je .c1b2
+ jmp .c1b3
+.c1b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
+.c1b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shr edx, 16
+.c1b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+.c1b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ mov [ebp + 24], eax ; br->read_crc <- crc
+ mov [ebp + 28], dword 0 ; br->crc16_align <- 0
+ pop ecx
+ pop edi
+
+ add esi, 1 ; cwords++;
+ xor ecx, ecx ; cbits = 0;
+ ; /* didn't find stop bit yet, have to keep going... */
+ ; }
+
+ cmp esi, [ebp + 8] ; } while(cwords < br->words) /* if we've not consumed up to a partial tail word... */
+ jb near .c1_loop
+
+.c1_next1:
+ ; at this point we've eaten up all the whole words; scan the valid
+ ; bits of the partial tail word (if any) for the unary stop bit
+ ; before calling the read callback. this is a repeat of the above
+ ; logic adjusted for the fact we don't have a whole word. note that
+ ; if the client is feeding us data a byte at a time (unlikely),
+ ; cbits may be nonzero here, so it must never be assumed to be 0.
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ mov edx, [ebp + 12] ; edx <- br->bytes
+ test edx, edx
+ jz .read1 ; if(br->bytes) { [NOTE: this case is rare so it doesn't have to be all that fast ]
+ mov ebx, [ebp]
+ shl edx, 3 ; edx <- const unsigned end = br->bytes * 8; [number of valid bits in the tail word]
+ mov eax, [ebx + 4*esi] ; b = br->buffer[cwords]
+ xchg edx, ecx ; [edx <- cbits , ecx <- end] (variable shifts need the count in cl)
+ mov ebx, 0xffffffff ; ebx <- FLAC__WORD_ALL_ONES
+ shr ebx, cl ; ebx <- FLAC__WORD_ALL_ONES >> end
+ not ebx ; ebx <- ~(FLAC__WORD_ALL_ONES >> end) [mask keeping only the top 'end' bits]
+ xchg edx, ecx ; [edx <- end , ecx <- cbits]
+ and eax, ebx ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
+ shl eax, cl ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
+ test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
+ jz .c1_next3 ; if(b) {
+ bsr ebx, eax
+ not ebx
+ and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax)
+ add ecx, ebx ; cbits += i;
+ add edi, ebx ; uval += i;
+ add ecx, 1 ; cbits++; /* skip over stop bit */
+ jmp short .break1 ; goto break1;
+.c1_next3: ; } else {
+ sub edi, ecx
+ add edi, edx ; uval += end - cbits;
+ mov ecx, edx ; cbits = end; [bits cbits..end-1 were all zero so we now sit exactly at 'end'; 'cbits += end' would overshoot whenever cbits != 0]
+ ; /* didn't find stop bit yet, have to keep going... */
+ ; }
+ ; }
+.read1:
+ ; flush registers and read; bitreader_read_from_client_() does
+ ; not touch br->consumed_bits at all but we still need to set
+ ; it in case it fails and we have to return false.
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ mov [ebp + 16], esi ; br->consumed_words = cwords;
+ mov [ebp + 20], ecx ; br->consumed_bits = cbits;
+ push ecx ; /* save */
+ push ebp ; /* push br argument */
+ call bitreader_read_from_client_
+ pop edx ; /* discard, unused */
+ pop ecx ; /* restore */
+ mov esi, [ebp + 16] ; cwords = br->consumed_words;
+ ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+ mov ebx, [ebp + 8] ; ebx <- br->words
+ sub ebx, esi ; ebx <- br->words-cwords
+ shl ebx, 2 ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
+ add ebx, [ebp + 12] ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
+ shl ebx, 3 ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
+ sub ebx, ecx ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
+ add ebx, edi ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
+ ; + uval to offset our count by the # of unary bits already
+ ; consumed before the read, because we will add these back
+ ; in all at once at break1
+ mov [esp], ebx ; ucbits <- ebx
+ test eax, eax ; if(!bitreader_read_from_client_(br))
+ jnz near .unary_loop
+ jmp .end ; return false; /* eax (the return value) is already 0 */
+ ; } /* end while(1) unary part */
+
+ ALIGN 16
+.break1:
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ ;; [esp] ucbits
+ sub [esp], edi ; ucbits -= uval;
+ sub dword [esp], 1 ; ucbits--; /* account for stop bit */
+
+ ;
+ ; read binary part
+ ;
+ mov ebx, [esp + 36] ; ebx <- parameter
+ test ebx, ebx ; if(parameter) {
+ jz near .break2
+.read2:
+ cmp [esp], ebx ; while(ucbits < parameter) {
+ jae .c2_next1
+ ; flush registers and read; bitreader_read_from_client_() does
+ ; not touch br->consumed_bits at all but we still need to set
+ ; it in case it fails and we have to return false.
+ mov [ebp + 16], esi ; br->consumed_words = cwords;
+ mov [ebp + 20], ecx ; br->consumed_bits = cbits;
+ push ecx ; /* save */
+ push ebp ; /* push br argument */
+ call bitreader_read_from_client_
+ pop edx ; /* discard, unused */
+ pop ecx ; /* restore */
+ mov esi, [ebp + 16] ; cwords = br->consumed_words;
+ ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+ mov edx, [ebp + 8] ; edx <- br->words
+ sub edx, esi ; edx <- br->words-cwords
+ shl edx, 2 ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
+ add edx, [ebp + 12] ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
+ shl edx, 3 ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
+ sub edx, ecx ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
+ mov [esp], edx ; ucbits <- edx
+ test eax, eax ; if(!bitreader_read_from_client_(br))
+ jnz .read2
+ jmp .end ; return false; /* eax (the return value) is already 0 */
+ ; }
+.c2_next1:
+ ;; ebx parameter
+ ;; ecx cbits
+ ;; esi cwords
+ ;; edi uval
+ ;; ebp br
+ ;; [esp] ucbits
+ cmp esi, [ebp + 8] ; if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+ jae near .c2_next2
+ test ecx, ecx ; if(cbits) {
+ jz near .c2_next3 ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+ mov eax, 32
+ mov edx, [ebp]
+ sub eax, ecx ; const unsigned n = FLAC__BITS_PER_WORD - cbits;
+ mov edx, [edx + 4*esi] ; const brword word = br->buffer[cwords];
+ cmp ebx, eax ; if(parameter < n) {
+ jae .c2_next4
+ ; uval <<= parameter;
+ ; uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
+ shl edx, cl
+ xchg ebx, ecx
+ shld edi, edx, cl
+ add ebx, ecx ; cbits += parameter;
+ xchg ebx, ecx ; ebx <- parameter, ecx <- cbits
+ jmp .break2 ; goto break2;
+ ; }
+.c2_next4:
+ ; uval <<= n;
+ ; uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
+%if 1
+ rol edx, cl ; @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
+ ; @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
+%else
+ shl edx, cl
+%endif
+ xchg eax, ecx
+ shld edi, edx, cl
+ xchg eax, ecx
+%if 1
+ ror edx, cl ; restored.
+%else
+ mov edx, [ebp]
+ mov edx, [edx + 4*esi]
+%endif
+ ; crc16_update_word_(br, br->buffer[cwords]);
+ push edi ; [need more registers]
+ push ebx ; [need more registers]
+ push ecx ; [need more registers]
+ push eax ; [need more registers]
+ bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
+ mov ecx, [ebp + 28] ; ecx <- br->crc16_align
+ mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc)
+ xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero]
+ mov edi, FLAC__crc16_table
+ ;; eax (ax) crc a.k.a. br->read_crc
+ ;; ebx (bl) intermediate result index into FLAC__crc16_table[]
+ ;; ecx br->crc16_align
+ ;; edx byteswapped brword to CRC
+ ;; esi cwords
+ ;; edi unsigned FLAC__crc16_table[]
+ ;; ebp br
+ test ecx, ecx ; switch(br->crc16_align) ...
+ jz .c2b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case]
+ cmp ecx, 8
+ je .c2b1
+ shr edx, 16
+ cmp ecx, 16
+ je .c2b2
+ jmp .c2b3
+.c2b0: xor dl, ah ; dl <- (crc>>8)^(word>>24)
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
+.c2b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff))
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
+ shr edx, 16
+.c2b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff))
+ movzx ebx, dl
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
+.c2b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff)
+ movzx ebx, dh
+ mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ shl ax, 8 ; ax <- (crc<<8)
+ xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
+ mov [ebp + 24], eax ; br->read_crc <- crc
+ mov [ebp + 28], dword 0 ; br->crc16_align <- 0
+ pop eax
+ pop ecx
+ pop ebx
+ pop edi
+ add esi, 1 ; cwords++;
+ mov ecx, ebx
+ sub ecx, eax ; cbits = parameter - n;
+ jz .break2 ; if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
+ ; uval <<= cbits;
+ ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
+ mov eax, [ebp]
+ mov eax, [eax + 4*esi]
+ shld edi, eax, cl
+ ; }
+ jmp .break2 ; goto break2;
+.c2_next3: ; } else {
+ mov ecx, ebx ; cbits = parameter;
+ ; uval <<= cbits;
+ ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
+ mov eax, [ebp]
+ mov eax, [eax + 4*esi]
+ shld edi, eax, cl
+ jmp .break2 ; goto break2;
+ ; }
+.c2_next2: ; } else {
+ ; in this case we're starting our read at a partial tail word;
+ ; the reader has guaranteed that we have at least 'parameter'
+ ; bits available to read, which makes this case simpler.
+ ; uval <<= parameter;
+ ; if(cbits) {
+ ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+ ; uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
+ ; cbits += parameter;
+ ; goto break2;
+ ; } else {
+ ; cbits = parameter;
+ ; uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
+ ; goto break2;
+ ; }
+ ; the above is much shorter in assembly:
+ mov eax, [ebp]
+ mov eax, [eax + 4*esi] ; eax <- br->buffer[cwords]
+ shl eax, cl ; eax <- br->buffer[cwords] << cbits
+ add ecx, ebx ; cbits += parameter
+ xchg ebx, ecx ; ebx <- cbits, ecx <- parameter
+ shld edi, eax, cl ; uval <<= parameter <<< 'parameter' bits of tail word
+ xchg ebx, ecx ; ebx <- parameter, ecx <- cbits
+ ; }
+ ; }
+.break2:
+ sub [esp], ebx ; ucbits -= parameter;
+
+ ;
+ ; compose the value
+ ; (folds the unsigned code back to signed: even uval -> uval/2, odd uval -> -(uval/2) - 1)
+ mov ebx, [esp + 28] ; ebx <- vals
+ mov edx, edi ; edx <- uval
+ and edi, 1 ; edi <- uval & 1
+ shr edx, 1 ; edx <- uval >> 1
+ neg edi ; edi <- -(int)(uval & 1) [0 or all ones]
+ xor edx, edi ; edx <- (uval >> 1 ^ -(int)(uval & 1))
+ mov [ebx], edx ; *vals <- edx
+ sub dword [esp + 32], 1 ; --nvals; [counts down directly in the argument slot on the stack]
+ jz .finished ; if(nvals == 0) /* jump to finish */
+ xor edi, edi ; uval = 0;
+ add dword [esp + 28], 4 ; ++vals [advances the argument slot by one 4-byte int]
+ jmp .val_loop ; }
+
+.finished:
+ mov [ebp + 16], esi ; br->consumed_words = cwords;
+ mov [ebp + 20], ecx ; br->consumed_bits = cbits;
+ mov eax, 1 ; return true;
+.end: ; common exit: eax is 1 when falling in from .finished, 0 when a failed read jumped here
+ add esp, 4 ; release the 'ucbits' local
+ pop edi ; restore the registers pushed at .nvals_gt_0, in reverse order
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+end
+
+%ifdef OBJ_FORMAT_elf
+ section .note.GNU-stack noalloc
+%endif
diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h
index d08d650c..1ad97421 100644
--- a/src/libFLAC/include/private/bitreader.h
+++ b/src/libFLAC/include/private/bitreader.h
@@ -81,6 +81,13 @@ FLAC__bool FLAC__bitreader_read_byte_block_aligned_no_crc(FLAC__BitReader *br, F
FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *val);
FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, unsigned parameter);
FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
+#ifndef FLAC__NO_ASM
+# ifdef FLAC__CPU_IA32
+# ifdef FLAC__HAS_NASM
+FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
+# endif
+# endif
+#endif
#if 0 /* UNUSED */
FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter);
FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, unsigned *val, unsigned parameter);
@@ -88,4 +95,5 @@ FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, unsigned *v
FLAC__bool FLAC__bitreader_read_utf8_uint32(FLAC__BitReader *br, FLAC__uint32 *val, FLAC__byte *raw, unsigned *rawlen);
FLAC__bool FLAC__bitreader_read_utf8_uint64(FLAC__BitReader *br, FLAC__uint64 *val, FLAC__byte *raw, unsigned *rawlen);
+FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);//@@@@@@
#endif
diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c
index f0edb111..48636ff6 100644
--- a/src/libFLAC/stream_decoder.c
+++ b/src/libFLAC/stream_decoder.c
@@ -166,6 +166,7 @@ typedef struct FLAC__StreamDecoderPrivate {
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit), AND order <= 8: */
void (*local_lpc_restore_signal_16bit_order8)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+ FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
void *client_data;
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
FLAC__BitReader *input;
@@ -413,12 +414,15 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal;
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block;
/* now override with asm where appropriate */
#ifndef FLAC__NO_ASM
if(decoder->private_->cpuinfo.use_asm) {
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
+ if(decoder->private_->cpuinfo.data.ia32.bswap)
+ decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap;
if(decoder->private_->cpuinfo.data.ia32.mmx) {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
@@ -2729,7 +2733,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, unsigne
partitioned_rice_contents->parameters[partition] = rice_parameter;
if(rice_parameter < FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE_ESCAPE_PARAMETER) {
u = (partition_order == 0 || partition > 0)? partition_samples : partition_samples - predictor_order;
- if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
+ if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
return false; /* read_callback_ sets the state for us */
sample += u;
}