diff options
author | Josh Coalson <jcoalson@users.sourceforce.net> | 2007-03-22 03:22:27 +0000 |
---|---|---|
committer | Josh Coalson <jcoalson@users.sourceforce.net> | 2007-03-22 03:22:27 +0000 |
commit | ddddff6a5604da5c7223a075e58ca532d7ad375d (patch) | |
tree | bd4d30cce639f0d815e562b484c18e8a9e4d8fae /src/libFLAC | |
parent | e4b1706412b37e0b74ce8cc712598d5cccdf9d6e (diff) | |
download | flac-ddddff6a5604da5c7223a075e58ca532d7ad375d.tar.gz |
add FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap()
Diffstat (limited to 'src/libFLAC')
-rw-r--r-- | src/libFLAC/Makefile.lite | 1 | ||||
-rw-r--r-- | src/libFLAC/ia32/Makefile.am | 1 | ||||
-rw-r--r-- | src/libFLAC/ia32/bitreader_asm.nasm | 545 | ||||
-rw-r--r-- | src/libFLAC/include/private/bitreader.h | 8 | ||||
-rw-r--r-- | src/libFLAC/stream_decoder.c | 6 |
5 files changed, 560 insertions, 1 deletions
diff --git a/src/libFLAC/Makefile.lite b/src/libFLAC/Makefile.lite index 2d08e710..54188458 100644 --- a/src/libFLAC/Makefile.lite +++ b/src/libFLAC/Makefile.lite @@ -54,6 +54,7 @@ else ifeq ($(SOLARIS_BUILD),yes) else SRCS_NASM = \ + ia32/bitreader_asm.nasm \ ia32/cpu_asm.nasm \ ia32/fixed_asm.nasm \ ia32/lpc_asm.nasm diff --git a/src/libFLAC/ia32/Makefile.am b/src/libFLAC/ia32/Makefile.am index 5f010f6a..476633fe 100644 --- a/src/libFLAC/ia32/Makefile.am +++ b/src/libFLAC/ia32/Makefile.am @@ -37,6 +37,7 @@ STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh noinst_LTLIBRARIES = libFLAC-asm.la libFLAC_asm_la_SOURCES = \ + bitreader_asm.nasm \ cpu_asm.nasm \ fixed_asm.nasm \ lpc_asm.nasm \ diff --git a/src/libFLAC/ia32/bitreader_asm.nasm b/src/libFLAC/ia32/bitreader_asm.nasm new file mode 100644 index 00000000..1f976e65 --- /dev/null +++ b/src/libFLAC/ia32/bitreader_asm.nasm @@ -0,0 +1,545 @@ +; vim:filetype=nasm ts=8 + +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "nasm.h" + + data_section + +extern FLAC__crc16_table ; unsigned FLAC__crc16_table[256]; +extern bitreader_read_from_client_ ; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br); + +cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap + + code_section + + +; ********************************************************************** +; +; FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter) +; +; Some details like assertions and other checking are performed by the caller. 
+ ALIGN 16 +cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap + + ;ASSERT(0 != br); + ;ASSERT(0 != br->buffer); + ; WATCHOUT: code only works if sizeof(brword)*8==32 (i.e. a brword is 32 bits); we can make things much faster with this assertion + ;ASSERT(FLAC__BITS_PER_WORD == 32); + ;ASSERT(parameter < 32); + ; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it + + ;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time + ;; [esp + 16] unsigned parameter + ;; [esp + 12] unsigned nvals + ;; [esp + 8] int vals[] + ;; [esp + 4] FLAC__BitReader *br + mov eax, [esp + 12] ; if(nvals == 0) + test eax, eax + ja .nvals_gt_0 + mov eax, 1 ; return true; + ret + +.nvals_gt_0: + push ebp + push ebx + push esi + push edi + sub esp, 4 + ;; [esp + 36] unsigned parameter + ;; [esp + 32] unsigned nvals + ;; [esp + 28] int vals[] + ;; [esp + 24] FLAC__BitReader *br + ;; [esp] ucbits + mov ebp, [esp + 24] ; ebp <- br (br->buffer is at offset 0, so [ebp] is br->buffer) + mov esi, [ebp + 16] ; esi <- br->consumed_words (aka 'cwords' in the C version) + mov ecx, [ebp + 20] ; ecx <- br->consumed_bits (aka 'cbits' in the C version) + xor edi, edi ; edi <- 0 'uval' + ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + ;; [ebp] br->buffer + ;; [ebp + 8] br->words + ;; [ebp + 12] br->bytes + ;; [ebp + 16] br->consumed_words + ;; [ebp + 20] br->consumed_bits + ;; [ebp + 24] br->read_crc + ;; [ebp + 28] br->crc16_align + + ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; + mov eax, [ebp + 8] ; eax <- br->words + sub eax, esi ; eax <- br->words-cwords + shl eax, 2 ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + add eax, [ebp + 12] ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes + shl eax, 3 ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 + sub eax, ecx ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + mov [esp], eax ; ucbits <- eax + 
+ ALIGN 16 +.val_loop: ; while(1) { + + ; + ; read unary part + ; +.unary_loop: ; while(1) { + ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + cmp esi, [ebp + 8] ; while(cwords < br->words) /* if we've not consumed up to a partial tail word... */ + jae near .c1_next1 +.c1_loop: ; { + mov ebx, [ebp] + mov eax, [ebx + 4*esi] ; b = br->buffer[cwords] + mov edx, eax ; edx = br->buffer[cwords] (saved for later use) + shl eax, cl ; b = br->buffer[cwords] << cbits + test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0) + jz near .c1_next2 ; if(b) { + bsr ebx, eax + not ebx + and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax) + add ecx, ebx ; cbits += i; + add edi, ebx ; uval += i; + add ecx, 1 ; cbits++; /* skip over stop bit */ + test ecx, ~31 + jz near .break1 ; if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */ + ; crc16_update_word_(br, br->buffer[cwords]); + push edi ; [need more registers] + push ecx ; [need more registers] + bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier + mov ecx, [ebp + 28] ; ecx <- br->crc16_align + mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc) + xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero] + mov edi, FLAC__crc16_table + ;; eax (ax) crc a.k.a. br->read_crc + ;; ebx (bl) intermediate result index into FLAC__crc16_table[] + ;; ecx br->crc16_align + ;; edx byteswapped brword to CRC + ;; esi cwords + ;; edi unsigned FLAC__crc16_table[] + ;; ebp br + test ecx, ecx ; switch(br->crc16_align) ... 
+ jz .c0b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] + cmp ecx, 8 + je .c0b1 + shr edx, 16 + cmp ecx, 16 + je .c0b2 + jmp .c0b3 +.c0b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] +.c0b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shr edx, 16 +.c0b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] +.c0b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] + mov [ebp + 24], eax ; br->read_crc <- crc + mov [ebp + 28], dword 0 ; br->crc16_align <- 0 + pop ecx + pop edi + + add esi, 1 ; cwords++; + xor ecx, ecx ; cbits = 0; + ; } + jmp near .break1 ; goto break1; + +.c1_next2: ; } else { + ;; ecx cbits + ;; edx current brword 'b' + ;; esi cwords + ;; edi uval + ;; ebp br + add edi, 32 + sub edi, ecx ; uval += FLAC__BITS_PER_WORD - cbits; + ; crc16_update_word_(br, br->buffer[cwords]); + push edi ; [need more registers] + push ecx ; [need more registers] + bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier + mov ecx, [ebp + 28] ; ecx <- br->crc16_align + mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. 
crc) + xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero] + mov edi, FLAC__crc16_table + ;; eax (ax) crc a.k.a. br->read_crc + ;; ebx (bl) intermediate result index into FLAC__crc16_table[] + ;; ecx br->crc16_align + ;; edx byteswapped brword to CRC + ;; esi cwords + ;; edi unsigned FLAC__crc16_table[] + ;; ebp br + test ecx, ecx ; switch(br->crc16_align) ... + jz .c1b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] + cmp ecx, 8 + je .c1b1 + shr edx, 16 + cmp ecx, 16 + je .c1b2 + jmp .c1b3 +.c1b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] +.c1b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shr edx, 16 +.c1b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] +.c1b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] + mov [ebp + 24], eax ; br->read_crc <- crc + mov [ebp + 28], dword 0 ; br->crc16_align <- 0 + pop ecx + pop edi + + add esi, 1 ; cwords++; + xor ecx, ecx ; cbits = 0; + ; /* didn't find stop bit yet, have to keep going... */ + ; } + + cmp esi, [ebp + 8] ; } while(cwords < br->words) /* if we've not consumed up to a partial tail word... 
*/ + jb near .c1_loop + +.c1_next1: + ; at this point we've eaten up all the whole words; have to try + ; reading through any tail bytes before calling the read callback. + ; this is a repeat of the above logic adjusted for the fact we + ; don't have a whole word. note though if the client is feeding + ; us data a byte at a time (unlikely), br->consumed_bits may not + ; be zero. + ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + mov edx, [ebp + 12] ; edx <- br->bytes + test edx, edx + jz .read1 ; if(br->bytes) { [NOTE: this case is rare so it doesn't have to be all that fast ] + mov ebx, [ebp] + shl edx, 3 ; edx <- const unsigned end = br->bytes * 8; + mov eax, [ebx + 4*esi] ; b = br->buffer[cwords] + xchg edx, ecx ; [edx <- cbits , ecx <- end] + mov ebx, 0xffffffff ; ebx <- FLAC__WORD_ALL_ONES + shr ebx, cl ; ebx <- FLAC__WORD_ALL_ONES >> end + not ebx ; ebx <- ~(FLAC__WORD_ALL_ONES >> end) + xchg edx, ecx ; [edx <- end , ecx <- cbits] + and eax, ebx ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)); + shl eax, cl ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits; + test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0) + jz .c1_next3 ; if(b) { + bsr ebx, eax + not ebx + and ebx, 31 ; ebx = 'i' = # of leading 0 bits in 'b' (eax) + add ecx, ebx ; cbits += i; + add edi, ebx ; uval += i; + add ecx, 1 ; cbits++; /* skip over stop bit */ + jmp short .break1 ; goto break1; +.c1_next3: ; } else { + sub edi, ecx + add edi, edx ; uval += end - cbits; + add ecx, edx ; cbits += end + ; /* didn't find stop bit yet, have to keep going... */ + ; } + ; } +.read1: + ; flush registers and read; bitreader_read_from_client_() does + ; not touch br->consumed_bits at all but we still need to set + ; it in case it fails and we have to return false. 
+ ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + mov [ebp + 16], esi ; br->consumed_words = cwords; + mov [ebp + 20], ecx ; br->consumed_bits = cbits; + push ecx ; /* save */ + push ebp ; /* push br argument */ + call bitreader_read_from_client_ + pop edx ; /* discard, unused */ + pop ecx ; /* restore */ + mov esi, [ebp + 16] ; cwords = br->consumed_words; + ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; + mov ebx, [ebp + 8] ; ebx <- br->words + sub ebx, esi ; ebx <- br->words-cwords + shl ebx, 2 ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + add ebx, [ebp + 12] ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes + shl ebx, 3 ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 + sub ebx, ecx ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + add ebx, edi ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval + ; + uval to offset our count by the # of unary bits already + ; consumed before the read, because we will add these back + ; in all at once at break1 + mov [esp], ebx ; ucbits <- ebx + test eax, eax ; if(!bitreader_read_from_client_(br)) + jnz near .unary_loop + jmp .end ; return false; /* eax (the return value) is already 0 */ + ; } /* end while(1) unary part */ + + ALIGN 16 +.break1: + ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + ;; [esp] ucbits + sub [esp], edi ; ucbits -= uval; + sub dword [esp], 1 ; ucbits--; /* account for stop bit */ + + ; + ; read binary part + ; + mov ebx, [esp + 36] ; ebx <- parameter + test ebx, ebx ; if(parameter) { + jz near .break2 +.read2: + cmp [esp], ebx ; while(ucbits < parameter) { + jae .c2_next1 + ; flush registers and read; bitreader_read_from_client_() does + ; not touch br->consumed_bits at all but we still need to set + ; it in case it fails and we have to return false. 
+ mov [ebp + 16], esi ; br->consumed_words = cwords; + mov [ebp + 20], ecx ; br->consumed_bits = cbits; + push ecx ; /* save */ + push ebp ; /* push br argument */ + call bitreader_read_from_client_ + pop edx ; /* discard, unused */ + pop ecx ; /* restore */ + mov esi, [ebp + 16] ; cwords = br->consumed_words; + ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; + mov edx, [ebp + 8] ; edx <- br->words + sub edx, esi ; edx <- br->words-cwords + shl edx, 2 ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + add edx, [ebp + 12] ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes + shl edx, 3 ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 + sub edx, ecx ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + mov [esp], edx ; ucbits <- edx + test eax, eax ; if(!bitreader_read_from_client_(br)) + jnz .read2 + jmp .end ; return false; /* eax (the return value) is already 0 */ + ; } +.c2_next1: + ;; ebx parameter + ;; ecx cbits + ;; esi cwords + ;; edi uval + ;; ebp br + ;; [esp] ucbits + cmp esi, [ebp + 8] ; if(cwords < br->words) { /* if we've not consumed up to a partial tail word... 
*/ + jae near .c2_next2 + test ecx, ecx ; if(cbits) { + jz near .c2_next3 ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */ + mov eax, 32 + mov edx, [ebp] + sub eax, ecx ; const unsigned n = FLAC__BITS_PER_WORD - cbits; + mov edx, [edx + 4*esi] ; const brword word = br->buffer[cwords]; + cmp ebx, eax ; if(parameter < n) { + jae .c2_next4 + ; uval <<= parameter; + ; uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter); + shl edx, cl + xchg ebx, ecx + shld edi, edx, cl + add ebx, ecx ; cbits += parameter; + xchg ebx, ecx ; ebx <- parameter, ecx <- cbits + jmp .break2 ; goto break2; + ; } +.c2_next4: + ; uval <<= n; + ; uval |= word & (FLAC__WORD_ALL_ONES >> cbits); +%if 1 + rol edx, cl ; @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing + ; @@@@@@OPT: or put parameter in ch instead and free up ebx completely again +%else + shl edx, cl +%endif + xchg eax, ecx + shld edi, edx, cl + xchg eax, ecx +%if 1 + ror edx, cl ; restored. +%else + mov edx, [ebp] + mov edx, [edx + 4*esi] +%endif + ; crc16_update_word_(br, br->buffer[cwords]); + push edi ; [need more registers] + push ebx ; [need more registers] + push ecx ; [need more registers] + push eax ; [need more registers] + bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier + mov ecx, [ebp + 28] ; ecx <- br->crc16_align + mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc) + xor ebx, ebx ; [code from here down assumes and requires that the top 24 bits of ebx stay zero] + mov edi, FLAC__crc16_table + ;; eax (ax) crc a.k.a. br->read_crc + ;; ebx (bl) intermediate result index into FLAC__crc16_table[] + ;; ecx br->crc16_align + ;; edx byteswapped brword to CRC + ;; esi cwords + ;; edi unsigned FLAC__crc16_table[] + ;; ebp br + test ecx, ecx ; switch(br->crc16_align) ... 
+ jz .c2b0 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] + cmp ecx, 8 + je .c2b1 + shr edx, 16 + cmp ecx, 16 + je .c2b2 + jmp .c2b3 +.c2b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] +.c2b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] + shr edx, 16 +.c2b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) + movzx ebx, dl + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] +.c2b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) + movzx ebx, dh + mov cx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] + shl ax, 8 ; ax <- (crc<<8) + xor ax, cx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] + mov [ebp + 24], eax ; br->read_crc <- crc + mov [ebp + 28], dword 0 ; br->crc16_align <- 0 + pop eax + pop ecx + pop ebx + pop edi + add esi, 1 ; cwords++; + mov ecx, ebx + sub ecx, eax ; cbits = parameter - n; + jz .break2 ; if(cbits) { /* parameter > n, i.e. 
if there are still bits left to read, there have to be less than 32 so they will all be in the next word */ + ; uval <<= cbits; + ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits)); + mov eax, [ebp] + mov eax, [eax + 4*esi] + shld edi, eax, cl + ; } + jmp .break2 ; goto break2; +.c2_next3: ; } else { + mov ecx, ebx ; cbits = parameter; + ; uval <<= cbits; + ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits)); + mov eax, [ebp] + mov eax, [eax + 4*esi] + shld edi, eax, cl + jmp .break2 ; goto break2; + ; } +.c2_next2: ; } else { + ; in this case we're starting our read at a partial tail word; + ; the reader has guaranteed that we have at least 'parameter' + ; bits available to read, which makes this case simpler. + ; uval <<= parameter; + ; if(cbits) { + ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */ + ; uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter); + ; cbits += parameter; + ; goto break2; + ; } else { + ; cbits = parameter; + ; uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits); + ; goto break2; + ; } + ; the above is much shorter in assembly: + mov eax, [ebp] + mov eax, [eax + 4*esi] ; eax <- br->buffer[cwords] + shl eax, cl ; eax <- br->buffer[cwords] << cbits + add ecx, ebx ; cbits += parameter + xchg ebx, ecx ; ebx <- cbits, ecx <- parameter + shld edi, eax, cl ; uval <<= parameter <<< 'parameter' bits of tail word + xchg ebx, ecx ; ebx <- parameter, ecx <- cbits + ; } + ; } +.break2: + sub [esp], ebx ; ucbits -= parameter; + + ; + ; compose the value + ; + mov ebx, [esp + 28] ; ebx <- vals + mov edx, edi ; edx <- uval + and edi, 1 ; edi <- uval & 1 + shr edx, 1 ; edx <- uval >> 1 + neg edi ; edi <- -(int)(uval & 1) + xor edx, edi ; edx <- (uval >> 1 ^ -(int)(uval & 1)) + mov [ebx], edx ; *vals <- edx + sub dword [esp + 32], 1 ; --nvals; + jz .finished ; if(nvals == 0) /* jump to finish */ + xor edi, edi ; uval = 0; + add 
dword [esp + 28], 4 ; ++vals + jmp .val_loop ; } + +.finished: + mov [ebp + 16], esi ; br->consumed_words = cwords; + mov [ebp + 20], ecx ; br->consumed_bits = cbits; + mov eax, 1 +.end: + add esp, 4 + pop edi + pop esi + pop ebx + pop ebp + ret + +end + +%ifdef OBJ_FORMAT_elf + section .note.GNU-stack noalloc +%endif diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h index d08d650c..1ad97421 100644 --- a/src/libFLAC/include/private/bitreader.h +++ b/src/libFLAC/include/private/bitreader.h @@ -81,6 +81,13 @@ FLAC__bool FLAC__bitreader_read_byte_block_aligned_no_crc(FLAC__BitReader *br, F FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *val); FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, unsigned parameter); FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter); +#ifndef FLAC__NO_ASM +# ifdef FLAC__CPU_IA32 +# ifdef FLAC__HAS_NASM +FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter); +# endif +# endif +#endif #if 0 /* UNUSED */ FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter); FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, unsigned *val, unsigned parameter); @@ -88,4 +95,5 @@ FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, unsigned *v FLAC__bool FLAC__bitreader_read_utf8_uint32(FLAC__BitReader *br, FLAC__uint32 *val, FLAC__byte *raw, unsigned *rawlen); FLAC__bool FLAC__bitreader_read_utf8_uint64(FLAC__BitReader *br, FLAC__uint64 *val, FLAC__byte *raw, unsigned *rawlen); +FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);//@@@@@@ #endif diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index f0edb111..48636ff6 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -166,6 
+166,7 @@ typedef struct FLAC__StreamDecoderPrivate { void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); /* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit), AND order <= 8: */ void (*local_lpc_restore_signal_16bit_order8)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); + FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter); void *client_data; FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */ FLAC__BitReader *input; @@ -413,12 +414,15 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide; decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal; decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal; + decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block; /* now override with asm where appropriate */ #ifndef FLAC__NO_ASM if(decoder->private_->cpuinfo.use_asm) { #ifdef FLAC__CPU_IA32 FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32); #ifdef FLAC__HAS_NASM + if(decoder->private_->cpuinfo.data.ia32.bswap) + decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap; if(decoder->private_->cpuinfo.data.ia32.mmx) { decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32; decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx; @@ -2729,7 +2733,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, unsigne 
partitioned_rice_contents->parameters[partition] = rice_parameter; if(rice_parameter < FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE_ESCAPE_PARAMETER) { u = (partition_order == 0 || partition > 0)? partition_samples : partition_samples - predictor_order; - if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)) + if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)) return false; /* read_callback_ sets the state for us */ sample += u; } |