; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001,2002  Josh Coalson
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA  02111-1307, USA.

; [CR] is a note to flag that the instruction can be easily reordered

%include "nasm.h"

	; NOTE(review): data_section, cglobal and code_section are not defined in
	; this file; presumably they are macros supplied by nasm.h - confirm there.
	data_section

; Declare the routine's entry label (assumed to export it for the linker -
; verify against the cglobal definition in nasm.h).
cglobal FLAC__lpc_compute_autocorrelation_asm

	code_section

; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;	FLAC__real d;
;	unsigned sample, coeff;
;	const unsigned limit = data_len - lag;
;
;	assert(lag > 0);
;	assert(lag <= data_len);
;
;	for(coeff = 0; coeff < lag; coeff++)
;		autoc[coeff] = 0.0;
;	for(sample = 0; sample <= limit; sample++){
;		d = data[sample];
;		for(coeff = 0; coeff < lag; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
;	for(; sample < data_len; sample++){
;		d = data[sample];
;		for(coeff = 0; coeff < data_len - sample; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
; }
;
FLAC__lpc_compute_autocorrelation_asm:
	; Stack arguments (dword slots), addressed through ebp after the prologue:
	;   [ebp]      = data       [ebp + 4]  = data_len
	;   [ebp + 8]  = lag        [ebp + 12] = autoc
	; ebp is aimed at the first argument: once ebp has been pushed,
	; [esp] holds the saved ebp and [esp + 4] the return address.
	push	ebp
	lea	ebp, [esp + 8]			; ebp -> first argument (data)
	push	ebx				; ebx/esi/edi are popped again at .end
	push	esi
	push	edi

	mov	esi, [ebp]			; esi = data (read cursor)
	mov	ecx, [ebp + 4]			; ecx = data_len (loop counter)
	mov	edx, [ebp + 8]			; edx = lag (selects the code path)
	mov	edi, [ebp + 12]			; edi = autoc (output array)

	; lag values 1..6 each get a dedicated unrolled path; unsigned compare
	cmp	edx, 1
	ja	short .lag_above_1
.lag_eq_1:
	; Only autoc[0] is wanted: the sum of the squares of all samples,
	; kept in an x87 register until the final store.
	fldz					; st0 = running sum for autoc[0]
	ALIGN 16
.lag_1_next_sample:
	fld	dword [esi]			; st0 = data[sample], st1 = sum
	fmul	st0, st0			; st0 = data[sample] squared
	faddp	st1, st0			; sum += square, pop back to st0 = sum
	add	esi, byte 4			; step to the next sample
	dec	ecx				; one iteration per sample
	jnz	.lag_1_next_sample
	fstp	dword [edi]			; autoc[0] = sum, FPU stack empty again
	jmp	.end

.lag_above_1:
	cmp	edx, 2
	ja	short .lag_above_2
.lag_eq_2:
	; lag == 2: accumulate autoc[0] and autoc[1] on the FPU stack.
	; The current sample stays in st0 from one iteration to the next, so each
	; data element is loaded from memory only once. ecx is decremented once
	; before the loop, giving data_len-1 iterations (one per adjacent pair);
	; the square of the final sample is added after the loop.
	fldz					; will accumulate autoc[1]
	dec	ecx				; ecx = data_len - 1 loop iterations
	fldz					; will accumulate autoc[0]
	fld	dword [esi]			; st0 = data[0], st1 = autoc[0], st2 = autoc[1]
	ALIGN 16
.lag_2_loop:
	add	esi, byte 4			; [CR] sample++
	fld	st0				; duplicate current sample d
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi]			; st0 = next sample, st1 = d
	fmul	st1, st0			; st1 = d * next sample
	fxch					; product to st0; next sample becomes the kept value
	faddp	st3, st0			; add to autoc[1]
	dec	ecx
	jnz	.lag_2_loop
	; clean up the leftovers
	fmul	st0, st0			; square of the last sample
	faddp	st1, st0			; add to autoc[0]
	fstp	dword [edi]			; store autoc[0]
	fstp	dword [edi + 4]			; store autoc[1], FPU stack empty again
	jmp	.end

.lag_above_2:
	cmp	edx, 3
	ja	short .lag_above_3
.lag_eq_3:
	; lag == 3: autoc[0..2] are accumulated on the FPU stack
	; (st0 = autoc[0], st1 = autoc[1], st2 = autoc[2] between iterations).
	; The two dec instructions leave ecx = data_len - 2, the number of samples
	; for which all three products data[s]*data[s+0..2] stay inside data[].
	; The last two samples are handled by the straight-line code after the loop.
	fldz					; will accumulate autoc[2]
	dec	ecx
	fldz					; will accumulate autoc[1]
	dec	ecx
	fldz					; will accumulate autoc[0]
	ALIGN 16
.lag_3_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	st0				; keep a copy of d for the later products
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = data[sample+1], st1 = d
	fmul	st0, st1			; st0 = d*data[sample+1]
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]			; st0 = data[sample+2], st1 = d
	fmulp	st1, st0			; st0 = d*data[sample+2]; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[2]
	dec	ecx
	jnz	.lag_3_loop
	; clean up the leftovers
	; second-to-last sample: lags 0 and 1 only
	fld	dword [esi]			; st0 = d
	fld	st0
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = last sample, st1 = d
	fmul	st1, st0			; st1 = d * last sample
	fxch					; product to st0, last sample to st1
	faddp	st3, st0			; add to autoc[1]
	; last sample (already in st0): lag 0 only
	fmul	st0, st0
	faddp	st1, st0			; add to autoc[0]
	fstp	dword [edi]			; store autoc[0]
	fstp	dword [edi + 4]			; store autoc[1]
	fstp	dword [edi + 8]			; store autoc[2], FPU stack empty again
	jmp	.end

.lag_above_3:
	cmp	edx, 4
	ja	near .lag_above_4
.lag_eq_4:
	; lag == 4: autoc[0..3] are accumulated on the FPU stack
	; (st0 = autoc[0] ... st3 = autoc[3] between iterations).
	; The three dec instructions leave ecx = data_len - 3 full iterations;
	; the last three samples are handled by the straight-line tail.
	fldz					; will accumulate autoc[3]
	dec	ecx
	fldz					; will accumulate autoc[2]
	dec	ecx
	fldz					; will accumulate autoc[1]
	dec	ecx
	fldz					; will accumulate autoc[0]
	ALIGN 16
.lag_4_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	st0				; keep a copy of d for the later products
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1			; st0 = d*data[sample+1]
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1			; st0 = d*data[sample+2]
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmulp	st1, st0			; st0 = d*data[sample+3]; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[3]
	dec	ecx
	jnz	.lag_4_loop
	; clean up the leftovers
	; third-to-last sample: lags 0..2
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[2]
	; second-to-last sample: lags 0 and 1
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = last sample, st1 = d
	fmul	st1, st0			; st1 = d * last sample
	fxch					; product to st0, last sample to st1
	faddp	st3, st0			; add to autoc[1]
	; last sample (already in st0): lag 0 only
	fmul	st0, st0
	faddp	st1, st0			; add to autoc[0]
	fstp	dword [edi]			; store autoc[0]
	fstp	dword [edi + 4]			; store autoc[1]
	fstp	dword [edi + 8]			; store autoc[2]
	fstp	dword [edi + 12]		; store autoc[3], FPU stack empty again
	jmp	.end

.lag_above_4:
	cmp	edx, 5
	ja	near .lag_above_5
.lag_eq_5:
	; lag == 5: autoc[0..4] are accumulated on the FPU stack
	; (st0 = autoc[0] ... st4 = autoc[4] between iterations).
	; ecx = data_len - 4 full iterations; the last four samples are handled
	; by the straight-line tail, each stage covering one fewer lag.
	fldz					; will accumulate autoc[4]
	fldz					; will accumulate autoc[3]
	fldz					; will accumulate autoc[2]
	fldz					; will accumulate autoc[1]
	fldz					; will accumulate autoc[0]
	sub	ecx, byte 4
	ALIGN 16
.lag_5_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	st0				; keep a copy of d for the later products
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[3]
	fld	dword [esi + 16]
	fmulp	st1, st0			; st0 = d*data[sample+4]; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; add to autoc[4]
	dec	ecx
	jnz	.lag_5_loop
	; clean up the leftovers
	; fourth-to-last sample: lags 0..3
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[3]
	; third-to-last sample: lags 0..2
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[2]
	; second-to-last sample: lags 0 and 1
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = last sample, st1 = d
	fmul	st1, st0			; st1 = d * last sample
	fxch					; product to st0, last sample to st1
	faddp	st3, st0			; add to autoc[1]
	; last sample (already in st0): lag 0 only
	fmul	st0, st0
	faddp	st1, st0			; add to autoc[0]
	fstp	dword [edi]			; store autoc[0]
	fstp	dword [edi + 4]			; store autoc[1]
	fstp	dword [edi + 8]			; store autoc[2]
	fstp	dword [edi + 12]		; store autoc[3]
	fstp	dword [edi + 16]		; store autoc[4], FPU stack empty again
	jmp	.end

.lag_above_5:
	cmp	edx, 6
	ja	.lag_above_6
.lag_eq_6:
	; lag == 6: autoc[0..5] are accumulated on the FPU stack
	; (st0 = autoc[0] ... st5 = autoc[5] between iterations). With the sample
	; copy and one product in flight this uses all eight x87 registers.
	; ecx = data_len - 5 full iterations; the last five samples are handled
	; by the straight-line tail, each stage covering one fewer lag.
	fldz					; will accumulate autoc[5]
	fldz					; will accumulate autoc[4]
	fldz					; will accumulate autoc[3]
	fldz					; will accumulate autoc[2]
	fldz					; will accumulate autoc[1]
	fldz					; will accumulate autoc[0]
	sub	ecx, byte 5
	ALIGN 16
.lag_6_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	st0				; keep a copy of d for the later products
	fmul	st0, st0			; st0 = d*d
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[3]
	fld	dword [esi + 16]
	fmul	st0, st1
	faddp	st6, st0			; add to autoc[4]
	fld	dword [esi + 20]
	fmulp	st1, st0			; st0 = d*data[sample+5]; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st6, st0			; add to autoc[5]
	dec	ecx
	jnz	.lag_6_loop
	; clean up the leftovers
	; fifth-to-last sample: lags 0..4
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[3]
	fld	dword [esi + 16]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; add to autoc[4]
	; fourth-to-last sample: lags 0..3
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[3]
	; third-to-last sample: lags 0..2
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[2]
	; second-to-last sample: lags 0 and 1
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = last sample, st1 = d
	fmul	st1, st0			; st1 = d * last sample
	fxch					; product to st0, last sample to st1
	faddp	st3, st0			; add to autoc[1]
	; last sample (already in st0): lag 0 only
	fmul	st0, st0
	faddp	st1, st0			; add to autoc[0]
	fstp	dword [edi]			; store autoc[0]
	fstp	dword [edi + 4]			; store autoc[1]
	fstp	dword [edi + 8]			; store autoc[2]
	fstp	dword [edi + 12]		; store autoc[3]
	fstp	dword [edi + 16]		; store autoc[4]
	fstp	dword [edi + 20]		; store autoc[5], FPU stack empty again
	jmp	.end

.lag_above_6:
	; Generic path for lag > 6: a direct translation of the reference C loops.
	; On entry: esi = data, edi = autoc, edx = lag (loaded in the prologue).
	; autoc[] is updated in memory as single-precision values on every step.
	; Register roles inside the loops:
	;   eax = coeff (inner) / sample (outer, saved on the stack meanwhile)
	;   ebx = sample + coeff      ecx = outer iterations left
	;   edx = inner iterations left
	;
	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	; autoc[] elements are dwords (see the [edi + eax * 4] accesses below), so
	; exactly lag dwords are cleared; clearing more would write past autoc[lag-1].
	mov	ecx, edx			; ecx = lag = # of dwords of 0 to write
	xor	eax, eax			; all-zero bits == +0.0 in single precision
	rep	stosd				; advances edi (relies on DF being clear, as is usual at function entry)
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	edi, [ebp + 12]			; edi == autoc (stosd moved it, so reload)
	;	const unsigned limit = data_len - lag;
	sub	ecx, edx
	inc	ecx				; we are looping <= limit so we add one to the counter
	;	for(sample = 0; sample <= limit; sample++){
	;		d = data[sample];
	;		for(coeff = 0; coeff < lag; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	xor	eax, eax			; eax == sample <- 0
	ALIGN 16
.outer_loop:
	push	eax				; save sample
	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
	mov	ebx, eax			; ebx == sample+coeff <- sample
	mov	edx, [ebp + 8]			; edx <- lag
	xor	eax, eax			; eax == coeff <- 0
	ALIGN 16
.inner_loop:
	fld	st0				; ST = d d
	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
	inc	ebx				; (sample+coeff)++
	inc	eax				; coeff++
	dec	edx
	jnz	.inner_loop
	pop	eax				; restore sample
	fstp	st0				; pop d, ST = empty
	inc	eax				; sample++
	dec	ecx				; dec/jnz instead of the slow legacy LOOP instruction
	jnz	.outer_loop
	;	for(; sample < data_len; sample++){
	;		d = data[sample];
	;		for(coeff = 0; coeff < data_len - sample; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	; eax still holds sample (== limit + 1); lag - 1 samples remain
	mov	ecx, [ebp + 8]			; ecx <- lag
	dec	ecx				; ecx <- lag - 1
	jz	.outer_end			; skip loop if 0
.outer_loop2:
	push	eax				; save sample
	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
	mov	ebx, eax			; ebx == sample+coeff <- sample
	mov	edx, [ebp + 4]			; edx <- data_len
	sub	edx, eax			; edx <- data_len-sample
	xor	eax, eax			; eax == coeff <- 0
.inner_loop2:
	fld	st0				; ST = d d
	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
	inc	ebx				; (sample+coeff)++
	inc	eax				; coeff++
	dec	edx
	jnz	.inner_loop2
	pop	eax				; restore sample
	fstp	st0				; pop d, ST = empty
	inc	eax				; sample++
	dec	ecx
	jnz	.outer_loop2
.outer_end:
	jmp	.end

.lag_eq_6_plus_1:
	; Computes autoc[6] only: the sum of data[i]*data[i+6] over data_len - 6
	; iterations. autoc[0..5] are not touched here.
	; NOTE(review): no branch in the visible source targets this label and the
	; preceding block ends in an unconditional jmp, so this code is unreachable
	; as shown. Presumably meant to extend the lag == 6 path - confirm.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 6			; iterations for which sample+6 is in range
	ALIGN 16
.lag_6_1_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]		; st0 = data[sample+6], st1 = d
	fmulp	st1, st0			; st0 = d*data[sample+6]
	add	esi, byte 4			; [CR] sample++
	faddp	st1, st0			; add to autoc[6]
	dec	ecx
	jnz	.lag_6_1_loop
	fstp	dword [edi + 24]		; store autoc[6]
	jmp	.end

.lag_eq_6_plus_2:
	; Computes autoc[6] and autoc[7] only (st0 = autoc[6], st1 = autoc[7]
	; between iterations). The loop runs data_len - 7 times; the tail adds the
	; one remaining lag-6 product. autoc[0..5] are not touched here.
	; NOTE(review): no branch in the visible source targets this label, so this
	; code is unreachable as shown - confirm intended use.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[7]
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 7			; iterations for which sample+7 is in range
	ALIGN 16
.lag_6_2_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1			; st0 = d*data[sample+6]
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; st0 = d*data[sample+7]; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; add to autoc[7]
	dec	ecx
	jnz	.lag_6_2_loop
	; clean up the leftovers
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; add to autoc[6]
	fstp	dword [edi + 24]		; store autoc[6]
	fstp	dword [edi + 28]		; store autoc[7]
	jmp	.end

.lag_eq_6_plus_3:
	; Computes autoc[6..8] only (st0 = autoc[6] ... st2 = autoc[8] between
	; iterations). The loop runs data_len - 8 times; the tail handles the two
	; remaining samples with one fewer lag each. autoc[0..5] are not touched.
	; NOTE(review): no branch in the visible source targets this label, so this
	; code is unreachable as shown - confirm intended use.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[8]
	fldz					; will accumulate autoc[7]
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 8			; iterations for which sample+8 is in range
	ALIGN 16
.lag_6_3_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[8]
	dec	ecx
	jnz	.lag_6_3_loop
	; clean up the leftovers
	; lags 6 and 7 still in range
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; add to autoc[7]
	; lag 6 only
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; add to autoc[6]
	fstp	dword [edi + 24]		; store autoc[6]
	fstp	dword [edi + 28]		; store autoc[7]
	fstp	dword [edi + 32]		; store autoc[8]
	jmp	.end

.lag_eq_6_plus_4:
	; Computes autoc[6..9] only (st0 = autoc[6] ... st3 = autoc[9] between
	; iterations). The loop runs data_len - 9 times; the tail handles the three
	; remaining samples with one fewer lag each. autoc[0..5] are not touched.
	; NOTE(review): no branch in the visible source targets this label, so this
	; code is unreachable as shown - confirm intended use.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[9]
	fldz					; will accumulate autoc[8]
	fldz					; will accumulate autoc[7]
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 9			; iterations for which sample+9 is in range
	ALIGN 16
.lag_6_4_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[9]
	dec	ecx
	jnz	.lag_6_4_loop
	; clean up the leftovers
	; lags 6..8 still in range
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[8]
	; lags 6 and 7
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; add to autoc[7]
	; lag 6 only
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; add to autoc[6]
	fstp	dword [edi + 24]		; store autoc[6]
	fstp	dword [edi + 28]		; store autoc[7]
	fstp	dword [edi + 32]		; store autoc[8]
	fstp	dword [edi + 36]		; store autoc[9]
	jmp	.end

.lag_eq_6_plus_5:
	; Computes autoc[6..10] only (st0 = autoc[6] ... st4 = autoc[10] between
	; iterations). The loop runs data_len - 10 times; the tail handles the four
	; remaining samples with one fewer lag each. autoc[0..5] are not touched.
	; NOTE(review): no branch in the visible source targets this label, so this
	; code is unreachable as shown - confirm intended use.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[10]
	fldz					; will accumulate autoc[9]
	fldz					; will accumulate autoc[8]
	fldz					; will accumulate autoc[7]
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 10			; iterations for which sample+10 is in range
	ALIGN 16
.lag_6_5_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[9]
	fld	dword [esi + 40]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; add to autoc[10]
	dec	ecx
	jnz	.lag_6_5_loop
	; clean up the leftovers
	; lags 6..9 still in range
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[9]
	; lags 6..8
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[8]
	; lags 6 and 7
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; add to autoc[7]
	; lag 6 only
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; add to autoc[6]
	fstp	dword [edi + 24]		; store autoc[6]
	fstp	dword [edi + 28]		; store autoc[7]
	fstp	dword [edi + 32]		; store autoc[8]
	fstp	dword [edi + 36]		; store autoc[9]
	fstp	dword [edi + 40]		; store autoc[10]
	jmp	.end

.lag_eq_6_plus_6:
	; Computes autoc[6..11] only (st0 = autoc[6] ... st5 = autoc[11] between
	; iterations; with the sample and one product in flight this uses all
	; eight x87 registers). The loop runs data_len - 11 times; the tail handles
	; the five remaining samples with one fewer lag each. autoc[0..5] are not
	; touched here.
	; NOTE(review): no branch in the visible source targets this label, so this
	; code is unreachable as shown - confirm intended use.
	mov	ecx, [ebp + 4]			; ecx == data_len
	mov	esi, [ebp]			; esi == data
	mov	edi, [ebp + 12]			; edi == autoc
	fldz					; will accumulate autoc[11]
	fldz					; will accumulate autoc[10]
	fldz					; will accumulate autoc[9]
	fldz					; will accumulate autoc[8]
	fldz					; will accumulate autoc[7]
	fldz					; will accumulate autoc[6]
	sub	ecx, byte 11			; iterations for which sample+11 is in range
	ALIGN 16
.lag_6_6_loop:
	fld	dword [esi]			; st0 = d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[9]
	fld	dword [esi + 40]
	fmul	st0, st1
	faddp	st6, st0			; add to autoc[10]
	fld	dword [esi + 44]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st6, st0			; add to autoc[11]
	dec	ecx
	jnz	.lag_6_6_loop
	; clean up the leftovers
	; lags 6..10 still in range
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[9]
	fld	dword [esi + 40]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; add to autoc[10]
	; lags 6..9
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[9]
	; lags 6..8
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[8]
	; lags 6 and 7
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; add to autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; d is consumed
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; add to autoc[7]
	; lag 6 only
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; add to autoc[6]
	fstp	dword [edi + 24]		; store autoc[6]
	fstp	dword [edi + 28]		; store autoc[7]
	fstp	dword [edi + 32]		; store autoc[8]
	fstp	dword [edi + 36]		; store autoc[9]
	fstp	dword [edi + 40]		; store autoc[10]
	fstp	dword [edi + 44]		; store autoc[11]
	jmp	.end

.end:
	; Common exit for every lag path: restore the registers saved in the
	; prologue, in the reverse order of the pushes, and return to the caller.
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; NOTE(review): NASM has no END directive; unless nasm.h defines `end` as a
; macro, this line assembles as a label named `end` - confirm intent.
end