/* DCT insipired by Jeff Tsay's DCT from the maplay package this is an optimized version with manual unroll. References: [1] S. Winograd: "On Computing the Discrete Fourier Transform", Mathematics of Computation, Volume 32, Number 141, January 1978, Pages 175-199 */ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) { { real *in = inbuf; in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; in[2] +=in[1]; in[1] +=in[0]; in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; { #define MACRO0(v) { \ real tmp; \ out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ out2[8-(v)] = tmp * w[26-(v)]; } \ sum0 -= sum1; \ ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; #define MACRO1(v) { \ real sum0,sum1; \ sum0 = tmp1a + tmp2a; \ sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ MACRO0(v); } #define MACRO2(v) { \ real sum0,sum1; \ sum0 = tmp2a - tmp1a; \ sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ MACRO0(v); } const real *c = COS9; real *out2 = o2; real *w = wintab; real *out1 = o1; real *ts = tsbuf; real ta33,ta66,tb33,tb66; ta33 = in[2*3+0] * c[3]; ta66 = in[2*6+0] * c[6]; tb33 = in[2*3+1] * c[3]; tb66 = in[2*6+1] * c[6]; { real tmp1a,tmp2a,tmp1b,tmp2b; tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; MACRO1(0); MACRO2(8); } { real tmp1a,tmp2a,tmp1b,tmp2b; tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; MACRO1(1); MACRO2(7); } { real tmp1a,tmp2a,tmp1b,tmp2b; tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; MACRO1(2); MACRO2(6); } { real tmp1a,tmp2a,tmp1b,tmp2b; tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; MACRO1(3); MACRO2(5); } { real sum0,sum1; sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; MACRO0(4); } } } }