diff options
Diffstat (limited to 'vpx_scale/win32/scaleopt.c')
-rw-r--r-- | vpx_scale/win32/scaleopt.c | 1749 |
1 files changed, 1749 insertions, 0 deletions
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c new file mode 100644 index 000000000..da0533e6b --- /dev/null +++ b/vpx_scale/win32/scaleopt.c @@ -0,0 +1,1749 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : scaleopt.cpp +* +* Description : Optimized scaling functions +* +****************************************************************************/ +#include "pragmas.h" + + + +/**************************************************************************** +* Module Statics +****************************************************************************/ +__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; +__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; +__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; +__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; +__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; +__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; +__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; +__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; +__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; +__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; +__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; + + + +#include "vpx_scale/vpxscale.h" +#include 
"vpx_mem/vpx_mem.h" + +/**************************************************************************** + * + * ROUTINE : horizontal_line_3_5_scale_mmx + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_3_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + + push ebx + + mov esi, source + mov edi, dest + + mov ecx, source_width + lea edx, [esi+ecx-3]; + + movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx + movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_3_5_loop: + + mov eax, DWORD PTR [esi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [edi], ebx // writeoutput 00 xx xx xx + add esi, 3 + + add edi, 5 + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + cmp esi, edx + packuswb mm0, mm7 + + movd DWORD Ptr [edi-4], mm0 + jl horiz_line_3_5_loop + +//Exit: + mov eax, DWORD PTR [esi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // 
ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl eax, 8 // eax = xx 01 02 02 + and eax, 0xffff0000 // eax = xx xx 02 02 + + or eax, ebx // eax = 01 02 02 02 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [edi], ebx // writeoutput 00 xx xx xx + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + packuswb mm0, mm7 + movd DWORD Ptr [edi+1], mm0 + + pop ebx + + } + +} + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_4_5_scale_mmx + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static +void horizontal_line_4_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void)dest_width; + + __asm + { + + mov esi, source + mov edi, dest + + mov ecx, source_width + lea edx, [esi+ecx-8]; + + movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx + movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_4_5_loop: + + movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 + + movd DWORD PTR [edi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 + + paddw mm0, mm1 // added round values + paddw mm0, mm4 + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 + + movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 + add edi, 10 + + add esi, 8 + paddw mm2, mm3 // + + paddw mm2, mm4 // added round values + cmp esi, edx + + psrlw mm2, 8 + packuswb mm2, mm7 + + movd DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09 + jl horiz_line_4_5_loop + +//Exit: + movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 + + movq mm3, 
mask45 // mm3 = 00 00 00 00 00 00 ff 00 + pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 + + psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 + por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 + + movq mm3, mm1 + + movd DWORD PTR [edi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 + + paddw mm0, mm1 // added round values + paddw mm0, mm4 + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx + + movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 + paddw mm2, mm3 // + + paddw mm2, mm4 // added round values + psrlw mm2, 8 + + packuswb mm2, mm7 + movd DWORD PTR [edi+6], mm2 // writeoutput 06 07 08 09 + + + } +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_4_5_scale_mmx + * + * INPUTS : unsigned char *dest : + * unsigned int dest_pitch : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has a "C" only + * version. 
+ *
+ *                  The band is scaled in place.  Lines 0..3 of the band
+ *                  hold the source lines a, b, c, d and line 5 holds the
+ *                  first line (an) of the next band.  Line 0 is left as is;
+ *                  lines 1..4 are rewritten as
+ *                      des[1] = ( 51*a  + 205*b + 128) >> 8
+ *                      des[2] = (102*b  + 154*c + 128) >> 8
+ *                      des[3] = (154*c  + 102*d + 128) >> 8
+ *                      des[4] = (205*d  +  51*an + 128) >> 8
+ *                  Columns are processed eight bytes at a time and the loop
+ *                  body always runs at least once, so dest_width is
+ *                  effectively rounded up to a multiple of 8.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         esi,    dest            // Get the source and destination pointer
+        mov         ecx,    dest_pitch      // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]     // two lines below
+        add         edi,    ecx             // three lines below
+
+        pxor        mm7,    mm7             // clear out mm7
+        mov         edx,    dest_width      // Loop counter
+
+    vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi] // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx] // src[1];
+
+        movq        mm2,    mm0             // Make a copy
+        punpcklbw   mm0,    mm7             // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7             // unpack high to word
+
+        pmullw      mm0,    mm5             // a * 1/5
+
+        movq        mm3,    mm1             // make a copy
+        punpcklbw   mm1,    mm7             // unpack low to word
+
+        pmullw      mm2,    mm5             // a * 1/5
+        movq        mm6,    four_fifths     // constant
+
+        movq        mm4,    mm1             // copy of low b
+        pmullw      mm4,    mm6             // b * 4/5
+
+        punpckhbw   mm3,    mm7             // unpack high to word
+        movq        mm5,    mm3             // copy of high b
+
+        pmullw      mm5,    mm6             // b * 4/5
+        paddw       mm0,    mm4             // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5             // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values    // + 128
+
+        paddw       mm2,    round_values    // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2             // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0 // write des[1]
+        movq        mm0,    [esi+ecx*2]     // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low / high words)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0             // make a copy
+
+        pmullw      mm1,    mm5             // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7             // unpack low to word
+        pmullw      mm3,    mm5             // b * 2/5
+
+        movq        mm4,    mm0             // make copy of c
+        punpckhbw   mm2,    mm7             // unpack high to word
+
+        pmullw      mm4,    mm6             // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6             // c * 3/5
+        paddw       mm1,    mm4             // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5             // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values    // + 128
+
+        paddw       mm3,    round_values    // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3             // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1 // write des[2]
+        movq        mm1,    [edi]           // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2] (unpacked low / high words)
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6             // c * 3/5
+        movq        mm5,    two_fifths      // mm5 = 2/5
+
+        movq        mm3,    mm1             // make a copy
+        pmullw      mm2,    mm6             // c * 3/5
+
+        punpcklbw   mm1,    mm7             // unpack low
+        movq        mm4,    mm1             // make a copy
+
+        punpckhbw   mm3,    mm7             // unpack high
+        pmullw      mm4,    mm5             // d * 2/5
+
+        movq        mm6,    mm3             // make a copy
+        pmullw      mm6,    mm5             // d * 2/5
+
+        paddw       mm0,    mm4             // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6             // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values    // + 128
+        paddw       mm2,    round_values    // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2             // des[3]
+        movq        QWORD ptr [edi], mm0    // write des[3]
+
+        // mm1, mm3 --- Src[3] (unpacked low / high words)
+        // mm7 -- cleared for unpacking
+
+        movq        mm0,    [edi+ecx*2]     // mm0 = Src[0] of the next group (five lines below dest)
+
+        movq        mm5,    four_fifths     // mm5 = 4/5
+        pmullw      mm1,    mm5             // d * 4/5
+
+        movq        mm6,    one_fifth       // mm6 = 1/5
+        movq        mm2,    mm0             // make a copy
+
+        pmullw      mm3,    mm5             // d * 4/5
+        punpcklbw   mm0,    mm7             // unpack low
+
+        pmullw      mm0,    mm6             // an * 1/5
+        punpckhbw   mm2,    mm7             // unpack high
+
+        paddw       mm1,    mm0             // d * 4/5 + an * 1/5
+        pmullw      mm2,    mm6             // an * 1/5
+
+        paddw       mm3,    mm2             // d * 4/5 + an * 1/5
+        paddw       mm1,    round_values    // + 128
+
+        paddw       mm3,    round_values    // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3             // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm1 // write des[4]
+
+        add         edi,    8               // next eight columns
+        add         esi,    8
+
+        sub         edx,    8
+        jg          vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to the first line of the band.
+ *                  unsigned int dest_pitch : Distance in bytes between lines.
+ *                  unsigned int dest_width : Number of bytes per line to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no line below the
+ *                  band is read: the last source line is copied to the
+ *                  extra output line instead.
The function also has a "C" only
+ *                  version.
+ *
+ *                  For this last band no line below the band is read.
+ *                  Lines 0..3 hold the source lines a, b, c, d; line 0 is
+ *                  left as is and
+ *                      des[1] = ( 51*a + 205*b + 128) >> 8
+ *                      des[2] = (102*b + 154*c + 128) >> 8
+ *                      des[3] = (154*c + 102*d + 128) >> 8
+ *                      des[4] = d
+ *                  Columns are processed eight bytes at a time and the loop
+ *                  body always runs at least once.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest            // Get the source and destination pointer
+        mov         ecx,    dest_pitch      // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]     // two lines below
+        add         edi,    ecx             // three lines below
+
+        pxor        mm7,    mm7             // clear out mm7
+        mov         edx,    dest_width      // Loop counter
+
+    last_vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi] // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx] // src[1];
+
+        movq        mm2,    mm0             // Make a copy
+        punpcklbw   mm0,    mm7             // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7             // unpack high to word
+
+        pmullw      mm0,    mm5             // a * 1/5
+
+        movq        mm3,    mm1             // make a copy
+        punpcklbw   mm1,    mm7             // unpack low to word
+
+        pmullw      mm2,    mm5             // a * 1/5
+        movq        mm6,    four_fifths     // constant
+
+        movq        mm4,    mm1             // copy of low b
+        pmullw      mm4,    mm6             // b * 4/5
+
+        punpckhbw   mm3,    mm7             // unpack high to word
+        movq        mm5,    mm3             // copy of high b
+
+        pmullw      mm5,    mm6             // b * 4/5
+        paddw       mm0,    mm4             // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5             // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values    // + 128
+
+        paddw       mm2,    round_values    // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2             // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0 // write des[1]
+        movq        mm0,    [esi+ecx*2]     // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low / high words)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0             // make a copy
+
+        pmullw      mm1,    mm5             // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7             // unpack low to word
+        pmullw      mm3,    mm5             // b * 2/5
+
+        movq        mm4,    mm0             // make copy of c
+        punpckhbw   mm2,    mm7             // unpack high to word
+
+        pmullw      mm4,    mm6             // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6             // c * 3/5
+        paddw       mm1,    mm4             // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5             // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values    // + 128
+
+        paddw       mm3,    round_values    // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3             // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1 // write des[2]
+        movq        mm1,    [edi]           // mm1=Src[3];
+
+        movq        QWORD ptr [edi+ecx], mm1 // write des[4] (plain copy of Src[3])
+
+        // mm0, mm2 --- Src[2] (unpacked low / high words)
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6             // c * 3/5
+        movq        mm5,    two_fifths      // mm5 = 2/5
+
+        movq        mm3,    mm1             // make a copy
+        pmullw      mm2,    mm6             // c * 3/5
+
+        punpcklbw   mm1,    mm7             // unpack low
+        movq        mm4,    mm1             // make a copy
+
+        punpckhbw   mm3,    mm7             // unpack high
+        pmullw      mm4,    mm5             // d * 2/5
+
+        movq        mm6,    mm3             // make a copy
+        pmullw      mm6,    mm5             // d * 2/5
+
+        paddw       mm0,    mm4             // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6             // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values    // + 128
+        paddw       mm2,    round_values    // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2             // des[3]
+        movq        QWORD ptr [edi], mm0    // write des[3]
+
+        // mm1, mm3 --- Src[3] (unpacked low / high words)
+        // mm7 -- cleared for unpacking
+        add         edi,    8               // next eight columns
+        add         esi,    8
+
+        sub         edx,    8
+        jg          last_vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to the first line of the band.
+ *                  unsigned int dest_pitch : Distance in bytes between lines.
+ *                  unsigned int dest_width : Number of bytes per line to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ *                  The band is scaled in place.  Lines 0..2 of the band
+ *                  hold the source lines a, b, c and line 5 holds the first
+ *                  line (an) of the next band.  Line 0 is left as is and
+ *                      des[1] = (102*a + 154*b + 128) >> 8
+ *                      des[2] = (205*b +  51*c + 128) >> 8
+ *                      des[3] = ( 51*b + 205*c + 128) >> 8
+ *                      des[4] = (154*c + 102*an + 128) >> 8
+ *                  Columns are processed eight bytes at a time and the loop
+ *                  body always runs at least once.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest            // Get the source and destination pointer
+        mov         ecx,    dest_pitch      // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]     // two lines below
+        add         edi,    ecx             // three lines below
+
+        pxor        mm7,    mm7             // clear out mm7
+        mov         edx,    dest_width      // Loop counter
+
+    vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi] // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx] // src[1];
+
+        movq        mm2,    mm0             // Make a copy
+        punpcklbw   mm0,    mm7             // unpack low to word
+
+        movq        mm5,    two_fifths      // mm5 = 2/5
+        punpckhbw   mm2,    mm7             // unpack high to word
+
+        pmullw      mm0,    mm5             // a * 2/5
+
+        movq        mm3,    mm1             // make a copy
+        punpcklbw   mm1,    mm7             // unpack low to word
+
+        pmullw      mm2,    mm5             // a * 2/5
+        movq        mm6,    three_fifths    // mm6 = 3/5
+
+        movq        mm4,    mm1             // copy of low b
+        pmullw      mm4,    mm6             // b * 3/5
+
+        punpckhbw   mm3,    mm7             // unpack high to word
+        movq        mm5,    mm3             // copy of high b
+
+        pmullw      mm5,    mm6             // b * 3/5
+        paddw       mm0,    mm4             // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5             // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values    // + 128
+
+        paddw       mm2,    round_values    // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2             // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0 // write des[1]
+        movq        mm0,    [esi+ecx*2]     // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1] (unpacked low / high words)
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1             // b low
+        pmullw      mm1,    four_fifths     // b * 4/5 low
+
+        movq        mm5,    mm3             // b high
+        pmullw      mm3,    four_fifths     // b * 4/5 high
+
+        movq        mm2,    mm0             // c
+        pmullw      mm4,    one_fifth       // b * 1/5
+
+        punpcklbw   mm0,    mm7             // c low
+        pmullw      mm5,    one_fifth       // b * 1/5
+
+        movq        mm6,    mm0             // make copy of c low
+        punpckhbw   mm2,    mm7             // c high
+
+        pmullw      mm6,    one_fifth       // c * 1/5 low
+        movq        mm7,    mm2             // make copy of c high (mm7 is no longer zero from here on)
+
+        pmullw      mm7,    one_fifth       // c * 1/5 high
+        paddw       mm1,    mm6             // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7             // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0             // make copy of c low
+
+        pmullw      mm6,    four_fifths     // c * 4/5 low
+        movq        mm7,    mm2             // make copy of c high
+
+        pmullw      mm7,    four_fifths     // c * 4/5 high
+
+        paddw       mm4,    mm6             // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7             // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values    // + 128
+        paddw       mm3,    round_values    // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3             // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+        paddw       mm4,    round_values    // + 128
+        paddw       mm5,    round_values    // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5             // des[3]
+        movq        QWORD ptr [edi], mm4    // write des[3]
+
+        // mm0, mm2 --- Src[2] (c, unpacked low / high words)
+
+        pxor        mm7,    mm7             // clear mm7 for unpacking (also needed by the next iteration)
+        movq        mm1,    [edi+ecx*2]     // mm1 = Src[0] of the next group (five lines below dest)
+
+        movq        mm5,    three_fifths    // mm5 = 3/5
+        pmullw      mm0,    mm5             // c * 3/5
+
+        movq        mm6,    two_fifths      // mm6 = 2/5
+        movq        mm3,    mm1             // make a copy
+
+        pmullw      mm2,    mm5             // c * 3/5
+        punpcklbw   mm1,    mm7             // unpack low
+
+        pmullw      mm1,    mm6             // an * 2/5
+        punpckhbw   mm3,    mm7             // unpack high
+
+        paddw       mm0,    mm1             // c * 3/5 + an * 2/5
+        pmullw      mm3,    mm6             // an * 2/5
+
+        paddw       mm2,    mm3             // c * 3/5 + an * 2/5
+        paddw       mm0,    round_values    // + 128
+
+        paddw       mm2,    round_values    // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2             // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm0 // write des[4]
+
+        add         edi,    8               // next eight columns
+        add         esi,    8
+
+        sub         edx,    8
+        jg          vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to the first line of the band.
+ *                  unsigned int dest_pitch : Distance in bytes between lines.
+ *                  unsigned int dest_width : Number of bytes per line to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : No line below the band is read: the last source line is
+ *                  copied to the extra output line instead.
+ *                  The function also has a "C" only
+ *                  version.
+ * + ****************************************************************************/ +static +void last_vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // tow lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + + last_vs_3_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq QWORD ptr [edi+ecx], mm0 // write des[4] + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 
// b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [edi], mm4 // write des[3] + + // mm0, mm2 --- Src[3] + + add edi, 8 + add esi, 8 + + sub edx, 8 + jg last_vs_3_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_1_2_scale_mmx + * + * INPUTS : unsigned char *dest : + * unsigned int dest_pitch : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has an "C" only + * version. 
+ *
+ *                  The line at dest and the line two pitches below it are
+ *                  the sources; the line in between is written as
+ *                  (a + b + 1) >> 1.  Columns are processed eight bytes at
+ *                  a time and the loop body always runs at least once.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         esi,    dest            // Get the source and destination pointer
+        mov         ecx,    dest_pitch      // Get the pitch size
+
+        pxor        mm7,    mm7             // clear out mm7
+        mov         edx,    dest_width      // Loop counter
+
+    vs_1_2_loop:
+
+        movq        mm0,    [esi]           // get Src[0]
+        movq        mm1,    [esi + ecx * 2] // get Src[1] (two lines below)
+
+        movq        mm2,    mm0             // make copy before unpack
+        movq        mm3,    mm1             // make copy before unpack
+
+        punpcklbw   mm0,    mm7             // low Src[0]
+        movq        mm6,    four_ones       // mm6= 1, 1, 1, 1
+
+        punpcklbw   mm1,    mm7             // low Src[1]
+        paddw       mm0,    mm1             // low (a + b)
+
+        punpckhbw   mm2,    mm7             // high Src[0]
+        paddw       mm0,    mm6             // low (a + b + 1)
+
+        punpckhbw   mm3,    mm7             // high Src[1]
+        paddw       mm2,    mm3             // high (a + b )
+
+        psraw       mm0,    1               // low (a + b +1 )/2
+        paddw       mm2,    mm6             // high (a + b + 1)
+
+        psraw       mm2,    1               // high (a + b + 1)/2
+        packuswb    mm0,    mm2             // pack results
+
+        movq        [esi+ecx], mm0          // write out eight bytes to the line in between
+        add         esi,    8
+
+        sub         edx,    8
+        jg          vs_1_2_loop
+    }
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest     : Pointer to the last source line.
+ *                  unsigned int dest_pitch : Distance in bytes between lines.
+ *                  unsigned int dest_width : Number of bytes per line to process.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels.
+ *
+ *  SPECIAL NOTES : No line below the band is read: the line at dest is
+ *                  copied to the line one pitch below it.
+ *                  The function also has a "C" only
+ *                  version.
+ *
+ *                  Eight bytes are copied per step and the loop body always
+ *                  runs at least once.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest            // Get the source and destination pointer
+        mov         ecx,    dest_pitch      // Get the pitch size
+
+        mov         edx,    dest_width      // Loop counter
+
+    last_vs_1_2_loop:
+
+        movq        mm0,    [esi]           // get Src[0]
+        movq        [esi+ecx], mm0          // write out eight bytes one line below
+
+        add         esi,    8
+        sub         edx,    8
+
+        jg          last_vs_1_2_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to the source line.
+ *                  unsigned int source_width   : Number of source bytes.
+ *                  unsigned char *dest         : Pointer to the output line.
+ *                  unsigned int dest_width     : Not used.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : Every source pixel a is written out followed by
+ *                  (a + next + 1) >> 1, eight source pixels (sixteen output
+ *                  bytes) per step.  The last step has no "next" pixel for
+ *                  its final source pixel, so that pixel is paired with
+ *                  itself.
+ *                  A line of eight pixels or fewer is handled by the
+ *                  last-step code alone; the main loop is skipped so that
+ *                  nothing beyond the line is scaled or written.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        pxor        mm7,    mm7             // mm7 = 0, used for unpacking
+        movq        mm6,    four_ones       // mm6 = 1, 1, 1, 1
+
+        mov         ecx,    source_width    // remaining source pixels
+
+        cmp         ecx,    8               // only the last step to do?
+        jle         hs_1_2_last             // yes: the main loop must not run
+
+    hs_1_2_loop:
+
+        movq        mm0,    [esi]           // pixels 0..7
+        movq        mm1,    [esi+1]         // pixels 1..8
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        movq        mm4,    mm0             // keep the original pixels
+        punpcklbw   mm0,    mm7
+
+        punpcklbw   mm1,    mm7
+        paddw       mm0,    mm1             // low (a + next)
+
+        paddw       mm0,    mm6             // low (a + next + 1)
+        punpckhbw   mm2,    mm7
+
+        punpckhbw   mm3,    mm7
+        paddw       mm2,    mm3             // high (a + next)
+
+        paddw       mm2,    mm6             // high (a + next + 1)
+        psraw       mm0,    1
+
+        psraw       mm2,    1
+        packuswb    mm0,    mm2             // eight averaged pixels
+
+        movq        mm2,    mm4
+        punpcklbw   mm2,    mm0             // interleave original / average, low half
+
+        movq        [edi],  mm2
+        punpckhbw   mm4,    mm0             // interleave original / average, high half
+
+        movq        [edi+8], mm4
+        add         esi,    8
+
+        add         edi,    16
+        sub         ecx,    8
+
+        cmp         ecx,    8
+        jg          hs_1_2_loop
+
+    hs_1_2_last:
+// last eight pixel
+
+        movq        mm0,    [esi]           // pixels 0..7
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8               // pixels 1..7, 00
+        psrlq       mm3,    56
+
+        psllq       mm3,    56              // pixel 7 isolated in the top byte
+        por         mm1,    mm3             // pixels 1..7, 7
+
+        movq        mm3,    mm1
+        movq        mm4,    mm0             // keep the original pixels
+
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+
+        paddw       mm0,    mm1
+        paddw       mm0,    mm6
+
+        punpckhbw   mm2,    mm7
+        punpckhbw   mm3,    mm7
+
+        paddw       mm2,    mm3
+        paddw       mm2,    mm6
+
+        psraw       mm0,    1
+        psraw       mm2,    1
+
+        packuswb    mm0,    mm2             // eight averaged pixels
+        movq        mm2,    mm4
+
+        punpcklbw   mm2,    mm0             // interleave original / average, low half
+        movq        [edi],  mm2
+
+        punpckhbw   mm4,    mm0             // interleave original / average, high half
+        movq        [edi+8], mm4
+    }
+}
+
+
+
+
+
+__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };  // 5:4 horizontal, weights of pixel n+1
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };  // 5:4 horizontal, weights of pixel n
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_4_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Number of source bytes in the line.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ *                  Each group is fetched with an 8-byte load although only
+ *                  five bytes are consumed, so the last group reads up to
+ *                  three bytes beyond source + source_width.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+    Reference "C" implementation of the assembly below:
+
+    unsigned i;
+    unsigned int a, b, c, d, e;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    for ( i=0; i<source_width; i+=5 )
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+        d = src[3];
+        e = src[4];
+
+        des[0] = a;
+        des[1] = ((b*192 + c* 64 + 128)>>8);
+        des[2] = ((c*128 + d*128 + 128)>>8);
+        des[3] = ((d* 64 + e*192 + 128)>>8);
+
+        src += 5;
+        des += 4;
+    }
+    */
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,        source          ;
+        mov         edi,        dest            ;
+
+        mov         ecx,        source_width    ;
+        movq        mm5,        const54_1       ;
+
+        pxor        mm7,        mm7             ;
+        movq        mm6,        const54_2       ;
+
+        movq        mm4,        round_values    ;
+        lea         edx,        [esi+ecx]       ; end of the source line
+    horizontal_line_5_4_loop:
+
+        movq        mm0,        QWORD PTR [esi] ; 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0             ; 00 01 02 03 04 05 06 07
+
+        psrlq       mm0,        8               ; 01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,        mm7             ; xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,        mm7             ; xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,        mm5             ; 00*256 01*192 02*128 03* 64
+
+        pmullw      mm0,        mm6             ; 01*  0 02* 64 03*128 04*192
+        add         esi,        5
+
+        add         edi,        4
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4             ; + 128
+        psrlw       mm1,        8
+
+        cmp         esi,        edx             ; flags survive the MMX instructions below
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-4], mm1      ; write four output pixels
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
+__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
+
+/*
+ * 5 to 4 down-scaling of a band of five source lines into four destination
+ * lines, four columns per step (the loop body always runs at least once):
+ *     dest line 0 = src line 0
+ *     dest line 1 = (192*src1 +  64*src2 + 128) >> 8
+ *     dest line 2 = (128*src2 + 128*src3 + 128) >> 8
+ *     dest line 3 = ( 64*src3 + 192*src4 + 128) >> 8
+ */
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+    __asm
+    {
+        push ebx
+
+        mov         esi,    source                    // Get the source and destination pointer
+        mov         ecx,    src_pitch               // Get the source pitch size
+
+        mov         edi,    dest                    // destination pointer
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch              // destination pitch size
+        mov         ebx,    dest_width              // Loop counter
+
+    vs_5_4_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0];
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
+        lea         eax,    [esi+ecx*2]             // eax = address of src[2]
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        movq        mm3,    mm2
+        pmullw      mm1,    three_fourths           // src[1] * 3/4
+
+        pmullw      mm2,    one_fourths             // src[2] * 1/4
+        movd        mm4,    [eax+ecx]               // src[3];
+
+        pmullw      mm3,    two_fourths             // src[2] * 2/4
+        punpcklbw   mm4,    mm7
+
+        movq        mm5,    mm4
+        pmullw      mm4,    two_fourths             // src[3] * 2/4
+
+        paddw       mm1,    mm2
+        movd        mm6,    [eax+ecx*2]             // src[4];
+
+        pmullw      mm5,    one_fourths             // src[3] * 1/4
+        paddw       mm1,    round_values;
+
+        paddw       mm3,    mm4
+        psrlw       mm1,    8
+
+        punpcklbw   mm6,    mm7
+        paddw       mm3,    round_values
+
+        pmullw      mm6,    three_fourths           // src[4] * 3/4
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm7
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi], mm0            // write des[0]
+        movd        DWORD PTR [edi+edx], mm1        // write des[1]
+
+
+        paddw       mm5,    mm6
+        movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
+
+        lea         eax,    [edi+edx*2]
+        paddw       mm5,    round_values
+
+        psrlw       mm5,    8
+        add         edi,    4
+
+        packuswb    mm5,    mm7
+        movd        DWORD PTR [eax+edx], mm5        // write des[3]
+
+        add         esi,    4
+        sub         ebx,    4
+
+        jg         vs_5_4_loop
+
+        pop ebx
+    }
+}
+
+
+__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };   // 5:3 horizontal, weights of pixels xx 01 03 xx
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };   // 5:3 horizontal, weights of pixels 00 02 04 xx
+
+
+/*
+ * 5 to 3 down-scaling of a horizontal line of pixels.  Every group of five
+ * source pixels a, b, c, d, e produces three output pixels:
+ *     a, (85*b + 171*c + 128) >> 8, (171*d + 85*e + 128) >> 8
+ * The main loop stores four bytes per group (the fourth is overwritten by
+ * the next group); the last group is stored as exactly three bytes.
+ * Each group is fetched with an 8-byte load, so the last group reads up to
+ * three bytes beyond source + source_width.
+ * A line that holds only one group is handled by the last-group code alone;
+ * the main loop is skipped so that nothing beyond the line is scaled or
+ * written.  dest_width is not used.
+ */
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+
+    (void) dest_width;
+    __asm
+    {
+
+        mov         esi,        source          ;
+        mov         edi,        dest            ;
+
+        mov         ecx,        source_width    ;
+        movq        mm5,        const53_1       ;
+
+        pxor        mm7,        mm7             ;
+        movq        mm6,        const53_2       ;
+
+        movq        mm4,        round_values    ;
+        lea         edx,        [esi+ecx-5]     ; start of the last group
+
+        cmp         esi,        edx             ; already at the last group?
+        jge         horizontal_line_5_3_exit    ; yes: the main loop must not run
+
+    horizontal_line_5_3_loop:
+
+        movq        mm0,        QWORD PTR [esi] ; 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0             ; 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8               ; xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8               ; 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8               ; 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16              ; xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        add         esi,        5
+
+        add         edi,        3
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4             ; + 128
+        psrlw       mm1,        8
+
+        cmp         esi,        edx             ; flags survive the MMX instructions below
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-3], mm1      ; three outputs plus one byte that is overwritten later
+        jl          horizontal_line_5_3_loop
+
+    horizontal_line_5_3_exit:
+//exit condition
+        movq        mm0,        QWORD PTR [esi] ; 00 01 02 03 04 05 06 07
+        movq        mm1,        mm0             ; 00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8               ; xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8               ; 01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8               ; 00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16              ; xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4             ; + 128
+        psrlw       mm1,        8
+
+        packuswb    mm1,        mm7
+        movd        eax,        mm1
+
+        mov         edx,        eax
+        shr         edx,        16
+
+        mov         WORD PTR[edi],   ax         ; outputs 0 and 1
+        mov         BYTE PTR[edi+2], dl         ; output 2
+
+    }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
+
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+    __asm
+    {
+        push ebx
+
+        mov         esi,    source                    // Get the source and destination pointer
+        mov         ecx,    src_pitch               // Get the pitch size
+
+        mov         edi,    dest                    // tow lines below
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch              // Loop counter
+        movq        mm5,    one_thirds
+
+        movq        mm6,    two_thirds
+        mov         ebx,    dest_width;
+
+    vs_5_3_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0];
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]
+        lea         eax,    [esi+ecx*2]             //
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        pmullw      mm1,    mm5
+        pmullw      mm2,    mm6
+
+        movd        mm3,    DWORD ptr [eax+ecx]
+        movd        mm4,    DWORD ptr [eax+ecx*2]
+
+        punpcklbw   mm3,    mm7
+        punpcklbw   mm4,    mm7
+
+        pmullw      mm3,    mm6
+        pmullw      mm4,    mm5
+
+
+        movd        DWORD PTR [edi], mm0
+        paddw       mm1,    mm2
+
+        paddw       mm1,    round_values
+        psrlw       mm1,    8
+
+        packuswb    mm1,    mm7
+        paddw       mm3,    mm4
+
+        paddw       mm3,    round_values
+        movd        DWORD PTR [edi+edx], mm1
+
+        psrlw       mm3,    8
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi+edx*2], mm3
+
+ + add edi, 4 + add esi, 4 + + sub ebx, 4 + jg vs_5_3_loop + + pop ebx + } +} + + + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_2_1_scale + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_2_1_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + (void) source_width; + __asm + { + mov esi, source + mov edi, dest + + pxor mm7, mm7 + mov ecx, dest_width + + xor edx, edx + hs_2_1_loop: + + movq mm0, [esi+edx*2] + psllw mm0, 8 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD Ptr [edi+edx], mm0; + add edx, 4 + + cmp edx, ecx + jl hs_2_1_loop + + } +} + + + +static +void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + (void) dest_pitch; + (void) src_pitch; + vpx_memcpy(dest, source, dest_width); +} + + +__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; +__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; + +static +void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + (void) dest_pitch; + __asm + { + mov esi, source + mov edi, dest + + mov eax, src_pitch + mov edx, dest_width + + pxor mm7, mm7 + sub esi, eax //back one line + + + lea ecx, [esi+edx]; + movq mm6, round_values; + + movq mm5, three_sixteenths; + movq mm4, ten_sixteenths; + + vs_2_1_i_loop: + movd mm0, [esi] // + movd 
mm1, [esi+eax] // + + movd mm2, [esi+eax*2] // + punpcklbw mm0, mm7 + + pmullw mm0, mm5 + punpcklbw mm1, mm7 + + pmullw mm1, mm4 + punpcklbw mm2, mm7 + + pmullw mm2, mm5 + paddw mm0, round_values + + paddw mm1, mm2 + paddw mm0, mm1 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD PTR [edi], mm0 + add esi, 4 + + add edi, 4; + cmp esi, ecx + jl vs_2_1_i_loop + + } +} + + + +void +register_mmxscalers(void) +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; + vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; + vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; + vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; + + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + + + + vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; + vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; + vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; + vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; + vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; + vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; + vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; + + + + +} |