summaryrefslogtreecommitdiff
path: root/vpx_scale/win32/scaleopt.c
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_scale/win32/scaleopt.c')
-rw-r--r--vpx_scale/win32/scaleopt.c1749
1 files changed, 1749 insertions, 0 deletions
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c
new file mode 100644
index 000000000..da0533e6b
--- /dev/null
+++ b/vpx_scale/win32/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scaleopt.c
+*
+* Description : Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+* Module Statics
+****************************************************************************/
+/* Blend weights are fractions of 256, stored as one 16-bit word per MMX lane. */
+__declspec(align(16)) static const unsigned short one_fifth[] = {51, 51, 51, 51};        /* ~1/5 of 256 */
+__declspec(align(16)) static const unsigned short two_fifths[] = {102, 102, 102, 102};   /* ~2/5 of 256 */
+__declspec(align(16)) static const unsigned short three_fifths[] = {154, 154, 154, 154}; /* ~3/5 of 256 */
+__declspec(align(16)) static const unsigned short four_fifths[] = {205, 205, 205, 205};  /* ~4/5 of 256 */
+/* Added to each weighted sum before the >>8 so the result rounds to nearest. */
+__declspec(align(16)) static const unsigned short round_values[] = {128, 128, 128, 128};
+/* The "+1" term of the (a + b + 1) / 2 averages in the 1:2 scalers. */
+__declspec(align(16)) static const unsigned short four_ones[] = {1, 1, 1, 1};
+/* Per-lane weights for the horizontal 4:5 scaler; lane i of _1 pairs with lane i of _2. */
+__declspec(align(16)) static const unsigned short const45_2[] = {205, 154, 102, 51};
+__declspec(align(16)) static const unsigned short const45_1[] = {51, 102, 154, 205};
+/* Keeps only byte 6 of a quadword; the 4:5 tail uses it to duplicate the last pixel. */
+__declspec(align(16)) static const unsigned char mask45[] = {0, 0, 0, 0, 0, 0, 255, 0};
+/* Per-lane weights for the horizontal 3:5 scaler; lane i of _1 pairs with lane i of _2. */
+__declspec(align(16)) static const unsigned short const35_2[] = {154, 51, 205, 102};
+__declspec(align(16)) static const unsigned short const35_1[] = {102, 205, 51, 154};
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_3_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : First pixel of the line to scale.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Where the scaled line is written.
+ * unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
+ * For source pixels a, b, c and d (d being the first
+ * pixel of the following group) the five outputs are
+ * a,
+ * (102*a + 154*b + 128) >> 8,
+ * (205*b + 51*c + 128) >> 8,
+ * ( 51*b + 205*c + 128) >> 8,
+ * (154*c + 102*d + 128) >> 8.
+ *
+ * SPECIAL NOTES : The last group is produced after the loop with c used
+ * in place of d. The loop body executes before its exit
+ * test and every step loads a whole DWORD, so the routine
+ * presumably expects source_width to be a multiple of 3
+ * and at least 6 - TODO confirm against the callers.
+ * The loop bound is an address, so it is tested with an
+ * unsigned branch.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+
+ push ebx // ebx is used as scratch below
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-3]; // edx = address of the last 3-pixel group
+
+ movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
+ movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_3_5_loop:
+
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 // right-hand neighbours * their weights
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 // left-hand neighbours * their weights
+
+ mov [edi], ebx // write output 00 xx xx xx (bytes 1..3 are overwritten below)
+ add esi, 3
+
+ add edi, 5
+ paddw mm0, mm1
+
+ paddw mm0, mm4 // + 128
+ psrlw mm0, 8
+
+ cmp esi, edx // MMX instructions below leave EFLAGS untouched
+ packuswb mm0, mm7
+
+ movd DWORD Ptr [edi-4], mm0 // write outputs 1..4 of this group
+ jb horiz_line_3_5_loop // unsigned: esi and edx are addresses
+
+//Exit: last group, pixel 02 stands in for the missing pixel 03
+ mov eax, DWORD PTR [esi] // eax = 00 01 02 03
+ mov ebx, eax
+
+ and ebx, 0xffff00 // ebx = xx 01 02 xx
+ mov ecx, eax // ecx = 00 01 02 03
+
+ and eax, 0xffff0000 // eax = xx xx 02 03
+ xor ecx, eax // ecx = 00 01 xx xx
+
+ shr ebx, 8 // ebx = 01 02 xx xx
+ or eax, ebx // eax = 01 02 02 03
+
+ shl eax, 8 // eax = xx 01 02 02
+ and eax, 0xffff0000 // eax = xx xx 02 02
+
+ or eax, ebx // eax = 01 02 02 02
+
+ shl ebx, 16 // ebx = xx xx 01 02
+ movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
+
+ or ebx, ecx // ebx = 00 01 01 02
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+ movd mm0, ebx // mm0 = 00 01 01 02
+ pmullw mm1, mm6 //
+
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
+ pmullw mm0, mm5 //
+
+ mov [edi], ebx // write output 00 xx xx xx
+ paddw mm0, mm1
+
+ paddw mm0, mm4 // + 128
+ psrlw mm0, 8
+
+ packuswb mm0, mm7
+ movd DWORD Ptr [edi+1], mm0 // write outputs 1..4 of the last group
+
+ pop ebx
+
+ }
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_4_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : First pixel of the line to scale.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Where the scaled line is written.
+ * unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
+ * For source pixels a, b, c, d and e (e being the first
+ * pixel of the following group) the five outputs are
+ * a,
+ * ( 51*a + 205*b + 128) >> 8,
+ * (102*b + 154*c + 128) >> 8,
+ * (154*c + 102*d + 128) >> 8,
+ * (205*d + 51*e + 128) >> 8.
+ * Two groups (8 source pixels, 10 outputs) are produced
+ * per step.
+ *
+ * SPECIAL NOTES : The last 8 pixels are handled after the loop, with the
+ * final pixel duplicated in place of the missing
+ * neighbour. The loop body executes before its exit test,
+ * so the routine presumably expects source_width to be a
+ * multiple of 8 and at least 16 - TODO confirm against
+ * the callers. The loop bound is an address, so it is
+ * tested with an unsigned branch.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void)dest_width;
+
+ __asm
+ {
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ lea edx, [esi+ecx-8]; // edx = address of the last 8-pixel step
+
+ movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
+ movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
+
+ movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
+ pxor mm7, mm7 // clear mm7
+
+ horiz_line_4_5_loop:
+
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx (source pixel 04)
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
+
+ paddw mm0, mm1 // weighted sum, first group
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ add edi, 10
+
+ add esi, 8
+ paddw mm2, mm3 // weighted sum, second group
+
+ paddw mm2, mm4 // added round values
+ cmp esi, edx // MMX instructions below leave EFLAGS untouched
+
+ psrlw mm2, 8
+ packuswb mm2, mm7
+
+ movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09
+ jb horiz_line_4_5_loop // unsigned: esi and edx are addresses
+
+//Exit: last 8 pixels, pixel 07 stands in for the missing pixel 08
+ movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+
+ movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
+ pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
+
+ psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+
+ movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
+ punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
+
+ pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
+ punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+ movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx (source pixel 04)
+ pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
+
+ punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
+ pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
+
+ paddw mm0, mm1 // weighted sum, first group
+ paddw mm0, mm4 // added round values
+
+ psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
+ packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
+
+ movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
+ paddw mm2, mm3 // weighted sum, second group
+
+ paddw mm2, mm4 // added round values
+ psrlw mm2, 8
+
+ packuswb mm2, mm7
+ movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09
+
+
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band (scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ * Rows 0..3 of the band are read as a, b, c, d and row 5
+ * as an (first row of the next band). Row 0 is left as
+ * is; rows 1..4 are overwritten with
+ * ( 51*a + 205*b + 128) >> 8,
+ * (102*b + 154*c + 128) >> 8,
+ * (154*c + 102*d + 128) >> 8,
+ * (205*d + 51*an + 128) >> 8.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band. The function also has a "C" only
+ * version. Each pass moves whole quadwords, so the rows
+ * are accessed up to the next multiple of 8 pixels.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant 4/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3]
+ // mm7 -- cleared for unpacking
+
+ movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group
+
+ movq mm5, four_fifths // mm5 = 4/5
+ pmullw mm1, mm5 // d * 4/5
+
+ movq mm6, one_fifth // mm6 = 1/5
+ movq mm2, mm0 // make a copy
+
+ pmullw mm3, mm5 // d * 4/5
+ punpcklbw mm0, mm7 // unpack low
+
+ pmullw mm0, mm6 // an * 1/5
+ punpckhbw mm2, mm7 // unpack high
+
+ paddw mm1, mm0 // d * 4/5 + an * 1/5
+ pmullw mm2, mm6 // an * 1/5
+
+ paddw mm3, mm2 // d * 4/5 + an * 1/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4]
+
+ add edi, 8 // advance both row pointers by 8 pixels
+ add esi, 8
+
+ sub edx, 8 // 8 pixels done
+ jg vs_4_5_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band (scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : None
+ *
+ * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ * Rows 0..3 are read as a, b, c, d. Row 0 is left as is;
+ * rows 1..3 get the same blends as in
+ * vertical_band_4_5_scale_mmx and row 4 is a plain copy
+ * of d.
+ *
+ * SPECIAL NOTES : No line below the band is read by this routine; the
+ * original row 3 is duplicated into row 4 instead.
+ * The function also has an "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ last_vs_4_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, one_fifth
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 1/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 1/5
+ movq mm6, four_fifths // constant 4/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 4/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 4/5
+ paddw mm0, mm4 // a * 1/5 + b * 4/5
+
+ paddw mm2, mm5 // a * 1/5 + b * 4/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm5, two_fifths
+ movq mm2, mm0 // make a copy
+
+ pmullw mm1, mm5 // b * 2/5
+ movq mm6, three_fifths
+
+
+ punpcklbw mm0, mm7 // unpack low to word
+ pmullw mm3, mm5 // b * 2/5
+
+ movq mm4, mm0 // make copy of c
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm4, mm6 // c * 3/5
+ movq mm5, mm2
+
+ pmullw mm5, mm6 // c * 3/5
+ paddw mm1, mm4 // b * 2/5 + c * 3/5
+
+ paddw mm3, mm5 // b * 2/5 + c * 3/5
+ paddw mm1, round_values // + 128
+
+ paddw mm3, round_values // + 128
+ psrlw mm1, 8
+
+ psrlw mm3, 8
+ packuswb mm1, mm3 // des[2]
+
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+ movq mm1, [edi] // mm1=Src[3];
+
+ movq QWORD ptr [edi+ecx], mm1 // write des[4]: unblended copy of Src[3]
+
+ // mm0, mm2 --- Src[2]
+ // mm1 --- Src[3]
+ // mm6 --- 3/5
+ // mm7 for unpacking
+
+ pmullw mm0, mm6 // c * 3/5
+ movq mm5, two_fifths // mm5 = 2/5
+
+ movq mm3, mm1 // make a copy
+ pmullw mm2, mm6 // c * 3/5
+
+ punpcklbw mm1, mm7 // unpack low
+ movq mm4, mm1 // make a copy
+
+ punpckhbw mm3, mm7 // unpack high
+ pmullw mm4, mm5 // d * 2/5
+
+ movq mm6, mm3 // make a copy
+ pmullw mm6, mm5 // d * 2/5
+
+ paddw mm0, mm4 // c * 3/5 + d * 2/5
+ paddw mm2, mm6 // c * 3/5 + d * 2/5
+
+ paddw mm0, round_values // + 128
+ paddw mm2, round_values // + 128
+
+ psrlw mm0, 8
+ psrlw mm2, 8
+
+ packuswb mm0, mm2 // des[3]
+ movq QWORD ptr [edi], mm0 // write des[3]
+
+ // mm1, mm3 --- Src[3]
+ // mm7 -- cleared for unpacking
+ add edi, 8 // advance both row pointers by 8 pixels
+ add esi, 8
+
+ sub edx, 8 // 8 pixels done
+ jg last_vs_4_5_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band (scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ * Rows 0..2 of the band are read as a, b, c and row 5 as
+ * an (first row of the next band). Row 0 is left as is;
+ * rows 1..4 are overwritten with
+ * (102*a + 154*b + 128) >> 8,
+ * (205*b + 51*c + 128) >> 8,
+ * ( 51*b + 205*c + 128) >> 8,
+ * (154*c + 102*an + 128) >> 8.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band. The function also has an "C" only
+ * version. mm7 doubles as a scratch register in the
+ * middle of the loop and is zeroed again before the last
+ * blend.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is no longer zero from here on)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm0, mm2 --- Src[2] (c), already unpacked to words
+
+ pxor mm7, mm7 // clear mm7 for unpacking
+ movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group
+
+ movq mm5, three_fifths // mm5 = 3/5
+ pmullw mm0, mm5 // c * 3/5
+
+ movq mm6, two_fifths // mm6 = 2/5
+ movq mm3, mm1 // make a copy
+
+ pmullw mm2, mm5 // c * 3/5
+ punpcklbw mm1, mm7 // unpack low
+
+ pmullw mm1, mm6 // an * 2/5
+ punpckhbw mm3, mm7 // unpack high
+
+ paddw mm0, mm1 // c * 3/5 + an * 2/5
+ pmullw mm3, mm6 // an * 2/5
+
+ paddw mm2, mm3 // c * 3/5 + an * 2/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des[4]
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4]
+
+ add edi, 8 // advance both row pointers by 8 pixels
+ add esi, 8
+
+ sub edx, 8 // 8 pixels done
+ jg vs_3_5_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band (scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an
+ * image. Rows 0..2 are read as a, b, c. Row 0 is left as
+ * is and rows 1..4 are overwritten with
+ * (102*a + 154*b + 128) >> 8,
+ * (205*b + 51*c + 128) >> 8,
+ * ( 51*b + 205*c + 128) >> 8,
+ * c (plain copy).
+ *
+ * SPECIAL NOTES : No line below the band is read by this routine; the
+ * original row 2 is duplicated into row 4 instead.
+ * The function also has an "C" only version.
+ * mm7 doubles as a scratch register inside the loop, so
+ * it is zeroed again at the end of every pass; the
+ * unpacks at the top of the loop rely on it being zero.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ lea edi, [esi+ecx*2] // two lines below
+ add edi, ecx // three lines below
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+
+ last_vs_3_5_loop:
+
+ movq mm0, QWORD ptr [esi] // src[0];
+ movq mm1, QWORD ptr [esi+ecx] // src[1];
+
+ movq mm2, mm0 // Make a copy
+ punpcklbw mm0, mm7 // unpack low to word
+
+ movq mm5, two_fifths // mm5 = 2/5
+ punpckhbw mm2, mm7 // unpack high to word
+
+ pmullw mm0, mm5 // a * 2/5
+
+ movq mm3, mm1 // make a copy
+ punpcklbw mm1, mm7 // unpack low to word
+
+ pmullw mm2, mm5 // a * 2/5
+ movq mm6, three_fifths // mm6 = 3/5
+
+ movq mm4, mm1 // copy of low b
+ pmullw mm4, mm6 // b * 3/5
+
+ punpckhbw mm3, mm7 // unpack high to word
+ movq mm5, mm3 // copy of high b
+
+ pmullw mm5, mm6 // b * 3/5
+ paddw mm0, mm4 // a * 2/5 + b * 3/5
+
+ paddw mm2, mm5 // a * 2/5 + b * 3/5
+ paddw mm0, round_values // + 128
+
+ paddw mm2, round_values // + 128
+ psrlw mm0, 8
+
+ psrlw mm2, 8
+ packuswb mm0, mm2 // des [1]
+
+ movq QWORD ptr [esi+ecx], mm0 // write des[1]
+ movq mm0, [esi+ecx*2] // mm0 = src[2]
+
+
+
+ // mm1, mm3 --- Src[1]
+ // mm0 --- Src[2]
+ // mm7 for unpacking
+
+ movq mm4, mm1 // b low
+ pmullw mm1, four_fifths // b * 4/5 low
+
+ movq QWORD ptr [edi+ecx], mm0 // write des[4]: unblended copy of Src[2]
+
+ movq mm5, mm3 // b high
+ pmullw mm3, four_fifths // b * 4/5 high
+
+ movq mm2, mm0 // c
+ pmullw mm4, one_fifth // b * 1/5
+
+ punpcklbw mm0, mm7 // c low
+ pmullw mm5, one_fifth // b * 1/5
+
+ movq mm6, mm0 // make copy of c low
+ punpckhbw mm2, mm7 // c high
+
+ pmullw mm6, one_fifth // c * 1/5 low
+ movq mm7, mm2 // make copy of c high (mm7 is no longer zero from here on)
+
+ pmullw mm7, one_fifth // c * 1/5 high
+ paddw mm1, mm6 // b * 4/5 + c * 1/5 low
+
+ paddw mm3, mm7 // b * 4/5 + c * 1/5 high
+ movq mm6, mm0 // make copy of c low
+
+ pmullw mm6, four_fifths // c * 4/5 low
+ movq mm7, mm2 // make copy of c high
+
+ pmullw mm7, four_fifths // c * 4/5 high
+
+ paddw mm4, mm6 // b * 1/5 + c * 4/5 low
+ paddw mm5, mm7 // b * 1/5 + c * 4/5 high
+
+ paddw mm1, round_values // + 128
+ paddw mm3, round_values // + 128
+
+ psrlw mm1, 8
+ psrlw mm3, 8
+
+ packuswb mm1, mm3 // des[2]
+ movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+ paddw mm4, round_values // + 128
+ paddw mm5, round_values // + 128
+
+ psrlw mm4, 8
+ psrlw mm5, 8
+
+ packuswb mm4, mm5 // des[3]
+ movq QWORD ptr [edi], mm4 // write des[3]
+
+ // mm7 held c * 4/5 above; zero it again so the unpacks at the
+ // top of the next pass interleave with zero bytes.
+ pxor mm7, mm7
+
+ add edi, 8 // advance both row pointers by 8 pixels
+ add esi, 8
+
+ sub edx, 8 // 8 pixels done; sub sets the flags tested by jg
+ jg last_vs_3_5_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Upper source row (band is scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of a band of pixels.
+ * Reads row 0 (a) and row 2 (b) and writes
+ * (a + b + 1) >> 1 into row 1, the row between them.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ * the current band. The function also has an "C" only
+ * version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ pxor mm7, mm7 // clear out mm7
+ mov edx, dest_width // Loop counter
+
+ vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0]
+ movq mm1, [esi + ecx * 2] // get Src[1] (two rows below Src[0])
+
+ movq mm2, mm0 // make copy before unpack
+ movq mm3, mm1 // make copy before unpack
+
+ punpcklbw mm0, mm7 // low Src[0]
+ movq mm6, four_ones // mm6= 1, 1, 1, 1
+
+ punpcklbw mm1, mm7 // low Src[1]
+ paddw mm0, mm1 // low (a + b)
+
+ punpckhbw mm2, mm7 // high Src[0]
+ paddw mm0, mm6 // low (a + b + 1)
+
+ punpckhbw mm3, mm7 // high Src[1]
+ paddw mm2, mm3 // high (a + b )
+
+ psraw mm0, 1 // low (a + b +1 )/2
+ paddw mm2, mm6 // high (a + b + 1)
+
+ psraw mm2, 1 // high (a + b + 1)/2
+ packuswb mm0, mm2 // pack results
+
+ movq [esi+ecx], mm0 // write out eight bytes to the row in between
+ add esi, 8
+
+ sub edx, 8 // 8 pixels done
+ jg vs_1_2_loop
+ }
+
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_1_2_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Source row (band is scaled in place).
+ * unsigned int dest_pitch : Byte distance between rows.
+ * unsigned int dest_width : Pixels per row; consumed 8 at a time.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of band of pixels.
+ * Copies row 0 unchanged into row 1.
+ *
+ * SPECIAL NOTES : No line below the band is read by this routine; the
+ * row is simply duplicated. The function also has an
+ * "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+ unsigned char *dest,
+ unsigned int dest_pitch,
+ unsigned int dest_width
+)
+{
+ __asm
+ {
+ mov esi, dest // Get the source and destination pointer
+ mov ecx, dest_pitch // Get the pitch size
+
+ mov edx, dest_width // Loop counter
+
+ last_vs_1_2_loop:
+
+ movq mm0, [esi] // get Src[0]
+ movq [esi+ecx], mm0 // write out eight bytes
+
+ add esi, 8
+ sub edx, 8 // 8 pixels done; sub sets the flags tested by jg
+
+ jg last_vs_1_2_loop
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_1_2_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : First pixel of the line to scale.
+ * unsigned int source_width : Number of source pixels.
+ * unsigned char *dest : Where the scaled line is written.
+ * unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
+ * Every source pixel p[i] is written out followed by
+ * (p[i] + p[i+1] + 1) >> 1. Eight source pixels give
+ * sixteen output pixels per step.
+ *
+ * SPECIAL NOTES : The last eight pixels are handled after the loop, with
+ * the final pixel duplicated in place of the missing
+ * right-hand neighbour. The loop body runs once before
+ * its exit test, so the routine presumably expects
+ * source_width to be a multiple of 8 and at least 16 -
+ * TODO confirm against the callers.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ (void) dest_width;
+
+ __asm
+ {
+ mov esi, source
+ mov edi, dest
+
+ pxor mm7, mm7 // zero register for unpacking
+ movq mm6, four_ones // mm6 = 1, 1, 1, 1
+
+ mov ecx, source_width // pixels still to process
+
+ hs_1_2_loop:
+
+ movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, [esi+1] // mm1 = 01 02 03 04 05 06 07 08
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ movq mm4, mm0 // keep the original pixels for interleaving
+ punpcklbw mm0, mm7
+
+ punpcklbw mm1, mm7
+ paddw mm0, mm1 // low (a + b)
+
+ paddw mm0, mm6 // low (a + b + 1)
+ punpckhbw mm2, mm7
+
+ punpckhbw mm3, mm7
+ paddw mm2, mm3 // high (a + b)
+
+ paddw mm2, mm6 // high (a + b + 1)
+ psraw mm0, 1
+
+ psraw mm2, 1
+ packuswb mm0, mm2 // mm0 = eight averaged pixels
+
+ movq mm2, mm4
+ punpcklbw mm2, mm0 // interleave original / average, low half
+
+ movq [edi], mm2
+ punpckhbw mm4, mm0 // interleave original / average, high half
+
+ movq [edi+8], mm4
+ add esi, 8
+
+ add edi, 16
+ sub ecx, 8
+
+ cmp ecx, 8 // keep looping while more than 8 pixels remain
+ jg hs_1_2_loop
+
+// last eight pixel
+
+ movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0
+
+ movq mm2, mm0
+ movq mm3, mm1
+
+ psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
+ psrlq mm3, 56
+
+ psllq mm3, 56 // mm3 = 00 00 00 00 00 00 00 07
+ por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
+
+ movq mm3, mm1
+ movq mm4, mm0 // keep the original pixels for interleaving
+
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+
+ paddw mm0, mm1
+ paddw mm0, mm6
+
+ punpckhbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ paddw mm2, mm3
+ paddw mm2, mm6
+
+ psraw mm0, 1
+ psraw mm2, 1
+
+ packuswb mm0, mm2
+ movq mm2, mm4
+
+ punpcklbw mm2, mm0
+ movq [edi], mm2
+
+ punpckhbw mm4, mm0
+ movq [edi+8], mm4
+ }
+}
+
+
+
+
+
+/* Per-lane weights (fractions of 256) for the horizontal 5:4 scaler; lane i of _1 pairs with lane i of _2. */
+__declspec(align(16)) static const unsigned short const54_2[] = {0, 64, 128, 192};
+__declspec(align(16)) static const unsigned short const54_1[] = {256, 192, 128, 64};
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_5_4_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Number of source pixels in the line.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Width of destination (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling down by 5 to 4. For each group of
+ * source pixels a, b, c, d, e the four outputs are
+ * a,
+ * (192*b + 64*c + 128) >> 8,
+ * (128*c + 128*d + 128) >> 8,
+ * ( 64*d + 192*e + 128) >> 8.
+ *
+ * SPECIAL NOTES : Each step loads a whole quadword although only five
+ * bytes are used, so up to three bytes past the last
+ * group are read (and ignored). The loop bound is an
+ * address, so it is tested with an unsigned branch.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+ /*
+ Reference "C" version of the loop below:
+
+ unsigned i;
+ unsigned int a, b, c, d, e;
+ unsigned char *des = dest;
+ const unsigned char *src = source;
+
+ (void) dest_width;
+
+ for ( i=0; i<source_width; i+=5 )
+ {
+ a = src[0];
+ b = src[1];
+ c = src[2];
+ d = src[3];
+ e = src[4];
+
+ des[0] = a;
+ des[1] = ((b*192 + c* 64 + 128)>>8);
+ des[2] = ((c*128 + d*128 + 128)>>8);
+ des[3] = ((d* 64 + e*192 + 128)>>8);
+
+ src += 5;
+ des += 4;
+ }
+ */
+ (void) dest_width;
+
+ __asm
+ {
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ movq mm5, const54_1 // mm5 = 256 192 128 64
+
+ pxor mm7, mm7 // zero register for unpacking
+ movq mm6, const54_2 // mm6 = 0 64 128 192
+
+ movq mm4, round_values // mm4 = 128 128 128 128
+ lea edx, [esi+ecx] // edx = end of the source line
+ horizontal_line_5_4_loop:
+
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psrlq mm0, 8 // mm0 = 01 02 03 04 05 06 07 xx
+ punpcklbw mm1, mm7 // mm1 = 00 xx 01 xx 02 xx 03 xx
+
+ punpcklbw mm0, mm7 // mm0 = 01 xx 02 xx 03 xx 04 xx
+ pmullw mm1, mm5 // 00*256 01*192 02*128 03* 64
+
+ pmullw mm0, mm6 // 01* 0 02* 64 03*128 04*192
+ add esi, 5
+
+ add edi, 4
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // + 128
+ psrlw mm1, 8
+
+ cmp esi, edx // MMX instructions below leave EFLAGS untouched
+ packuswb mm1, mm7
+
+ movd DWORD PTR [edi-4], mm1 // write the four outputs of this group
+
+ jb horizontal_line_5_4_loop // unsigned: esi and edx are addresses
+
+ }
+
+}
+/* Quarter-step blend weights as fractions of 256, one 16-bit word per MMX lane. */
+__declspec(align(16)) static const unsigned short one_fourths[] = {64, 64, 64, 64};         /* 1/4 of 256 */
+__declspec(align(16)) static const unsigned short two_fourths[] = {128, 128, 128, 128};     /* 2/4 of 256 */
+__declspec(align(16)) static const unsigned short three_fourths[] = {192, 192, 192, 192};   /* 3/4 of 256 */
+
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ /*
+ * 5 to 4 down-scaling of a 5-row band. Source rows 0..4 (src_pitch
+ * apart) are read as a, b, c, d, e and four destination rows
+ * (dest_pitch apart) are written:
+ * row 0 = a
+ * row 1 = (192*b + 64*c + 128) >> 8
+ * row 2 = (128*c + 128*d + 128) >> 8
+ * row 3 = ( 64*d + 192*e + 128) >> 8
+ * dest_width is the loop counter; four pixels are handled per pass.
+ */
+ __asm
+ {
+ push ebx
+
+ mov esi, source // source pointer
+ mov ecx, src_pitch // source pitch
+
+ mov edi, dest // destination pointer
+ pxor mm7, mm7 // clear out mm7
+
+ mov edx, dest_pitch // destination pitch
+ mov ebx, dest_width // loop counter
+
+ vs_5_4_loop:
+
+ movd mm0, DWORD ptr [esi] // src[0];
+ movd mm1, DWORD ptr [esi+ecx] // src[1];
+
+ movd mm2, DWORD ptr [esi+ecx*2] // src[2];
+ lea eax, [esi+ecx*2] // eax = address of src[2]
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ movq mm3, mm2 // second copy of c
+ pmullw mm1, three_fourths // b * 3/4
+
+ pmullw mm2, one_fourths // c * 1/4
+ movd mm4, [eax+ecx] // src[3];
+
+ pmullw mm3, two_fourths // c * 2/4
+ punpcklbw mm4, mm7
+
+ movq mm5, mm4 // second copy of d
+ pmullw mm4, two_fourths // d * 2/4
+
+ paddw mm1, mm2 // b * 3/4 + c * 1/4
+ movd mm6, [eax+ecx*2] // src[4];
+
+ pmullw mm5, one_fourths // d * 1/4
+ paddw mm1, round_values;
+
+ paddw mm3, mm4 // c * 2/4 + d * 2/4
+ psrlw mm1, 8
+
+ punpcklbw mm6, mm7
+ paddw mm3, round_values // + 128
+
+ pmullw mm6, three_fourths // e * 3/4
+ psrlw mm3, 8
+
+ packuswb mm1, mm7
+ packuswb mm3, mm7
+
+ movd DWORD PTR [edi], mm0 // write des[0] (unmodified src[0])
+ movd DWORD PTR [edi+edx], mm1 // write des[1]
+
+
+ paddw mm5, mm6 // d * 1/4 + e * 3/4
+ movd DWORD PTR [edi+edx*2], mm3 // write des[2]
+
+ lea eax, [edi+edx*2] // eax = address of des[2], taken before edi advances
+ paddw mm5, round_values // + 128
+
+ psrlw mm5, 8
+ add edi, 4
+
+ packuswb mm5, mm7
+ movd DWORD PTR [eax+edx], mm5 // write des[3]
+
+ add esi, 4
+ sub ebx, 4 // 4 pixels done; sub sets the flags tested by jg
+
+ jg vs_5_4_loop
+
+ pop ebx
+ }
+}
+
+
+/* Per-lane weights (fractions of 256) for the horizontal 5:3 scaler; lane i of _1 pairs with lane i of _2. */
+__declspec(align(16)) static const unsigned short const53_1[] = {0, 85, 171, 0};
+__declspec(align(16)) static const unsigned short const53_2[] = {256, 171, 85, 0};
+
+
+/*
+ * horizontal_line_5_3_scale_mmx
+ *
+ * 5 to 3 down-scaling of a horizontal line of pixels. For each group of
+ * source pixels a, b, c, d, e the three outputs are
+ * a,
+ * ( 85*b + 171*c + 128) >> 8,
+ * (171*d + 85*e + 128) >> 8.
+ *
+ * source / source_width : source line and its length in pixels.
+ * dest : where the scaled line is written.
+ * dest_width : not used.
+ *
+ * Inside the loop each group is stored as a DWORD whose fourth byte is
+ * zero and is overwritten by the next group's store; the final group is
+ * produced after the loop and stored as exactly three bytes. Every step
+ * loads a whole quadword, so up to three bytes past the last group are
+ * read (and ignored). The loop body runs before its exit test, so the
+ * routine presumably expects source_width to be a multiple of 5 and at
+ * least 10 - TODO confirm against the callers. The loop bound is an
+ * address, so it is tested with an unsigned branch.
+ */
+static
+void horizontal_line_5_3_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{
+
+ (void) dest_width;
+ __asm
+ {
+
+ mov esi, source
+ mov edi, dest
+
+ mov ecx, source_width
+ movq mm5, const53_1 // mm5 = 0 85 171 0
+
+ pxor mm7, mm7 // zero register for packing
+ movq mm6, const53_2 // mm6 = 256 171 85 0
+
+ movq mm4, round_values // mm4 = 128 128 128 128
+ lea edx, [esi+ecx-5] // edx = address of the last 5-pixel group
+ horizontal_line_5_3_loop:
+
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 // mm0 = xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 // mm1 = 01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 // mm0 = 00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 // mm1 = xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6 // 00*256 02*171 04* 85 06* 0
+
+ pmullw mm1, mm5 // 0 01* 85 03*171 05* 0
+ add esi, 5
+
+ add edi, 3
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // + 128
+ psrlw mm1, 8
+
+ cmp esi, edx // MMX instructions below leave EFLAGS untouched
+ packuswb mm1, mm7
+
+ movd DWORD PTR [edi-3], mm1 // three outputs plus a zero byte the next store overwrites
+ jb horizontal_line_5_3_loop // unsigned: esi and edx are addresses
+
+//exit condition: last group, stored as exactly three bytes
+ movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
+ movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
+
+ psllw mm0, 8 // mm0 = xx 00 xx 02 xx 04 xx 06
+ psrlw mm1, 8 // mm1 = 01 xx 03 xx 05 xx 07 xx
+
+ psrlw mm0, 8 // mm0 = 00 xx 02 xx 04 xx 06 xx
+ psllq mm1, 16 // mm1 = xx xx 01 xx 03 xx 05 xx
+
+ pmullw mm0, mm6
+
+ pmullw mm1, mm5
+ paddw mm1, mm0
+
+ paddw mm1, mm4 // + 128
+ psrlw mm1, 8
+
+ packuswb mm1, mm7
+ movd eax, mm1 // eax = out0 out1 out2 00
+
+ mov edx, eax
+ shr edx, 16 // dl = out2
+
+ mov WORD PTR[edi], ax // write outputs 0 and 1
+ mov BYTE PTR[edi+2], dl // write output 2
+
+ }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; // 85, used with a shift right by 8
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; // 171, used with a shift right by 8
+ // Builds 3 destination rows from 5 source rows, 4 columns per pass: row0, (85*row1 + 171*row2 + 128) shifted right by 8, (171*row3 + 85*row4 + 128) shifted right by 8.
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+ __asm
+ {
+ push ebx // saved here, restored by the pop at the end; ebx is the column counter
+
+ mov esi, source // esi -> source row 0
+ mov ecx, src_pitch // ecx = source row pitch
+
+ mov edi, dest // edi -> destination row 0
+ pxor mm7, mm7 // mm7 = 0: zero operand for the unpack/pack instructions below
+
+ mov edx, dest_pitch // edx = destination row pitch
+ movq mm5, one_thirds // mm5 = weight 85 in every lane
+
+ movq mm6, two_thirds // mm6 = weight 171 in every lane
+ mov ebx, dest_width; // ebx = columns left; reduced by 4 per pass
+
+ vs_5_3_loop: // the count is tested at the bottom, so one pass always runs
+
+ movd mm0, DWORD ptr [esi] // mm0 = 4 bytes of source row 0 (stored unmodified)
+ movd mm1, DWORD ptr [esi+ecx] // mm1 = 4 bytes of source row 1
+
+ movd mm2, DWORD ptr [esi+ecx*2] // mm2 = 4 bytes of source row 2
+ lea eax, [esi+ecx*2] // eax -> source row 2
+
+ punpcklbw mm1, mm7 // widen row 1 bytes to words
+ punpcklbw mm2, mm7 // widen row 2 bytes to words
+
+ pmullw mm1, mm5 // row1 * 85
+ pmullw mm2, mm6 // row2 * 171
+
+ movd mm3, DWORD ptr [eax+ecx] // mm3 = 4 bytes of source row 3
+ movd mm4, DWORD ptr [eax+ecx*2] // mm4 = 4 bytes of source row 4
+
+ punpcklbw mm3, mm7 // widen row 3 bytes to words
+ punpcklbw mm4, mm7 // widen row 4 bytes to words
+
+ pmullw mm3, mm6 // row3 * 171
+ pmullw mm4, mm5 // row4 * 85
+
+
+ movd DWORD PTR [edi], mm0 // dest row 0 = source row 0
+ paddw mm1, mm2 // out row 1 = row1*85 + row2*171
+
+ paddw mm1, round_values // add the rounding term
+ psrlw mm1, 8 // shift right by 8
+
+ packuswb mm1, mm7 // words -> bytes with unsigned saturation
+ paddw mm3, mm4 // out row 2 = row3*171 + row4*85
+
+ paddw mm3, round_values // add the rounding term
+ movd DWORD PTR [edi+edx], mm1 // dest row 1
+
+ psrlw mm3, 8 // shift right by 8
+ packuswb mm3, mm7 // words -> bytes with unsigned saturation
+
+ movd DWORD PTR [edi+edx*2], mm3 // dest row 2
+
+
+ add edi, 4 // next 4 destination columns
+ add esi, 4 // next 4 source columns
+
+ sub ebx, 4 // 4 columns done
+ jg vs_5_3_loop // repeat while columns remain
+
+ pop ebx
+ }
+}
+
+
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_2_1_scale_mmx
+ *
+ * INPUTS : const unsigned char *source :
+ * unsigned int source_width :
+ * unsigned char *dest :
+ * unsigned int dest_width :
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+ const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width
+)
+{ // Keeps every other source byte: dest[i] = source[2*i], 4 output bytes per pass, until the output index reaches dest_width.
+ (void) dest_width; // NOTE: despite this cast, dest_width IS read by the asm below as the loop bound
+ (void) source_width; // source_width is not referenced by the asm
+ __asm
+ {
+ mov esi, source // esi -> source line
+ mov edi, dest // edi -> destination line
+
+ pxor mm7, mm7 // mm7 = 0 for packuswb
+ mov ecx, dest_width // ecx = loop bound in output bytes
+
+ xor edx, edx // edx = output byte index, starts at 0
+ hs_2_1_loop: // the bound is tested at the bottom, so one pass always runs
+
+ movq mm0, [esi+edx*2] // load 8 source bytes at source + 2*index
+ psllw mm0, 8 // shift the odd byte out of each 16-bit lane
+
+ psrlw mm0, 8 // leaves the even byte zero-extended in each lane
+ packuswb mm0, mm7 // pack the 4 kept bytes into the low dword
+
+ movd DWORD Ptr [edi+edx], mm0; // store 4 output bytes
+ add edx, 4 // 4 outputs done
+
+ cmp edx, ecx
+ jl hs_2_1_loop // signed compare of index against dest_width
+
+ }
+}
+
+
+
+/* Produces the output row by copying the first dest_width bytes of `source`
+ * into `dest`; no filtering is applied and neither pitch is consulted. */
+static void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ /* Both pitches are accepted only to match the other vertical band scalers. */
+ (void) src_pitch;
+ (void) dest_pitch;
+
+ vpx_memcpy(dest, source, dest_width);
+}
+
+
+__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; // 48 = 3/16 of 256
+__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; // 160 = 10/16 of 256
+ // Writes one filtered row: dest = (48*above + 160*current + 48*below + 128) shifted right by 8, where above/below are source -/+ src_pitch.
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+ // dest_pitch is unused: only a single destination row is written.
+ (void) dest_pitch;
+ __asm
+ {
+ mov esi, source // esi -> current source row
+ mov edi, dest // edi -> destination row
+
+ mov eax, src_pitch // eax = source row pitch
+ mov edx, dest_width // edx = number of columns
+
+ pxor mm7, mm7 // mm7 = 0 for unpack/pack
+ sub esi, eax // esi -> the row above `source` (that row is read below)
+
+
+ lea ecx, [esi+edx]; // ecx = end of the row above; loop bound for esi
+ movq mm6, round_values; // NOTE: mm6 is not referenced again; rounding is added from memory below
+
+ movq mm5, three_sixteenths; // mm5 = weight 48 for the rows above and below
+ movq mm4, ten_sixteenths; // mm4 = weight 160 for the current row
+
+ vs_2_1_i_loop: // the bound is tested at the bottom, so one pass (4 columns) always runs
+ movd mm0, [esi] // 4 bytes of the row above
+ movd mm1, [esi+eax] // 4 bytes of the current row
+
+ movd mm2, [esi+eax*2] // 4 bytes of the row below
+ punpcklbw mm0, mm7 // widen bytes to words
+
+ pmullw mm0, mm5 // above * 48
+ punpcklbw mm1, mm7 // widen bytes to words
+
+ pmullw mm1, mm4 // current * 160
+ punpcklbw mm2, mm7 // widen bytes to words
+
+ pmullw mm2, mm5 // below * 48
+ paddw mm0, round_values // add the rounding term
+
+ paddw mm1, mm2 // current*160 + below*48
+ paddw mm0, mm1 // full weighted sum plus rounding
+
+ psrlw mm0, 8 // shift right by 8
+ packuswb mm0, mm7 // words -> bytes with unsigned saturation
+
+ movd DWORD PTR [edi], mm0 // store 4 output bytes
+ add esi, 4 // next 4 source columns
+
+ add edi, 4; // next 4 destination columns
+ cmp esi, ecx
+ jl vs_2_1_i_loop // signed comparison of the two pointers
+
+ }
+}
+
+
+
+/* Installs the scaler implementations for this build into the vp8_* scaler
+ * function pointers. The assignments are independent of one another. */
+void register_mmxscalers(void)
+{
+ /* 1_2 scalers: MMX routines. */
+ vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
+ vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
+ vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
+
+ /* 3_5 scalers: MMX routines. */
+ vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
+ vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
+ vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
+
+ /* 4_5 scalers: MMX routines. */
+ vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
+ vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
+ vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
+
+ /* 3_4 scalers: the vp8cx_*_c routines are installed here. */
+ vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
+ vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
+ vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
+
+ /* 2_3 scalers: the vp8cx_*_c routines are installed here. */
+ vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
+ vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
+ vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
+
+ /* 5_4 scalers: MMX routines. */
+ vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
+ vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
+
+ /* 5_3 scalers: MMX routines. */
+ vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
+ vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
+
+ /* 2_1 scalers (plain and "_i" vertical variants): MMX routines. */
+ vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
+ vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
+ vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
+}