1 files changed, 1749 insertions, 0 deletions
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c
new file mode 100644
index 000000000..da0533e6b
--- /dev/null
+++ b/vpx_scale/win32/scaleopt.c
@@ -0,0 +1,1749 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     scaleopt.cpp
+*
+*   Description  :     Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+*  Module Statics
+****************************************************************************/
+__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
+__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
+__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_3_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+
+        push ebx
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-3];
+
+        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
+        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        horiz_line_3_5_loop:
+
+        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov        ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             //
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             //
+
+        mov         [edi],  ebx             // writeoutput 00 xx xx xx
+        add         esi,    3
+
+        add         edi,    5
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm4
+        psrlw       mm0,    8
+
+        cmp         esi,    edx
+        packuswb    mm0,    mm7
+
+        movd        DWORD Ptr [edi-4], mm0
+        jl          horiz_line_3_5_loop
+
+//Exit:
+        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
+        mov         ebx,    eax
+
+        and         ebx,    0xffff00        // ebx = xx 01 02 xx
+        mov         ecx,    eax             // ecx = 00 01 02 03
+
+        and         eax,    0xffff0000      // eax = xx xx 02 03
+        xor         ecx,    eax             // ecx = 00 01 xx xx
+
+        shr         ebx,    8               // ebx = 01 02 xx xx
+        or          eax,    ebx             // eax = 01 02 02 03
+
+        shl         eax,    8               // eax = xx 01 02 02
+        and         eax,    0xffff0000      // eax = xx xx 02 02
+
+        or          eax,    ebx             // eax = 01 02 02 02
+
+        shl         ebx,    16              // ebx = xx xx 01 02
+        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
+
+        or          ebx,    ecx             // ebx = 00 01 01 02
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+        movd        mm0,    ebx             // mm0 = 00 01 01 02
+        pmullw      mm1,    mm6             //
+
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw      mm0,    mm5             //
+
+        mov         [edi],  ebx             // writeoutput 00 xx xx xx
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm4
+        psrlw       mm0,    8
+
+        packuswb    mm0,    mm7
+        movd        DWORD Ptr [edi+1], mm0
+
+        pop ebx
+
+    }
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_4_5_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void)dest_width;
+
+    __asm
+    {
+
+        mov         esi,    source
+        mov         edi,    dest
+
+        mov         ecx,    source_width
+        lea         edx,    [esi+ecx-8];
+
+        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
+        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
+
+        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor        mm7,    mm7             // clear mm7
+
+        horiz_line_4_5_loop:
+
+        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
+
+        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
+
+        paddw       mm0,    mm1             // added round values
+        paddw       mm0,    mm4
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        add         edi,    10
+
+        add         esi,    8
+        paddw       mm2,    mm3             //
+
+        paddw       mm2,    mm4             // added round values
+        cmp         esi,    edx
+
+        psrlw       mm2,    8
+        packuswb    mm2,    mm7
+
+        movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
+        jl         horiz_line_4_5_loop
+
+//Exit:
+        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
+        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
+
+        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
+        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
+
+        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
+        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
+
+        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
+        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
+
+        movq        mm3,    mm1
+
+        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
+        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
+
+        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
+        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
+        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
+
+        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
+
+        paddw       mm0,    mm1             // added round values
+        paddw       mm0,    mm4
+
+        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
+
+        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
+        paddw       mm2,    mm3             //
+
+        paddw       mm2,    mm4             // added round values
+        psrlw       mm2,    8
+
+        packuswb    mm2,    mm7
+        movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
+
+
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has a "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // tow lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constan
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+
+        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
+
+        movq        mm5,    four_fifths              // mm5 = 4/5
+        pmullw      mm1,    mm5                     // d * 4/5
+
+        movq        mm6,    one_fifth                // mm6 = 1/5
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm3,    mm5                     // d * 4/5
+        punpcklbw   mm0,    mm7                     // unpack low
+
+        pmullw      mm0,    mm6                     // an * 1/5
+        punpckhbw   mm2,    mm7                     // unpack high
+
+        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
+        pmullw      mm2,    mm6                     // an * 1/5
+
+        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
+
+        add         edi,    8
+        add         esi,    8
+
+        sub         edx,    8
+        jg         vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_4_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // tow lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        last_vs_4_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    one_fifth
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 1/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 1/5
+        movq        mm6,    four_fifths               // constan
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 4/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 4/5
+        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
+
+        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm5,    two_fifths
+        movq        mm2,    mm0                     // make a copy
+
+        pmullw      mm1,    mm5                     // b * 2/5
+        movq        mm6,    three_fifths
+
+
+        punpcklbw   mm0,    mm7                     // unpack low to word
+        pmullw      mm3,    mm5                     // b * 2/5
+
+        movq        mm4,    mm0                     // make copy of c
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm4,    mm6                     // c * 3/5
+        movq        mm5,    mm2
+
+        pmullw      mm5,    mm6                     // c * 3/5
+        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
+
+        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
+        paddw       mm1,    round_values             // + 128
+
+        paddw       mm3,    round_values             // + 128
+        psrlw       mm1,    8
+
+        psrlw       mm3,    8
+        packuswb    mm1,    mm3                     // des[2]
+
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+        movq        mm1,    [edi]                   // mm1=Src[3];
+
+        movq        QWORD ptr [edi+ecx], mm1        // write des[4];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw      mm0,    mm6                     // c * 3/5
+        movq        mm5,    two_fifths               // mm5 = 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        pmullw      mm2,    mm6                     // c * 3/5
+
+        punpcklbw   mm1,    mm7                     // unpack low
+        movq        mm4,    mm1                     // make a copy
+
+        punpckhbw   mm3,    mm7                     // unpack high
+        pmullw      mm4,    mm5                     // d * 2/5
+
+        movq        mm6,    mm3                     // make a copy
+        pmullw      mm6,    mm5                     // d * 2/5
+
+        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
+        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
+
+        paddw       mm0,    round_values             // + 128
+        paddw       mm2,    round_values             // + 128
+
+        psrlw       mm0,    8
+        psrlw       mm2,    8
+
+        packuswb    mm0,    mm2                     // des[3]
+        movq        QWORD ptr [edi], mm0            // write des[3]
+
+        //  mm1, mm3 --- Src[3]
+        //  mm7 -- cleared for unpacking
+        add         edi,    8
+        add         esi,    8
+
+        sub         edx,    8
+        jg          last_vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // tow lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[3]
+
+        pxor        mm7,    mm7                     // clear mm7 for unpacking
+        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
+
+        movq        mm5,    three_fifths             // mm5 = 3/5
+        pmullw      mm0,    mm5                     // d * 3/5
+
+        movq        mm6,    two_fifths                // mm6 = 2/5
+        movq        mm3,    mm1                     // make a copy
+
+        pmullw      mm2,    mm5                     // d * 3/5
+        punpcklbw   mm1,    mm7                     // unpack low
+
+        pmullw      mm1,    mm6                     // an * 2/5
+        punpckhbw   mm3,    mm7                     // unpack high
+
+        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
+        pmullw      mm3,    mm6                     // an * 2/5
+
+        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des[4]
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
+
+        add         edi,    8
+        add         esi,    8
+
+        sub         edx,    8
+        jg          vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_3_5_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        lea         edi,    [esi+ecx*2]             // tow lines below
+        add         edi,    ecx                     // three lines below
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+
+        last_vs_3_5_loop:
+
+        movq        mm0,    QWORD ptr [esi]         // src[0];
+        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
+
+        movq        mm2,    mm0                     // Make a copy
+        punpcklbw   mm0,    mm7                     // unpack low to word
+
+        movq        mm5,    two_fifths               // mm5 = 2/5
+        punpckhbw   mm2,    mm7                     // unpack high to word
+
+        pmullw      mm0,    mm5                     // a * 2/5
+
+        movq        mm3,    mm1                     // make a copy
+        punpcklbw   mm1,    mm7                     // unpack low to word
+
+        pmullw      mm2,    mm5                     // a * 2/5
+        movq        mm6,    three_fifths             // mm6 = 3/5
+
+        movq        mm4,    mm1                     // copy of low b
+        pmullw      mm4,    mm6                     // b * 3/5
+
+        punpckhbw   mm3,    mm7                     // unpack high to word
+        movq        mm5,    mm3                     // copy of high b
+
+        pmullw      mm5,    mm6                     // b * 3/5
+        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
+
+        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
+        paddw       mm0,    round_values             // + 128
+
+        paddw       mm2,    round_values             // + 128
+        psrlw       mm0,    8
+
+        psrlw       mm2,    8
+        packuswb    mm0,    mm2                     // des [1]
+
+        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
+        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
+
+
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq        mm4,    mm1                     // b low
+        pmullw      mm1,    four_fifths              // b * 4/5 low
+
+        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
+
+        movq        mm5,    mm3                     // b high
+        pmullw      mm3,    four_fifths              // b * 4/5 high
+
+        movq        mm2,    mm0                     // c
+        pmullw      mm4,    one_fifth                // b * 1/5
+
+        punpcklbw   mm0,    mm7                     // c low
+        pmullw      mm5,    one_fifth                // b * 1/5
+
+        movq        mm6,    mm0                     // make copy of c low
+        punpckhbw   mm2,    mm7                     // c high
+
+        pmullw      mm6,    one_fifth                // c * 1/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    one_fifth                // c * 1/5 high
+        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
+
+        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
+        movq        mm6,    mm0                     // make copy of c low
+
+        pmullw      mm6,    four_fifths              // c * 4/5 low
+        movq        mm7,    mm2                     // make copy of c high
+
+        pmullw      mm7,    four_fifths              // c * 4/5 high
+
+        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
+        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
+
+        paddw       mm1,    round_values             // + 128
+        paddw       mm3,    round_values             // + 128
+
+        psrlw       mm1,    8
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm3                     // des[2]
+        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
+
+        paddw       mm4,    round_values             // + 128
+        paddw       mm5,    round_values             // + 128
+
+        psrlw       mm4,    8
+        psrlw       mm5,    8
+
+        packuswb    mm4,    mm5                     // des[3]
+        movq        QWORD ptr [edi], mm4            // write des[3]
+
+        //  mm0, mm2 --- Src[3]
+
+        add         edi,    8
+        add         esi,    8
+
+        sub         edx,    8
+        jg          last_vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        pxor        mm7,    mm7                     // clear out mm7
+        mov         edx,    dest_width               // Loop counter
+
+        vs_1_2_loop:
+
+        movq        mm0,    [esi]                   // get Src[0]
+        movq        mm1,    [esi + ecx * 2]         // get Src[1]
+
+        movq        mm2,    mm0                     // make copy before unpack
+        movq        mm3,    mm1                     // make copy before unpack
+
+        punpcklbw   mm0,    mm7                     // low Src[0]
+        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
+
+        punpcklbw   mm1,    mm7                     // low Src[1]
+        paddw       mm0,    mm1                     // low (a + b)
+
+        punpckhbw   mm2,    mm7                     // high Src[0]
+        paddw       mm0,    mm6                     // low (a + b + 1)
+
+        punpckhbw   mm3,    mm7
+        paddw       mm2,    mm3                     // high (a + b )
+
+        psraw       mm0,    1                       // low (a + b +1 )/2
+        paddw       mm2,    mm6                     // high (a + b + 1)
+
+        psraw       mm2,    1                       // high (a + b + 1)/2
+        packuswb    mm0,    mm2                     // pack results
+
+        movq        [esi+ecx], mm0                  // write out eight bytes
+        add         esi,    8
+
+        sub         edx,    8
+        jg          vs_1_2_loop
+    }
+
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : last_vertical_band_1_2_scale_mmx
+ *
+ *  INPUTS        : unsigned char *dest    :
+ *                  unsigned int dest_pitch :
+ *                  unsigned int dest_width :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
+ *
+ *  SPECIAL NOTES : The routine uses the first line of the band below
+ *                  the current band. The function also has an "C" only
+ *                  version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_1_2_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov         esi,    dest                    // Get the source and destination pointer
+        mov         ecx,    dest_pitch               // Get the pitch size
+
+        mov         edx,    dest_width               // Loop counter
+
+        last_vs_1_2_loop:
+
+        movq        mm0,    [esi]                   // get Src[0]
+        movq        [esi+ecx], mm0                  // write out eight bytes
+
+        add         esi,    8
+        sub         edx,    8
+
+        jg         last_vs_1_2_loop
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_1_2_scale
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_1_2_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        pxor        mm7,    mm7
+        movq        mm6,    four_ones
+
+        mov         ecx,    source_width
+
+        hs_1_2_loop:
+
+        movq        mm0,    [esi]
+        movq        mm1,    [esi+1]
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        movq        mm4,    mm0
+        punpcklbw   mm0,    mm7
+
+        punpcklbw   mm1,    mm7
+        paddw       mm0,    mm1
+
+        paddw       mm0,    mm6
+        punpckhbw   mm2,    mm7
+
+        punpckhbw   mm3,    mm7
+        paddw       mm2,    mm3
+
+        paddw       mm2,    mm6
+        psraw       mm0,    1
+
+        psraw       mm2,    1
+        packuswb    mm0,    mm2
+
+        movq        mm2,    mm4
+        punpcklbw   mm2,    mm0
+
+        movq        [edi],  mm2
+        punpckhbw   mm4,    mm0
+
+        movq        [edi+8], mm4
+        add         esi,    8
+
+        add         edi,    16
+        sub         ecx,    8
+
+        cmp         ecx,    8
+        jg          hs_1_2_loop
+
+// last eight pixel
+
+        movq        mm0,    [esi]
+        movq        mm1,    mm0
+
+        movq        mm2,    mm0
+        movq        mm3,    mm1
+
+        psrlq       mm1,    8
+        psrlq       mm3,    56
+
+        psllq       mm3,    56
+        por         mm1,    mm3
+
+        movq        mm3,    mm1
+        movq        mm4,    mm0
+
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+
+        paddw       mm0,    mm1
+        paddw       mm0,    mm6
+
+        punpckhbw   mm2,    mm7
+        punpckhbw   mm3,    mm7
+
+        paddw       mm2,    mm3
+        paddw       mm2,    mm6
+
+        psraw       mm0,    1
+        psraw       mm2,    1
+
+        packuswb    mm0,    mm2
+        movq        mm2,    mm4
+
+        punpcklbw   mm2,    mm0
+        movq        [edi],  mm2
+
+        punpckhbw   mm4,    mm0
+        movq        [edi+8], mm4
+    }
+}
+
+
+
+
+
+__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
+__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_5_4_scale_mmx
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Stride of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling up by 4 to 5.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_5_4_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    /*
+    unsigned i;
+    unsigned int a, b, c, d, e;
+    unsigned char *des = dest;
+    const unsigned char *src = source;
+
+    (void) dest_width;
+
+    for ( i=0; i<source_width; i+=5 )
+    {
+        a = src[0];
+        b = src[1];
+        c = src[2];
+        d = src[3];
+        e = src[4];
+
+        des[0] = a;
+        des[1] = ((b*192 + c* 64 + 128)>>8);
+        des[2] = ((c*128 + d*128 + 128)>>8);
+        des[3] = ((d* 64 + e*192 + 128)>>8);
+
+        src += 5;
+        des += 4;
+    }
+    */
+    (void) dest_width;
+
+    __asm
+    {
+
+        mov         esi,        source              ;
+        mov         edi,        dest                ;
+
+        mov         ecx,        source_width         ;
+        movq        mm5,        const54_1           ;
+
+        pxor        mm7,        mm7                 ;
+        movq        mm6,        const54_2           ;
+
+        movq        mm4,        round_values         ;
+        lea         edx,        [esi+ecx]           ;
+        horizontal_line_5_4_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psrlq       mm0,        8                   ;
+        01 02 03 04 05 06 07 xx
+        punpcklbw   mm1,        mm7                 ;
+        xx 00 xx 01 xx 02 xx 03
+
+        punpcklbw   mm0,        mm7                 ;
+        xx 01 xx 02 xx 03 xx 04
+        pmullw      mm1,        mm5
+
+        pmullw      mm0,        mm6
+        add         esi,        5
+
+        add         edi,        4
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-4], mm1
+
+        jl          horizontal_line_5_4_loop
+
+    }
+
+}
+__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
+__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
+__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
+
+static
+void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+    __asm
+    {
+        push        ebx
+
+        mov         esi,    source                    // Get the source and destination pointer
+        mov         ecx,    src_pitch               // Get the pitch size
+
+        mov         edi,    dest                    // tow lines below
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch               // Loop counter
+        mov         ebx,    dest_width
+
+        vs_5_4_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0];
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]
+        lea         eax,    [esi+ecx*2]             //
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        movq        mm3,    mm2
+        pmullw      mm1,    three_fourths
+
+        pmullw      mm2,    one_fourths
+        movd        mm4,    [eax+ecx]
+
+        pmullw      mm3,    two_fourths
+        punpcklbw   mm4,    mm7
+
+        movq        mm5,    mm4
+        pmullw      mm4,    two_fourths
+
+        paddw       mm1,    mm2
+        movd        mm6,    [eax+ecx*2]
+
+        pmullw      mm5,    one_fourths
+        paddw       mm1,    round_values;
+
+        paddw       mm3,    mm4
+        psrlw       mm1,    8
+
+        punpcklbw   mm6,    mm7
+        paddw       mm3,    round_values
+
+        pmullw      mm6,    three_fourths
+        psrlw       mm3,    8
+
+        packuswb    mm1,    mm7
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi], mm0
+        movd        DWORD PTR [edi+edx], mm1
+
+
+        paddw       mm5,    mm6
+        movd        DWORD PTR [edi+edx*2], mm3
+
+        lea         eax,    [edi+edx*2]
+        paddw       mm5,    round_values
+
+        psrlw       mm5,    8
+        add         edi,    4
+
+        packuswb    mm5,    mm7
+        movd        DWORD PTR [eax+edx], mm5
+
+        add         esi,    4
+        sub         ebx,    4
+
+        jg         vs_5_4_loop
+
+        pop         ebx
+    }
+}
+
+
+__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
+__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
+
+
+static
+void horizontal_line_5_3_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+
+    (void) dest_width;
+    __asm
+    {
+
+        mov         esi,        source              ;
+        mov         edi,        dest                ;
+
+        mov         ecx,        source_width         ;
+        movq        mm5,        const53_1           ;
+
+        pxor        mm7,        mm7                 ;
+        movq        mm6,        const53_2           ;
+
+        movq        mm4,        round_values         ;
+        lea         edx,        [esi+ecx-5]         ;
+        horizontal_line_5_3_loop:
+
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ;
+        xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ;
+        01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ;
+        00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ;
+        xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        add         esi,        5
+
+        add         edi,        3
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        cmp         esi,        edx
+        packuswb    mm1,        mm7
+
+        movd        DWORD PTR [edi-3], mm1
+        jl          horizontal_line_5_3_loop
+
+//exit condition
+        movq        mm0,        QWORD PTR  [esi]    ;
+        00 01 02 03 04 05 06 07
+        movq        mm1,        mm0                 ;
+        00 01 02 03 04 05 06 07
+
+        psllw       mm0,        8                   ;
+        xx 00 xx 02 xx 04 xx 06
+        psrlw       mm1,        8                   ;
+        01 xx 03 xx 05 xx 07 xx
+
+        psrlw       mm0,        8                   ;
+        00 xx 02 xx 04 xx 06 xx
+        psllq       mm1,        16                  ;
+        xx xx 01 xx 03 xx 05 xx
+
+        pmullw      mm0,        mm6
+
+        pmullw      mm1,        mm5
+        paddw       mm1,        mm0
+
+        paddw       mm1,        mm4
+        psrlw       mm1,        8
+
+        packuswb    mm1,        mm7
+        movd        eax,        mm1
+
+        mov         edx,        eax
+        shr         edx,        16
+
+        mov         WORD PTR[edi],   ax
+        mov         BYTE PTR[edi+2], dl
+
+    }
+
+}
+
+__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
+__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
+
+static
+void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+    __asm
+    {
+        push        ebx
+
+        mov         esi,    source                    // Get the source and destination pointer
+        mov         ecx,    src_pitch               // Get the pitch size
+
+        mov         edi,    dest                    // tow lines below
+        pxor        mm7,    mm7                     // clear out mm7
+
+        mov         edx,    dest_pitch               // Loop counter
+        movq        mm5,    one_thirds
+
+        movq        mm6,    two_thirds
+        mov         ebx,    dest_width;
+
+        vs_5_3_loop:
+
+        movd        mm0,    DWORD ptr [esi]         // src[0];
+        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
+
+        movd        mm2,    DWORD ptr [esi+ecx*2]
+        lea         eax,    [esi+ecx*2]             //
+
+        punpcklbw   mm1,    mm7
+        punpcklbw   mm2,    mm7
+
+        pmullw      mm1,    mm5
+        pmullw      mm2,    mm6
+
+        movd        mm3,    DWORD ptr [eax+ecx]
+        movd        mm4,    DWORD ptr [eax+ecx*2]
+
+        punpcklbw   mm3,    mm7
+        punpcklbw   mm4,    mm7
+
+        pmullw      mm3,    mm6
+        pmullw      mm4,    mm5
+
+
+        movd        DWORD PTR [edi], mm0
+        paddw       mm1,    mm2
+
+        paddw       mm1,    round_values
+        psrlw       mm1,    8
+
+        packuswb    mm1,    mm7
+        paddw       mm3,    mm4
+
+        paddw       mm3,    round_values
+        movd        DWORD PTR [edi+edx], mm1
+
+        psrlw       mm3,    8
+        packuswb    mm3,    mm7
+
+        movd        DWORD PTR [edi+edx*2], mm3
+
+
+        add         edi,    4
+        add         esi,    4
+
+        sub         ebx,    4
+        jg          vs_5_3_loop
+
+        pop         ebx
+    }
+}
+
+
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : horizontal_line_2_1_scale
+ *
+ *  INPUTS        : const unsigned char *source :
+ *                  unsigned int source_width    :
+ *                  unsigned char *dest         :
+ *                  unsigned int dest_width      :
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_2_1_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+    (void) source_width;
+    __asm
+    {
+        mov         esi,    source
+        mov         edi,    dest
+
+        pxor        mm7,    mm7
+        mov         ecx,    dest_width
+
+        xor         edx,    edx
+        hs_2_1_loop:
+
+        movq        mm0,    [esi+edx*2]
+        psllw       mm0,    8
+
+        psrlw       mm0,    8
+        packuswb    mm0,    mm7
+
+        movd        DWORD Ptr [edi+edx], mm0;
+        add         edx,    4
+
+        cmp         edx,    ecx
+        jl          hs_2_1_loop
+
+    }
+}
+
+
+
+static
+void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+    (void) dest_pitch;
+    (void) src_pitch;
+    vpx_memcpy(dest, source, dest_width);
+}
+
+
+__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
+__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
+
+static
+void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
+{
+
+    (void) dest_pitch;
+    __asm
+    {
+        mov         esi,        source
+        mov         edi,        dest
+
+        mov         eax,        src_pitch
+        mov         edx,        dest_width
+
+        pxor        mm7,        mm7
+        sub         esi,        eax             //back one line
+
+
+        lea         ecx,        [esi+edx];
+        movq        mm6,        round_values;
+
+        movq        mm5,        three_sixteenths;
+        movq        mm4,        ten_sixteenths;
+
+        vs_2_1_i_loop:
+        movd        mm0,        [esi]           //
+        movd        mm1,        [esi+eax]       //
+
+        movd        mm2,        [esi+eax*2]     //
+        punpcklbw   mm0,        mm7
+
+        pmullw      mm0,        mm5
+        punpcklbw   mm1,        mm7
+
+        pmullw      mm1,        mm4
+        punpcklbw   mm2,        mm7
+
+        pmullw      mm2,        mm5
+        paddw       mm0,        round_values
+
+        paddw       mm1,        mm2
+        paddw       mm0,        mm1
+
+        psrlw       mm0,        8
+        packuswb    mm0,        mm7
+
+        movd        DWORD PTR [edi],        mm0
+        add         esi,        4
+
+        add         edi,        4;
+        cmp         esi,        ecx
+        jl          vs_2_1_i_loop
+
+    }
+}
+
+
+
+void
+register_mmxscalers(void)
+{
+    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
+    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
+    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
+    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
+    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
+    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
+    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
+    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
+    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
+
+    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
+    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
+    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
+    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
+    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
+    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
+
+
+
+    vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
+    vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
+    vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
+    vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
+    vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
+    vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
+    vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
+
+
+
+
+}