SDL2_gfx  1.0.2
Graphics primitives and surface functions for SDL2
SDL2_imageFilter.c
1 /*
2 
3 SDL2_imageFilter.c: byte-image "filter" routines
4 
5 Copyright (C) 2012-2014 Andreas Schiffler
6 Copyright (C) 2013 Sylvain Beucler
7 
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the authors be held liable for any damages
10 arising from the use of this software.
11 
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
15 
16  1. The origin of this software must not be misrepresented; you must not
17  claim that you wrote the original software. If you use this software
18  in a product, an acknowledgment in the product documentation would be
19  appreciated but is not required.
20 
21  2. Altered source versions must be plainly marked as such, and must not be
22  misrepresented as being the original software.
23 
24  3. This notice may not be removed or altered from any source
25  distribution.
26 
27 Andreas Schiffler -- aschiffler at ferzkopp dot net
28 
29 */
30 
31 /*
32 
33 Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
34 
35 Note: Most of the MMX code is based on published routines
36 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
37 him for his work.
38 
39 */
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 
45 #include "SDL.h"
46 
47 /* Use GCC intrinsics if available: they support both i386 and x86_64,
48  provide ASM-grade performance, and avoid the PUSHA/POPA issues. */
49 #ifdef __GNUC__
50 # ifdef USE_MMX
51 # include <mmintrin.h>
52 # endif
53 # include <SDL_cpuinfo.h>
54 #endif
55 
56 #include "SDL2_imageFilter.h"
57 
61 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24))
62 
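For illustration only (editor's sketch, not part of the library source): SWAP_32 reverses the byte order of a 32-bit value; SDL_imageFilterAddUint below uses it to build the second half of its MMX constant. The variable names are arbitrary.

unsigned int v = 0x11223344;
unsigned int w = SWAP_32(v);   /* w == 0x44332211 */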
63 /* ------ Static variables ----- */
64 
68 static int SDL_imageFilterUseMMX = 1;
69 
70 /* Detect GCC */
71 #if defined(__GNUC__)
72 #define GCC__
73 #endif
74 
80 int SDL_imageFilterMMXdetect(void)
81 {
82  /* Check override flag */
83  if (SDL_imageFilterUseMMX == 0) {
84  return (0);
85  }
86 
87  return SDL_HasMMX();
88 }
89 
93 void SDL_imageFilterMMXoff()
94 {
95  SDL_imageFilterUseMMX = 0;
96 }
97 
101 void SDL_imageFilterMMXon()
102 {
103  SDL_imageFilterUseMMX = 1;
104 }
105 
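A minimal usage sketch (editor's illustration, not part of this file): the two override functions above let a caller force the portable C loops even when SDL_HasMMX() reports MMX support. The wrapper name and buffer parameters are arbitrary.

#include "SDL2_imageFilter.h"

void add_without_mmx(unsigned char *src1, unsigned char *src2,
                     unsigned char *dest, unsigned int length)
{
	SDL_imageFilterMMXoff();                      /* force the C fallback path */
	SDL_imageFilterAdd(src1, src2, dest, length);
	SDL_imageFilterMMXon();                       /* restore MMX autodetection */
}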
106 /* ------------------------------------------------------------------------------------ */
107 
118 static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
119 {
120 #ifdef USE_MMX
121 #if !defined(GCC__)
122  __asm
123  {
124  pusha
125  mov eax, Src1 /* load Src1 address into eax */
126  mov ebx, Src2 /* load Src2 address into ebx */
127  mov edi, Dest /* load Dest address into edi */
128  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
129  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
130  align 16 /* 16 byte alignment of the loop entry */
131 L1010:
132  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
133  paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */
134  movq [edi], mm1 /* store result in Dest */
135  add eax, 8 /* increase Src1, Src2 and Dest */
136  add ebx, 8 /* register pointers by 8 */
137  add edi, 8
138  dec ecx /* decrease loop counter */
139  jnz L1010 /* check loop termination, proceed if required */
140  emms /* exit MMX state */
141  popa
142  }
143 #else
144  /* i386 and x86_64 */
145  __m64 *mSrc1 = (__m64*)Src1;
146  __m64 *mSrc2 = (__m64*)Src2;
147  __m64 *mDest = (__m64*)Dest;
148  int i;
149  for (i = 0; i < SrcLength/8; i++) {
150  *mDest = _m_paddusb(*mSrc1, *mSrc2); /* Src1+Src2 (add 8 bytes with saturation) */
151  mSrc1++;
152  mSrc2++;
153  mDest++;
154  }
155  _m_empty(); /* clean MMX state */
156 #endif
157  return (0);
158 #else
159  return (-1);
160 #endif
161 }
162 
173 int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
174 {
175  unsigned int i, istart;
176  unsigned char *cursrc1, *cursrc2, *curdst;
177  int result;
178 
179  /* Validate input parameters */
180  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
181  return(-1);
182  if (length == 0)
183  return(0);
184 
185  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
186 
187  /* Use MMX assembly routine */
188  SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
189 
190  /* Check for unaligned bytes */
191  if ((length & 7) > 0) {
192  /* Setup to process unaligned bytes */
193  istart = length & 0xfffffff8;
194  cursrc1 = &Src1[istart];
195  cursrc2 = &Src2[istart];
196  curdst = &Dest[istart];
197  } else {
198  /* No unaligned bytes - we are done */
199  return (0);
200  }
201  } else {
202  /* Setup to process whole image */
203  istart = 0;
204  cursrc1 = Src1;
205  cursrc2 = Src2;
206  curdst = Dest;
207  }
208 
209  /* C routine to process image */
210  for (i = istart; i < length; i++) {
211  result = (int) *cursrc1 + (int) *cursrc2;
212  if (result > 255)
213  result = 255;
214  *curdst = (unsigned char) result;
215  /* Advance pointers */
216  cursrc1++;
217  cursrc2++;
218  curdst++;
219  }
220 
221  return (0);
222 }
223 
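A short, hedged example of calling SDL_imageFilterAdd (illustrative values only; the header is already included above): every destination byte is the saturated sum of the two source bytes.

void example_add(void)
{
	unsigned char a[8]    = { 10, 200, 255, 0, 128, 60, 90, 250 };
	unsigned char b[8]    = { 20, 100,   1, 0, 200, 60, 90,  10 };
	unsigned char dest[8];

	SDL_imageFilterAdd(a, b, dest, 8);
	/* dest == { 30, 255, 255, 0, 255, 120, 180, 255 } */
}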
235 static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
236  unsigned char *Mask)
237 {
238 #ifdef USE_MMX
239 #if !defined(GCC__)
240  __asm
241  {
242  pusha
243  mov edx, Mask /* load Mask address into edx */
244  movq mm0, [edx] /* load Mask into mm0 */
245  mov eax, Src1 /* load Src1 address into eax */
246  mov ebx, Src2 /* load Src2 address into ebx */
247  mov edi, Dest /* load Dest address into edi */
248  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
249  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
250  align 16 /* 16 byte alignment of the loop entry */
251 L21011:
252  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
253  movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
254  /* --- Byte shift via Word shift --- */
255  psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */
256  psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */
257  pand mm1, mm0 /* apply Mask to 8 BYTES of mm1 */
258  /* byte 0x0f, 0xdb, 0xc8 */
259  pand mm2, mm0 /* apply Mask to 8 BYTES of mm2 */
260  /* byte 0x0f, 0xdb, 0xd0 */
261  paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */
262  movq [edi], mm1 /* store result in Dest */
263  add eax, 8 /* increase Src1, Src2 and Dest */
264  add ebx, 8 /* register pointers by 8 */
265  add edi, 8
266  dec ecx /* decrease loop counter */
267  jnz L21011 /* check loop termination, proceed if required */
268  emms /* exit MMX state */
269  popa
270  }
271 #else
272  /* i386 and x86_64 */
273  __m64 *mSrc1 = (__m64*)Src1;
274  __m64 *mSrc2 = (__m64*)Src2;
275  __m64 *mDest = (__m64*)Dest;
276  __m64 *mMask = (__m64*)Mask;
277  int i;
278  for (i = 0; i < SrcLength/8; i++) {
279  __m64 mm1 = *mSrc1,
280  mm2 = *mSrc2;
281  mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of mm1 1 bit to the right */
282  mm2 = _m_psrlwi(mm2, 1); /* shift 4 WORDS of mm2 1 bit to the right */
283  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of mm1 */
284  mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of mm2 */
285  *mDest = _m_paddusb(mm1, mm2); /* mm1+mm2 (add 8 bytes with saturation) */
286  mSrc1++;
287  mSrc2++;
288  mDest++;
289  }
290  _m_empty(); /* clean MMX state */
291 #endif
292  return (0);
293 #else
294  return (-1);
295 #endif
296 }
297 
308 int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
309 {
310  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
311  unsigned int i, istart;
312  unsigned char *cursrc1, *cursrc2, *curdst;
313  int result;
314 
315  /* Validate input parameters */
316  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
317  return(-1);
318  if (length == 0)
319  return(0);
320 
321  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
322  /* MMX routine */
323  SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
324 
325  /* Check for unaligned bytes */
326  if ((length & 7) > 0) {
327  /* Setup to process unaligned bytes */
328  istart = length & 0xfffffff8;
329  cursrc1 = &Src1[istart];
330  cursrc2 = &Src2[istart];
331  curdst = &Dest[istart];
332  } else {
333  /* No unaligned bytes - we are done */
334  return (0);
335  }
336  } else {
337  /* Setup to process whole image */
338  istart = 0;
339  cursrc1 = Src1;
340  cursrc2 = Src2;
341  curdst = Dest;
342  }
343 
344  /* C routine to process image */
345  for (i = istart; i < length; i++) {
346  result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
347  *curdst = (unsigned char) result;
348  /* Advance pointers */
349  cursrc1++;
350  cursrc2++;
351  curdst++;
352  }
353 
354  return (0);
355 }
356 
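One detail worth noting (editor's note, values illustrative): both the MMX path and the C path above compute Src1/2 + Src2/2, so each operand is truncated before the addition and the result can be one less than the exact rounded mean.

unsigned char a = 3, b = 3;
unsigned char mean = (unsigned char)(a / 2 + b / 2);   /* == 2, not 3 */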
367 static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
368 {
369 #ifdef USE_MMX
370 #if !defined(GCC__)
371  __asm
372  {
373  pusha
374  mov eax, Src1 /* load Src1 address into eax */
375  mov ebx, Src2 /* load Src2 address into ebx */
376  mov edi, Dest /* load Dest address into edi */
377  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
378  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
379  align 16 /* 16 byte alignment of the loop entry */
380 L1012:
381  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
382  psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
383  movq [edi], mm1 /* store result in Dest */
384  add eax, 8 /* increase Src1, Src2 and Dest */
385  add ebx, 8 /* register pointers by 8 */
386  add edi, 8
387  dec ecx /* decrease loop counter */
388  jnz L1012 /* check loop termination, proceed if required */
389  emms /* exit MMX state */
390  popa
391  }
392 #else
393  /* i386 and x86_64 */
394  __m64 *mSrc1 = (__m64*)Src1;
395  __m64 *mSrc2 = (__m64*)Src2;
396  __m64 *mDest = (__m64*)Dest;
397  int i;
398  for (i = 0; i < SrcLength/8; i++) {
399  *mDest = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */
400  mSrc1++;
401  mSrc2++;
402  mDest++;
403  }
404  _m_empty(); /* clean MMX state */
405 #endif
406  return (0);
407 #else
408  return (-1);
409 #endif
410 }
411 
422 int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
423 {
424  unsigned int i, istart;
425  unsigned char *cursrc1, *cursrc2, *curdst;
426  int result;
427 
428  /* Validate input parameters */
429  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
430  return(-1);
431  if (length == 0)
432  return(0);
433 
434  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
435  /* MMX routine */
436  SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
437 
438  /* Check for unaligned bytes */
439  if ((length & 7) > 0) {
440  /* Setup to process unaligned bytes */
441  istart = length & 0xfffffff8;
442  cursrc1 = &Src1[istart];
443  cursrc2 = &Src2[istart];
444  curdst = &Dest[istart];
445  } else {
446  /* No unaligned bytes - we are done */
447  return (0);
448  }
449  } else {
450  /* Setup to process whole image */
451  istart = 0;
452  cursrc1 = Src1;
453  cursrc2 = Src2;
454  curdst = Dest;
455  }
456 
457  /* C routine to process image */
458  for (i = istart; i < length; i++) {
459  result = (int) *cursrc1 - (int) *cursrc2;
460  if (result < 0)
461  result = 0;
462  *curdst = (unsigned char) result;
463  /* Advance pointers */
464  cursrc1++;
465  cursrc2++;
466  curdst++;
467  }
468 
469  return (0);
470 }
471 
482 static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
483 {
484 #ifdef USE_MMX
485 #if !defined(GCC__)
486  __asm
487  {
488  pusha
489  mov eax, Src1 /* load Src1 address into eax */
490  mov ebx, Src2 /* load Src2 address into ebx */
491  mov edi, Dest /* load Dest address into edi */
492  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
493  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
494  align 16 /* 16 byte alignment of the loop entry */
495 L1013:
496  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
497  movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
498  psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
499  psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
500  por mm1, mm2 /* combine both mm2 and mm1 results */
501  movq [edi], mm1 /* store result in Dest */
502  add eax, 8 /* increase Src1, Src2 and Dest */
503  add ebx, 8 /* register pointers by 8 */
504  add edi, 8
505  dec ecx /* decrease loop counter */
506  jnz L1013 /* check loop termination, proceed if required */
507  emms /* exit MMX state */
508  popa
509  }
510 #else
511  /* i386 and x86_64 */
512  __m64 *mSrc1 = (__m64*)Src1;
513  __m64 *mSrc2 = (__m64*)Src2;
514  __m64 *mDest = (__m64*)Dest;
515  int i;
516  for (i = 0; i < SrcLength/8; i++) {
517  __m64 mm1 = _m_psubusb(*mSrc2, *mSrc1); /* Src2-Src1 (sub 8 bytes with saturation) */
518  __m64 mm2 = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */
519  *mDest = _m_por(mm1, mm2); /* combine both mm2 and mm1 results */
520  mSrc1++;
521  mSrc2++;
522  mDest++;
523  }
524  _m_empty(); /* clean MMX state */
525 #endif
526  return (0);
527 #else
528  return (-1);
529 #endif
530 }
531 
542 int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
543 {
544  unsigned int i, istart;
545  unsigned char *cursrc1, *cursrc2, *curdst;
546  int result;
547 
548  /* Validate input parameters */
549  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
550  return(-1);
551  if (length == 0)
552  return(0);
553 
554  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
555  /* MMX routine */
556  SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
557 
558  /* Check for unaligned bytes */
559  if ((length & 7) > 0) {
560  /* Setup to process unaligned bytes */
561  istart = length & 0xfffffff8;
562  cursrc1 = &Src1[istart];
563  cursrc2 = &Src2[istart];
564  curdst = &Dest[istart];
565  } else {
566  /* No unaligned bytes - we are done */
567  return (0);
568  }
569  } else {
570  /* Setup to process whole image */
571  istart = 0;
572  cursrc1 = Src1;
573  cursrc2 = Src2;
574  curdst = Dest;
575  }
576 
577  /* C routine to process image */
578  for (i = istart; i < length; i++) {
579  result = abs((int) *cursrc1 - (int) *cursrc2);
580  *curdst = (unsigned char) result;
581  /* Advance pointers */
582  cursrc1++;
583  cursrc2++;
584  curdst++;
585  }
586 
587  return (0);
588 }
589 
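The absolute-difference trick used above deserves a plain-C restatement (editor's sketch, arbitrary values): with unsigned saturated subtraction, one of (a-b) and (b-a) clamps to 0, so OR-ing the two terms yields |a-b|.

unsigned char a = 30, b = 200;
unsigned char d1 = (a > b) ? (unsigned char)(a - b) : 0;   /* saturated a-b == 0   */
unsigned char d2 = (b > a) ? (unsigned char)(b - a) : 0;   /* saturated b-a == 170 */
unsigned char absdiff = d1 | d2;                           /* 170 == |30 - 200|    */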
600 static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
601 {
602 #ifdef USE_MMX
603 #if !defined(GCC__)
604  __asm
605  {
606  pusha
607  mov eax, Src1 /* load Src1 address into eax */
608  mov ebx, Src2 /* load Src2 address into ebx */
609  mov edi, Dest /* load Dest address into edi */
610  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
611  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
612  pxor mm0, mm0 /* zero mm0 register */
613  align 16 /* 16 byte alignment of the loop entry */
614 L1014:
615  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
616  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
617  movq mm2, mm1 /* copy mm1 into mm2 */
618  movq mm4, mm3 /* copy mm3 into mm4 */
619  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
620  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
621  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
622  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
623  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
624  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
625  /* Take abs value of the results (signed words) */
626  movq mm5, mm1 /* copy mm1 into mm5 */
627  movq mm6, mm2 /* copy mm2 into mm6 */
628  psraw mm5, 15 /* fill mm5 words with word sign bit */
629  psraw mm6, 15 /* fill mm6 words with word sign bit */
630  pxor mm1, mm5 /* take 1's complement of only neg. words */
631  pxor mm2, mm6 /* take 1's complement of only neg. words */
632  psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */
633  psubsw mm2, mm6 /* add 1 to only neg. words, W-(-1) or W-0 */
634  packuswb mm1, mm2 /* pack words back into bytes with saturation */
635  movq [edi], mm1 /* store result in Dest */
636  add eax, 8 /* increase Src1, Src2 and Dest */
637  add ebx, 8 /* register pointers by 8 */
638  add edi, 8
639  dec ecx /* decrease loop counter */
640  jnz L1014 /* check loop termination, proceed if required */
641  emms /* exit MMX state */
642  popa
643  }
644 #else
645  /* i386 ASM with constraints: */
646  /* asm volatile ( */
647  /* "shr $3, %%ecx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
648  /* "pxor %%mm0, %%mm0 \n\t" /\* zero mm0 register *\/ */
649  /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
650  /* "1: movq (%%eax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
651  /* "movq (%%ebx), %%mm3 \n\t" /\* load 8 bytes from Src2 into mm3 *\/ */
652  /* "movq %%mm1, %%mm2 \n\t" /\* copy mm1 into mm2 *\/ */
653  /* "movq %%mm3, %%mm4 \n\t" /\* copy mm3 into mm4 *\/ */
654  /* "punpcklbw %%mm0, %%mm1 \n\t" /\* unpack low bytes of Src1 into words *\/ */
655  /* "punpckhbw %%mm0, %%mm2 \n\t" /\* unpack high bytes of Src1 into words *\/ */
656  /* "punpcklbw %%mm0, %%mm3 \n\t" /\* unpack low bytes of Src2 into words *\/ */
657  /* "punpckhbw %%mm0, %%mm4 \n\t" /\* unpack high bytes of Src2 into words *\/ */
658  /* "pmullw %%mm3, %%mm1 \n\t" /\* mul low bytes of Src1 and Src2 *\/ */
659  /* "pmullw %%mm4, %%mm2 \n\t" /\* mul high bytes of Src1 and Src2 *\/ */
660  /* /\* Take abs value of the results (signed words) *\/ */
661  /* "movq %%mm1, %%mm5 \n\t" /\* copy mm1 into mm5 *\/ */
662  /* "movq %%mm2, %%mm6 \n\t" /\* copy mm2 into mm6 *\/ */
663  /* "psraw $15, %%mm5 \n\t" /\* fill mm5 words with word sign bit *\/ */
664  /* "psraw $15, %%mm6 \n\t" /\* fill mm6 words with word sign bit *\/ */
665  /* "pxor %%mm5, %%mm1 \n\t" /\* take 1's complement of only neg. words *\/ */
666  /* "pxor %%mm6, %%mm2 \n\t" /\* take 1's complement of only neg. words *\/ */
667  /* "psubsw %%mm5, %%mm1 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
668  /* "psubsw %%mm6, %%mm2 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
669  /* "packuswb %%mm2, %%mm1 \n\t" /\* pack words back into bytes with saturation *\/ */
670  /* "movq %%mm1, (%%edi) \n\t" /\* store result in Dest *\/ */
671  /* "add $8, %%eax \n\t" /\* increase Src1, Src2 and Dest *\/ */
672  /* "add $8, %%ebx \n\t" /\* register pointers by 8 *\/ */
673  /* "add $8, %%edi \n\t" */
674  /* "dec %%ecx \n\t" /\* decrease loop counter *\/ */
675  /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
676  /* "emms \n\t" /\* exit MMX state *\/ */
677  /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
678  /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
679  /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
680  /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
681  /* : */
682  /* : "memory", /\* *Dest is modified *\/ */
683  /* "mm0","mm1","mm2","mm3","mm4","mm5","mm6" /\* registers modified *\/ */
684  /* ); */
685 
686  /* i386 and x86_64 */
687  __m64 *mSrc1 = (__m64*)Src1;
688  __m64 *mSrc2 = (__m64*)Src2;
689  __m64 *mDest = (__m64*)Dest;
690  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
691  int i;
692  for (i = 0; i < SrcLength/8; i++) {
693  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
694  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
695  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
696  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
697  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
698  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
699  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
700  mm5 = _m_psrawi(mm1, 15); /* fill mm5 words with word sign bit */
701  mm6 = _m_psrawi(mm2, 15); /* fill mm6 words with word sign bit */
702  mm1 = _m_pxor(mm1, mm5); /* take 1's complement of only neg. words */
703  mm2 = _m_pxor(mm2, mm6); /* take 1's complement of only neg. words */
704  mm1 = _m_psubsw(mm1, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
705  mm2 = _m_psubsw(mm2, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
706  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
707  mSrc1++;
708  mSrc2++;
709  mDest++;
710  }
711  _m_empty(); /* clean MMX state */
712 #endif
713  return (0);
714 #else
715  return (-1);
716 #endif
717 }
718 
729 int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
730 {
731  unsigned int i, istart;
732  unsigned char *cursrc1, *cursrc2, *curdst;
733  int result;
734 
735  /* Validate input parameters */
736  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
737  return(-1);
738  if (length == 0)
739  return(0);
740 
741  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
742  /* MMX routine */
743  SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
744 
745  /* Check for unaligned bytes */
746  if ((length & 7) > 0) {
747  /* Setup to process unaligned bytes */
748  istart = length & 0xfffffff8;
749  cursrc1 = &Src1[istart];
750  cursrc2 = &Src2[istart];
751  curdst = &Dest[istart];
752  } else {
753  /* No unaligned bytes - we are done */
754  return (0);
755  }
756  } else {
757  /* Setup to process whole image */
758  istart = 0;
759  cursrc1 = Src1;
760  cursrc2 = Src2;
761  curdst = Dest;
762  }
763 
764  /* C routine to process image */
765  for (i = istart; i < length; i++) {
766 
767  /* NOTE: consistent with the MMX routine above - the 16-bit word product is packed with unsigned saturation, so it clamps at 255 */
768 
769  result = (int) *cursrc1 * (int) *cursrc2;
770  if (result > 255)
771  result = 255;
772  *curdst = (unsigned char) result;
773  /* Advance pointers */
774  cursrc1++;
775  cursrc2++;
776  curdst++;
777  }
778 
779  return (0);
780 }
781 
792 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
793 {
794 #ifdef USE_MMX
795 #if !defined(GCC__)
796  __asm
797  {
798  pusha
799  mov edx, Src1 /* load Src1 address into edx */
800  mov esi, Src2 /* load Src2 address into esi */
801  mov edi, Dest /* load Dest address into edi */
802  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
803  align 16 /* 16 byte alignment of the loop entry */
804 L10141:
805  mov al, [edx] /* load a byte from Src1 */
806  mul byte ptr [esi] /* mul with a byte from Src2 */
807  mov [edi], al /* move a byte result to Dest */
808  inc edx /* increment Src1, Src2, Dest */
809  inc esi /* pointer registers by one */
810  inc edi
811  dec ecx /* decrease loop counter */
812  jnz L10141 /* check loop termination, proceed if required */
813  popa
814  }
815 #else
816  /* Note: ~5% gain on i386, less efficient than C on x86_64 */
817  /* Also depends on whether this function is static (?!) */
818  asm volatile (
819  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
820 # if defined(i386)
821  "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */
822  "mulb (%%esi) \n\t" /* mul with a byte from Src2 */
823  "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
824  "inc %%edx \n\t" /* increment Src1, Src2, Dest */
825  "inc %%esi \n\t" /* pointer registers by one */
826  "inc %%edi \n\t"
827  "dec %%ecx \n\t" /* decrease loop counter */
828 # elif defined(__x86_64__)
829  "1:mov (%%rdx), %%al \n\t" /* load a byte from Src1 */
830  "mulb (%%rsi) \n\t" /* mul with a byte from Src2 */
831  "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
832  "inc %%rdx \n\t" /* increment Src1, Src2, Dest */
833  "inc %%rsi \n\t" /* pointer registers by one */
834  "inc %%rdi \n\t"
835  "dec %%rcx \n\t" /* decrease loop counter */
836 # endif
837  "jnz 1b \n\t" /* check loop termination, proceed if required */
838  : "+d" (Src1), /* load Src1 address into edx */
839  "+S" (Src2), /* load Src2 address into esi */
840  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
841  "+D" (Dest) /* load Dest address into edi */
842  :
843  : "memory", "rax"
844  );
845 #endif
846  return (0);
847 #else
848  return (-1);
849 #endif
850 }
851 
862 int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
863 {
864  unsigned int i, istart;
865  unsigned char *cursrc1, *cursrc2, *curdst;
866 
867  /* Validate input parameters */
868  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
869  return(-1);
870  if (length == 0)
871  return(0);
872 
873  if (SDL_imageFilterMMXdetect()) {
874  if (length > 0) {
875  /* ASM routine */
876  SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
877 
878  /* Check for unaligned bytes */
879  if ((length & 7) > 0) {
880  /* Setup to process unaligned bytes */
881  istart = length & 0xfffffff8;
882  cursrc1 = &Src1[istart];
883  cursrc2 = &Src2[istart];
884  curdst = &Dest[istart];
885  } else {
886  /* No unaligned bytes - we are done */
887  return (0);
888  }
889  } else {
890  /* No bytes - we are done */
891  return (0);
892  }
893  } else {
894  /* Setup to process whole image */
895  istart = 0;
896  cursrc1 = Src1;
897  cursrc2 = Src2;
898  curdst = Dest;
899  }
900 
901  /* C routine to process image */
902  for (i = istart; i < length; i++) {
903  *curdst = (int)*cursrc1 * (int)*cursrc2; // (int) for efficiency
904  /* Advance pointers */
905  cursrc1++;
906  cursrc2++;
907  curdst++;
908  }
909 
910  return (0);
911 }
912 
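To contrast the two multiply variants (editor's sketch with arbitrary values): SDL_imageFilterMult clamps each per-byte product at 255, while SDL_imageFilterMultNor above stores only the low 8 bits, so the product wraps.

unsigned char x = 20, y = 20;                                               /* product is 400 */
unsigned char clamped = (unsigned char)((x * y > 255) ? 255 : x * y);      /* Mult:    255            */
unsigned char wrapped = (unsigned char)(x * y);                            /* MultNor: 400 & 0xFF == 144 */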
923 static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
924 {
925 #ifdef USE_MMX
926 #if !defined(GCC__)
927  __asm
928  {
929  pusha
930  mov eax, Src1 /* load Src1 address into eax */
931  mov ebx, Src2 /* load Src2 address into ebx */
932  mov edi, Dest /* load Dest address into edi */
933  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
934  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
935  pxor mm0, mm0 /* zero mm0 register */
936  align 16 /* 16 byte alignment of the loop entry */
937 L1015:
938  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
939  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
940  movq mm2, mm1 /* copy mm1 into mm2 */
941  movq mm4, mm3 /* copy mm3 into mm4 */
942  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
943  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
944  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
945  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
946  psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
947  psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
948  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
949  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
950  packuswb mm1, mm2 /* pack words back into bytes with saturation */
951  movq [edi], mm1 /* store result in Dest */
952  add eax, 8 /* increase Src1, Src2 and Dest */
953  add ebx, 8 /* register pointers by 8 */
954  add edi, 8
955  dec ecx /* decrease loop counter */
956  jnz L1015 /* check loop termination, proceed if required */
957  emms /* exit MMX state */
958  popa
959  }
960 #else
961  /* i386 and x86_64 */
962  __m64 *mSrc1 = (__m64*)Src1;
963  __m64 *mSrc2 = (__m64*)Src2;
964  __m64 *mDest = (__m64*)Dest;
965  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
966  int i;
967  for (i = 0; i < SrcLength/8; i++) {
968  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
969  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
970  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
971  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
972  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
973  mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
974  mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
975  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
976  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
977  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
978  mSrc1++;
979  mSrc2++;
980  mDest++;
981  }
982  _m_empty(); /* clean MMX state */
983 #endif
984  return (0);
985 #else
986  return (-1);
987 #endif
988 }
989 
1000 int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1001 {
1002  unsigned int i, istart;
1003  unsigned char *cursrc1, *cursrc2, *curdst;
1004  int result;
1005 
1006  /* Validate input parameters */
1007  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1008  return(-1);
1009  if (length == 0)
1010  return(0);
1011 
1012  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1013  /* MMX routine */
1014  SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
1015 
1016  /* Check for unaligned bytes */
1017  if ((length & 7) > 0) {
1018  /* Setup to process unaligned bytes */
1019  istart = length & 0xfffffff8;
1020  cursrc1 = &Src1[istart];
1021  cursrc2 = &Src2[istart];
1022  curdst = &Dest[istart];
1023  } else {
1024  /* No unaligned bytes - we are done */
1025  return (0);
1026  }
1027  } else {
1028  /* Setup to process whole image */
1029  istart = 0;
1030  cursrc1 = Src1;
1031  cursrc2 = Src2;
1032  curdst = Dest;
1033  }
1034 
1035  /* C routine to process image */
1036  for (i = istart; i < length; i++) {
1037  result = ((int) *cursrc1 / 2) * (int) *cursrc2;
1038  if (result > 255)
1039  result = 255;
1040  *curdst = (unsigned char) result;
1041  /* Advance pointers */
1042  cursrc1++;
1043  cursrc2++;
1044  curdst++;
1045  }
1046 
1047  return (0);
1048 }
1049 
1060 static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1061 {
1062 #ifdef USE_MMX
1063 #if !defined(GCC__)
1064  __asm
1065  {
1066  pusha
1067  mov eax, Src1 /* load Src1 address into eax */
1068  mov ebx, Src2 /* load Src2 address into ebx */
1069  mov edi, Dest /* load Dest address into edi */
1070  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1071  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1072  pxor mm0, mm0 /* zero mm0 register */
1073  align 16 /* 16 byte alignment of the loop entry */
1074 L1016:
1075  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1076  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
1077  movq mm2, mm1 /* copy mm1 into mm2 */
1078  movq mm4, mm3 /* copy mm3 into mm4 */
1079  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
1080  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
1081  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
1082  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
1083  psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
1084  psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
1085  psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */
1086  psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */
1087  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
1088  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
1089  packuswb mm1, mm2 /* pack words back into bytes with saturation */
1090  movq [edi], mm1 /* store result in Dest */
1091  add eax, 8 /* increase Src1, Src2 and Dest */
1092  add ebx, 8 /* register pointers by 8 */
1093  add edi, 8
1094  dec ecx /* decrease loop counter */
1095  jnz L1016 /* check loop termination, proceed if required */
1096  emms /* exit MMX state */
1097  popa
1098  }
1099 #else
1100  /* i386 and x86_64 */
1101  __m64 *mSrc1 = (__m64*)Src1;
1102  __m64 *mSrc2 = (__m64*)Src2;
1103  __m64 *mDest = (__m64*)Dest;
1104  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
1105  int i;
1106  for (i = 0; i < SrcLength/8; i++) {
1107  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
1108  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
1109  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
1110  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
1111  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
1112  mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
1113  mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
1114  mm3 = _m_psrlwi(mm3, 1); /* divide mm3 words by 2, Src2 low bytes */
1115  mm4 = _m_psrlwi(mm4, 1); /* divide mm4 words by 2, Src2 high bytes */
1116  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
1117  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
1118  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
1119  mSrc1++;
1120  mSrc2++;
1121  mDest++;
1122  }
1123  _m_empty(); /* clean MMX state */
1124 #endif
1125  return (0);
1126 #else
1127  return (-1);
1128 #endif
1129 }
1130 
1141 int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1142 {
1143  unsigned int i, istart;
1144  unsigned char *cursrc1, *cursrc2, *curdst;
1145  int result;
1146 
1147  /* Validate input parameters */
1148  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1149  return(-1);
1150  if (length == 0)
1151  return(0);
1152 
1153  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1154  /* MMX routine */
1155  SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
1156 
1157  /* Check for unaligned bytes */
1158  if ((length & 7) > 0) {
1159  /* Setup to process unaligned bytes */
1160  istart = length & 0xfffffff8;
1161  cursrc1 = &Src1[istart];
1162  cursrc2 = &Src2[istart];
1163  curdst = &Dest[istart];
1164  } else {
1165  /* No unaligned bytes - we are done */
1166  return (0);
1167  }
1168  } else {
1169  /* Setup to process whole image */
1170  istart = 0;
1171  cursrc1 = Src1;
1172  cursrc2 = Src2;
1173  curdst = Dest;
1174  }
1175 
1176  /* C routine to process image */
1177  for (i = istart; i < length; i++) {
1178  result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
1179  if (result > 255)
1180  result = 255;
1181  *curdst = (unsigned char) result;
1182  /* Advance pointers */
1183  cursrc1++;
1184  cursrc2++;
1185  curdst++;
1186  }
1187 
1188  return (0);
1189 }
1190 
1201 static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1202 {
1203 #ifdef USE_MMX
1204 #if !defined(GCC__)
1205  __asm
1206  {
1207  pusha
1208  mov eax, Src1 /* load Src1 address into eax */
1209  mov ebx, Src2 /* load Src2 address into ebx */
1210  mov edi, Dest /* load Dest address into edi */
1211  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1212  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1213  align 16 /* 16 byte alignment of the loop entry */
1214 L1017:
1215  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1216  pand mm1, [ebx] /* mm1=Src1&Src2 */
1217  movq [edi], mm1 /* store result in Dest */
1218  add eax, 8 /* increase Src1, Src2 and Dest */
1219  add ebx, 8 /* register pointers by 8 */
1220  add edi, 8
1221  dec ecx /* decrease loop counter */
1222  jnz L1017 /* check loop termination, proceed if required */
1223  emms /* exit MMX state */
1224  popa
1225  }
1226 #else
1227  /* x86_64 ASM with constraints: */
1228  /* asm volatile ( */
1229  /* "shr $3, %%rcx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
1230  /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
1231  /* "1: movq (%%rax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
1232  /* "pand (%%rbx), %%mm1 \n\t" /\* mm1=Src1&Src2 *\/ */
1233  /* "movq %%mm1, (%%rdi) \n\t" /\* store result in Dest *\/ */
1234  /* "add $8, %%rax \n\t" /\* increase Src1, Src2 and Dest *\/ */
1235  /* "add $8, %%rbx \n\t" /\* register pointers by 8 *\/ */
1236  /* "add $8, %%rdi \n\t" */
1237  /* "dec %%rcx \n\t" /\* decrease loop counter *\/ */
1238  /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
1239  /* "emms \n\t" /\* exit MMX state *\/ */
1240  /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
1241  /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
1242  /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
1243  /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
1244  /* : */
1245  /* : "memory", /\* *Dest is modified *\/ */
1246  /* "mm1" /\* register mm1 modified *\/ */
1247  /* ); */
1248 
1249  /* i386 and x86_64 */
1250  __m64 *mSrc1 = (__m64*)Src1;
1251  __m64 *mSrc2 = (__m64*)Src2;
1252  __m64 *mDest = (__m64*)Dest;
1253  int i;
1254  for (i = 0; i < SrcLength/8; i++) {
1255  *mDest = _m_pand(*mSrc1, *mSrc2); /* Src1&Src2 */
1256  mSrc1++;
1257  mSrc2++;
1258  mDest++;
1259  }
1260  _m_empty(); /* clean MMX state */
1261 #endif
1262  return (0);
1263 #else
1264  return (-1);
1265 #endif
1266 }
1267 
1278 int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1279 {
1280  unsigned int i, istart;
1281  unsigned char *cursrc1, *cursrc2, *curdst;
1282 
1283  /* Validate input parameters */
1284  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1285  return(-1);
1286  if (length == 0)
1287  return(0);
1288 
1289  if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
1290  /* if (length > 7) { */
1291  /* Call MMX routine */
1292 
1293  SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
1294 
1295  /* Check for unaligned bytes */
1296  if ((length & 7) > 0) {
1297 
1298  /* Setup to process unaligned bytes */
1299  istart = length & 0xfffffff8;
1300  cursrc1 = &Src1[istart];
1301  cursrc2 = &Src2[istart];
1302  curdst = &Dest[istart];
1303  } else {
1304  /* No unaligned bytes - we are done */
1305  return (0);
1306  }
1307  } else {
1308  /* Setup to process whole image */
1309  istart = 0;
1310  cursrc1 = Src1;
1311  cursrc2 = Src2;
1312  curdst = Dest;
1313  }
1314 
1315  /* C routine to process image */
1316  for (i = istart; i < length; i++) {
1317  *curdst = (*cursrc1) & (*cursrc2);
1318  /* Advance pointers */
1319  cursrc1++;
1320  cursrc2++;
1321  curdst++;
1322  }
1323 
1324  return (0);
1325 }
1326 
1337 static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1338 {
1339 #ifdef USE_MMX
1340 #if !defined(GCC__)
1341  __asm
1342  {
1343  pusha
1344  mov eax, Src1 /* load Src1 address into eax */
1345  mov ebx, Src2 /* load Src2 address into ebx */
1346  mov edi, Dest /* load Dest address into edi */
1347  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1348  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1349  align 16 /* 16 byte alignment of the loop entry */
1350 L91017:
1351  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1352  por mm1, [ebx] /* mm1=Src1|Src2 */
1353  movq [edi], mm1 /* store result in Dest */
1354  add eax, 8 /* increase Src1, Src2 and Dest */
1355  add ebx, 8 /* register pointers by 8 */
1356  add edi, 8
1357  dec ecx /* decrease loop counter */
1358  jnz L91017 /* check loop termination, proceed if required */
1359  emms /* exit MMX state */
1360  popa
1361  }
1362 #else
1363  /* i386 and x86_64 */
1364  __m64 *mSrc1 = (__m64*)Src1;
1365  __m64 *mSrc2 = (__m64*)Src2;
1366  __m64 *mDest = (__m64*)Dest;
1367  int i;
1368  for (i = 0; i < SrcLength/8; i++) {
1369  *mDest = _m_por(*mSrc1, *mSrc2); /* Src1|Src2 */
1370  mSrc1++;
1371  mSrc2++;
1372  mDest++;
1373  }
1374  _m_empty(); /* clean MMX state */
1375 #endif
1376  return (0);
1377 #else
1378  return (-1);
1379 #endif
1380 }
1381 
1392 int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1393 {
1394  unsigned int i, istart;
1395  unsigned char *cursrc1, *cursrc2, *curdst;
1396 
1397  /* Validate input parameters */
1398  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1399  return(-1);
1400  if (length == 0)
1401  return(0);
1402 
1403  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1404 
1405  /* MMX routine */
1406  SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
1407 
1408  /* Check for unaligned bytes */
1409  if ((length & 7) > 0) {
1410  /* Setup to process unaligned bytes */
1411  istart = length & 0xfffffff8;
1412  cursrc1 = &Src1[istart];
1413  cursrc2 = &Src2[istart];
1414  curdst = &Dest[istart];
1415  } else {
1416  /* No unaligned bytes - we are done */
1417  return (0);
1418  }
1419  } else {
1420  /* Setup to process whole image */
1421  istart = 0;
1422  cursrc1 = Src1;
1423  cursrc2 = Src2;
1424  curdst = Dest;
1425  }
1426 
1427  /* C routine to process image */
1428  for (i = istart; i < length; i++) {
1429  *curdst = *cursrc1 | *cursrc2;
1430  /* Advance pointers */
1431  cursrc1++;
1432  cursrc2++;
1433  curdst++;
1434  }
1435  return (0);
1436 }
1437 
1448 static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1449 {
1450 #ifdef USE_MMX
1451 #if !defined(GCC__)
1452  __asm
1453  {
1454  pusha
1455  mov edx, Src1 /* load Src1 address into edx */
1456  mov esi, Src2 /* load Src2 address into esi */
1457  mov edi, Dest /* load Dest address into edi */
1458  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1459  align 16 /* 16 byte alignment of the loop entry */
1460 L10191:
1461  mov bl, [esi] /* load a byte from Src2 */
1462  cmp bl, 0 /* check if it zero */
1463  jnz L10192
1464  mov byte ptr [edi], 255 /* division by zero = 255 !!! */
1465  jmp L10193
1466 L10192:
1467  xor ah, ah /* prepare AX, zero AH register */
1468  mov al, [edx] /* load a byte from Src1 into AL */
1469  div bl /* divide AL by BL */
1470  mov [edi], al /* move a byte result to Dest */
1471 L10193:
1472  inc edx /* increment Src1, Src2, Dest */
1473  inc esi /* pointer registers by one */
1474  inc edi
1475  dec ecx /* decrease loop counter */
1476  jnz L10191 /* check loop termination, proceed if required */
1477  popa
1478  }
1479 #else
1480  /* Note: ~15% gain on i386, less efficient than C on x86_64 */
1481  /* Also depends on whether the function is static (?!) */
1482  /* Also depends on whether we work on malloc() or static char[] */
1483  asm volatile (
1484 # if defined(i386)
1485  "pushl %%ebx \n\t" /* %ebx may be the PIC register. */
1486  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1487  "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
1488  "cmp $0, %%bl \n\t" /* check if it zero */
1489  "jnz 2f \n\t"
1490  "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
1491  "jmp 3f \n\t"
1492  "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1493  "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */
1494  "div %%bl \n\t" /* divide AL by BL */
1495  "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
1496  "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */
1497  "inc %%esi \n\t" /* pointer registers by one */
1498  "inc %%edi \n\t"
1499  "dec %%ecx \n\t" /* decrease loop counter */
1500  "jnz 1b \n\t" /* check loop termination, proceed if required */
1501  "popl %%ebx \n\t" /* restore %ebx */
1502  : "+d" (Src1), /* load Src1 address into edx */
1503  "+S" (Src2), /* load Src2 address into esi */
1504  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1505  "+D" (Dest) /* load Dest address into edi */
1506  :
1507  : "memory", "rax"
1508 # elif defined(__x86_64__)
1509  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1510  "1: mov (%%rsi), %%bl \n\t" /* load a byte from Src2 */
1511  "cmp $0, %%bl \n\t" /* check if it zero */
1512  "jnz 2f \n\t"
1513  "movb $255, (%%rdi) \n\t" /* division by zero = 255 !!! */
1514  "jmp 3f \n\t"
1515  "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1516  "mov (%%rdx), %%al \n\t" /* load a byte from Src1 into AL */
1517  "div %%bl \n\t" /* divide AL by BL */
1518  "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
1519  "3: inc %%rdx \n\t" /* increment Src1, Src2, Dest */
1520  "inc %%rsi \n\t" /* pointer registers by one */
1521  "inc %%rdi \n\t"
1522  "dec %%rcx \n\t" /* decrease loop counter */
1523  "jnz 1b \n\t" /* check loop termination, proceed if required */
1524  : "+d" (Src1), /* load Src1 address into edx */
1525  "+S" (Src2), /* load Src2 address into esi */
1526  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1527  "+D" (Dest) /* load Dest address into edi */
1528  :
1529  : "memory", "rax", "rbx"
1530 # endif
1531  );
1532 #endif
1533  return (0);
1534 #else
1535  return (-1);
1536 #endif
1537 }
1538 
1549 int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1550 {
1551  unsigned int i, istart;
1552  unsigned char *cursrc1, *cursrc2, *curdst;
1553 
1554  /* Validate input parameters */
1555  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1556  return(-1);
1557  if (length == 0)
1558  return(0);
1559 
1560  if (SDL_imageFilterMMXdetect()) {
1561  if (length > 0) {
1562  /* Call ASM routine */
1563  SDL_imageFilterDivASM(Src1, Src2, Dest, length);
1564 
1565  /* Never unaligned bytes - we are done */
1566  return (0);
1567  } else {
1568  return (-1);
1569  }
1570  }
1571 
1572  /* Setup to process whole image */
1573  istart = 0;
1574  cursrc1 = Src1;
1575  cursrc2 = Src2;
1576  curdst = Dest;
1577 
1578  /* C routine to process image */
1579  /* for (i = istart; i < length; i++) { */
1580  /* if (*cursrc2 == 0) { */
1581  /* *curdst = 255; */
1582  /* } else { */
1583  /* result = (int) *cursrc1 / (int) *cursrc2; */
1584  /* *curdst = (unsigned char) result; */
1585  /* } */
1586  /* /\* Advance pointers *\/ */
1587  /* cursrc1++; */
1588  /* cursrc2++; */
1589  /* curdst++; */
1590  /* } */
1591  for (i = istart; i < length; i++) {
1592  if (*cursrc2 == 0) {
1593  *curdst = 255;
1594  } else {
1595  *curdst = (int)*cursrc1 / (int)*cursrc2; // (int) for efficiency
1596  }
1597  /* Advance pointers */
1598  cursrc1++;
1599  cursrc2++;
1600  curdst++;
1601  }
1602 
1603  return (0);
1604 }
1605 
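A brief, illustrative call of SDL_imageFilterDiv (editor's example, arbitrary values): integer division per byte, with the convention shown above that a zero divisor produces 255.

void example_div(void)
{
	unsigned char num[4]  = { 100, 100, 7, 255 };
	unsigned char den[4]  = {  10,   0, 2,   4 };
	unsigned char dest[4];

	SDL_imageFilterDiv(num, den, dest, 4);
	/* dest == { 10, 255, 3, 63 } */
}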
1606 /* ------------------------------------------------------------------------------------ */
1607 
1617 static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
1618 {
1619 #ifdef USE_MMX
1620 #if !defined(GCC__)
1621  __asm
1622  {
1623  pusha
1624  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
1625  mov eax, Src1 /* load Src1 address into eax */
1626  mov edi, Dest /* load Dest address into edi */
1627  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1628  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1629  align 16 /* 16 byte alignment of the loop entry */
1630 L91117:
1631  movq mm0, [eax] /* load 8 bytes from Src1 into mm0 */
1632  pxor mm0, mm1 /* negate mm0 by xoring with mm1 */
1633  movq [edi], mm0 /* store result in Dest */
1634  add eax, 8 /* increase Src1 and Dest */
1635  add edi, 8
1636  dec ecx /* decrease loop counter */
1637  jnz L91117 /* check loop termination, proceed if required */
1638  emms /* exit MMX state */
1639  popa
1640  }
1641 #else
1642  /* i386 and x86_64 */
1643  __m64 *mSrc1 = (__m64*)Src1;
1644  __m64 *mDest = (__m64*)Dest;
1645  __m64 mm1 = _m_from_int(0);
1646  mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 (every byte of 0 equals itself) */
1647  int i;
1648  for (i = 0; i < SrcLength/8; i++) {
1649  *mDest = _m_pxor(*mSrc1, mm1); /* negate Src1 bytes by xoring with all-1's mm1 */
1650  mSrc1++;
1651  mDest++;
1652  }
1653  _m_empty(); /* clean MMX state */
1654 
1655 #endif
1656  return (0);
1657 #else
1658  return (-1);
1659 #endif
1660 }
1661 
1671 int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
1672 {
1673  unsigned int i, istart;
1674  unsigned char *cursrc1, *curdst;
1675 
1676  /* Validate input parameters */
1677  if ((Src1 == NULL) || (Dest == NULL))
1678  return(-1);
1679  if (length == 0)
1680  return(0);
1681 
1682  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1683  /* MMX routine */
1684  SDL_imageFilterBitNegationMMX(Src1, Dest, length);
1685 
1686  /* Check for unaligned bytes */
1687  if ((length & 7) > 0) {
1688  /* Setup to process unaligned bytes */
1689  istart = length & 0xfffffff8;
1690  cursrc1 = &Src1[istart];
1691  curdst = &Dest[istart];
1692  } else {
1693  /* No unaligned bytes - we are done */
1694  return (0);
1695  }
1696  } else {
1697  /* Setup to process whole image */
1698  istart = 0;
1699  cursrc1 = Src1;
1700  curdst = Dest;
1701  }
1702 
1703  /* C routine to process image */
1704  for (i = istart; i < length; i++) {
1705  *curdst = ~(*cursrc1);
1706  /* Advance pointers */
1707  cursrc1++;
1708  curdst++;
1709  }
1710 
1711  return (0);
1712 }
1713 
1724 static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
1725 {
1726 #ifdef USE_MMX
1727 #if !defined(GCC__)
1728  __asm
1729  {
1730  pusha
1731  /* ** Duplicate C in 8 bytes of MM1 ** */
1732  mov al, C /* load C into AL */
1733  mov ah, al /* copy AL into AH */
1734  mov bx, ax /* copy AX into BX */
1735  shl eax, 16 /* shift 2 bytes of EAX left */
1736  mov ax, bx /* copy BX into AX */
1737  movd mm1, eax /* copy EAX into MM1 */
1738  movd mm2, eax /* copy EAX into MM2 */
1739  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1740  mov eax, Src1 /* load Src1 address into eax */
1741  mov edi, Dest /* load Dest address into edi */
1742  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1743  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1744  align 16 /* 16 byte alignment of the loop entry */
1745 L1021:
1746  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
1747  paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1748  movq [edi], mm0 /* store result in Dest */
1749  add eax, 8 /* increase Src1 register pointer by 8 */
1750  add edi, 8 /* increase Dest register pointer by 8 */
1751  dec ecx /* decrease loop counter */
1752  jnz L1021 /* check loop termination, proceed if required */
1753  emms /* exit MMX state */
1754  popa
1755  }
1756 #else
1757  /* i386 and x86_64 */
1758  __m64 *mSrc1 = (__m64*)Src1;
1759  __m64 *mDest = (__m64*)Dest;
1760  /* Duplicate C in 8 bytes of MM1 */
1761  int i;
1762  memset(&i, C, 4);
1763  __m64 mm1 = _m_from_int(i);
1764  __m64 mm2 = _m_from_int(i);
1765  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1766  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1767  for (i = 0; i < SrcLength/8; i++) {
1768  *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1769  mSrc1++;
1770  mDest++;
1771  }
1772  _m_empty(); /* clean MMX state */
1773 #endif
1774  return (0);
1775 #else
1776  return (-1);
1777 #endif
1778 }
1779 
1791 int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
1792 {
1793  unsigned int i, istart;
1794  int iC;
1795  unsigned char *cursrc1, *curdest;
1796  int result;
1797 
1798  /* Validate input parameters */
1799  if ((Src1 == NULL) || (Dest == NULL))
1800  return(-1);
1801  if (length == 0)
1802  return(0);
1803 
1804  /* Special case: C==0 */
1805  if (C == 0) {
1806  memcpy(Dest, Src1, length);
1807  return (0);
1808  }
1809 
1810  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1811 
1812  /* MMX routine */
1813  SDL_imageFilterAddByteMMX(Src1, Dest, length, C);
1814 
1815  /* Check for unaligned bytes */
1816  if ((length & 7) > 0) {
1817  /* Setup to process unaligned bytes */
1818  istart = length & 0xfffffff8;
1819  cursrc1 = &Src1[istart];
1820  curdest = &Dest[istart];
1821  } else {
1822  /* No unaligned bytes - we are done */
1823  return (0);
1824  }
1825  } else {
1826  /* Setup to process whole image */
1827  istart = 0;
1828  cursrc1 = Src1;
1829  curdest = Dest;
1830  }
1831 
1832  /* C routine to process image */
1833  iC = (int) C;
1834  for (i = istart; i < length; i++) {
1835  result = (int) *cursrc1 + iC;
1836  if (result > 255)
1837  result = 255;
1838  *curdest = (unsigned char) result;
1839  /* Advance pointers */
1840  cursrc1++;
1841  curdest++;
1842  }
1843  return (0);
1844 }
1845 
1857 static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
1858 {
1859 #ifdef USE_MMX
1860 #if !defined(GCC__)
1861  __asm
1862  {
1863  pusha
1864  /* ** Duplicate (int)C in 8 bytes of MM1 ** */
1865  mov eax, C /* load C into EAX */
1866  movd mm1, eax /* copy EAX into MM1 */
1867  mov eax, D /* load D into EAX */
1868  movd mm2, eax /* copy EAX into MM2 */
1869  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1870  mov eax, Src1 /* load Src1 address into eax */
1871  mov edi, Dest /* load Dest address into edi */
1872  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1873  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1874  align 16 /* 16 byte alignment of the loop entry */
1875 L11023:
1876  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
1877  paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1878  movq [edi], mm0 /* store result in SrcDest */
1879  add eax, 8 /* increase Src1 register pointer by 8 */
1880  add edi, 8 /* increase Dest register pointer by 8 */
1881  dec ecx /* decrease loop counter */
1882  jnz L11023 /* check loop termination, proceed if required */
1883  emms /* exit MMX state */
1884  popa
1885  }
1886 #else
1887  /* i386 and x86_64 */
1888  __m64 *mSrc1 = (__m64*)Src1;
1889  __m64 *mDest = (__m64*)Dest;
1890  /* Duplicate (int)C in 8 bytes of MM1 */
1891  __m64 mm1 = _m_from_int(C);
1892  __m64 mm2 = _m_from_int(C);
1893  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1894  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1895  int i;
1896  for (i = 0; i < SrcLength/8; i++) {
1897  *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1898  mSrc1++;
1899  mDest++;
1900  }
1901  _m_empty(); /* clean MMX state */
1902 #endif
1903  return (0);
1904 #else
1905  return (-1);
1906 #endif
1907 }
1908 
1919 int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
1920 {
1921  unsigned int i, j, istart, D;
1922  int iC[4];
1923  unsigned char *cursrc1;
1924  unsigned char *curdest;
1925  int result;
1926 
1927  /* Validate input parameters */
1928  if ((Src1 == NULL) || (Dest == NULL))
1929  return(-1);
1930  if (length == 0)
1931  return(0);
1932 
1933  /* Special case: C==0 */
1934  if (C == 0) {
1935  memcpy(Dest, Src1, length);
1936  return (0);
1937  }
1938 
1939  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1940 
1941  /* MMX routine */
1942  D=SWAP_32(C);
1943  SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);
1944 
1945  /* Check for unaligned bytes */
1946  if ((length & 7) > 0) {
1947  /* Setup to process unaligned bytes */
1948  istart = length & 0xfffffff8;
1949  cursrc1 = &Src1[istart];
1950  curdest = &Dest[istart];
1951  } else {
1952  /* No unaligned bytes - we are done */
1953  return (0);
1954  }
1955  } else {
1956  /* Setup to process whole image */
1957  istart = 0;
1958  cursrc1 = Src1;
1959  curdest = Dest;
1960  }
1961 
1962  /* C routine to process bytes */
1963  iC[3] = (int) ((C >> 24) & 0xff);
1964  iC[2] = (int) ((C >> 16) & 0xff);
1965  iC[1] = (int) ((C >> 8) & 0xff);
1966  iC[0] = (int) ((C >> 0) & 0xff);
1967  for (i = istart; i < length; i += 4) {
1968  for (j = 0; j < 4; j++) {
1969  if ((i+j)<length) {
1970  result = (int) *cursrc1 + iC[j];
1971  if (result > 255) result = 255;
1972  *curdest = (unsigned char) result;
1973  /* Advance pointers */
1974  cursrc1++;
1975  curdest++;
1976  }
1977  }
1978  }
1979  return (0);
1980 }
1981 
1993 static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
1994  unsigned char *Mask)
1995 {
1996 #ifdef USE_MMX
1997 #if !defined(GCC__)
1998  __asm
1999  {
2000  pusha
2001  /* ** Duplicate C in 8 bytes of MM1 ** */
2002  mov al, C /* load C into AL */
2003  mov ah, al /* copy AL into AH */
2004  mov bx, ax /* copy AX into BX */
2005  shl eax, 16 /* shift 2 bytes of EAX left */
2006  mov ax, bx /* copy BX into AX */
2007  movd mm1, eax /* copy EAX into MM1 */
2008  movd mm2, eax /* copy EAX into MM2 */
2009  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2010  mov edx, Mask /* load Mask address into edx */
2011  movq mm0, [edx] /* load Mask into mm0 */
2012  mov eax, Src1 /* load Src1 address into eax */
2013  mov edi, Dest /* load Dest address into edi */
2014  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2015  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2016  align 16 /* 16 byte alignment of the loop entry */
2017 L1022:
2018  movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */
2019  psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */
2020  pand mm2, mm0 /* apply Mask to 8 BYTES of MM2 */
2021  paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */
2022  movq [edi], mm2 /* store result in Dest */
2023  add eax, 8 /* increase Src1 register pointer by 8 */
2024  add edi, 8 /* increase Dest register pointer by 8 */
2025  dec ecx /* decrease loop counter */
2026  jnz L1022 /* check loop termination, proceed if required */
2027  emms /* exit MMX state */
2028  popa
2029  }
2030 #else
2031  /* i386 and x86_64 */
2032  __m64 *mSrc1 = (__m64*)Src1;
2033  __m64 *mDest = (__m64*)Dest;
2034  __m64 *mMask = (__m64*)Mask;
2035  /* Duplicate C in 8 bytes of MM1 */
2036  int i;
2037  memset(&i, C, 4);
2038  __m64 mm1 = _m_from_int(i);
2039  __m64 mm2 = _m_from_int(i);
2040  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2041  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2042  for (i = 0; i < SrcLength/8; i++) {
2043  __m64 mm2 = _m_psrlwi(*mSrc1, 1); /* shift 4 WORDS of MM2 1 bit to the right */
2044  mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of MM2 */
2045  /* byte 0x0f, 0xdb, 0xd0 */
2046  *mDest = _m_paddusb(mm1, mm2); /* Src1+C (add 8 bytes with saturation) */
2047  mSrc1++;
2048  mDest++;
2049  }
2050  _m_empty(); /* clean MMX state */
2051 #endif
2052  return (0);
2053 #else
2054  return (-1);
2055 #endif
2056 }
2057 
2068 int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2069 {
2070  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2071  unsigned int i, istart;
2072  int iC;
2073  unsigned char *cursrc1;
2074  unsigned char *curdest;
2075  int result;
2076 
2077  /* Validate input parameters */
2078  if ((Src1 == NULL) || (Dest == NULL))
2079  return(-1);
2080  if (length == 0)
2081  return(0);
2082 
2083  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2084 
2085  /* MMX routine */
2086  SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
2087 
2088  /* Check for unaligned bytes */
2089  if ((length & 7) > 0) {
2090  /* Setup to process unaligned bytes */
2091  istart = length & 0xfffffff8;
2092  cursrc1 = &Src1[istart];
2093  curdest = &Dest[istart];
2094  } else {
2095  /* No unaligned bytes - we are done */
2096  return (0);
2097  }
2098  } else {
2099  /* Setup to process whole image */
2100  istart = 0;
2101  cursrc1 = Src1;
2102  curdest = Dest;
2103  }
2104 
2105  /* C routine to process image */
2106  iC = (int) C;
2107  for (i = istart; i < length; i++) {
2108  result = (int) (*cursrc1 / 2) + iC;
2109  if (result > 255)
2110  result = 255;
2111  *curdest = (unsigned char) result;
2112  /* Advance pointers */
2113  cursrc1++;
2114  curdest++;
2115  }
2116 
2117  return (0);
2118 }
2119 
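/* Worked example for the halve-then-add semantics above (illustrative only,
 * not part of the original source): with C = 60, a source byte of 200 yields
 * 200/2 + 60 = 160 and a source byte of 250 yields 125 + 60 = 185; any result
 * above 255 saturates to 255. A hypothetical call:
 *
 *     SDL_imageFilterAddByteToHalf(src, dest, length, 60);
 */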
2130 int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2131 {
2132 #ifdef USE_MMX
2133 #if !defined(GCC__)
2134  __asm
2135  {
2136  pusha
2137  /* ** Duplicate C in 8 bytes of MM1 ** */
2138  mov al, C /* load C into AL */
2139  mov ah, al /* copy AL into AH */
2140  mov bx, ax /* copy AX into BX */
2141  shl eax, 16 /* shift 2 bytes of EAX left */
2142  mov ax, bx /* copy BX into AX */
2143  movd mm1, eax /* copy EAX into MM1 */
2144  movd mm2, eax /* copy EAX into MM2 */
2145  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2146  mov eax, Src1 /* load Src1 address into eax */
2147  mov edi, Dest /* load Dest address into edi */
2148  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2149  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2150  align 16 /* 16 byte alignment of the loop entry */
2151 L1023:
2152  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2153  psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2154  movq [edi], mm0 /* store result in SrcDest */
2155  add eax, 8 /* increase Src1 register pointer by 8 */
2156  add edi, 8 /* increase Dest register pointer by 8 */
2157  dec ecx /* decrease loop counter */
2158  jnz L1023 /* check loop termination, proceed if required */
2159  emms /* exit MMX state */
2160  popa
2161  }
2162 #else
2163  /* i386 and x86_64 */
2164  __m64 *mSrc1 = (__m64*)Src1;
2165  __m64 *mDest = (__m64*)Dest;
2166  /* Duplicate C in 8 bytes of MM1 */
2167  int i;
2168  memset(&i, C, 4);
2169  __m64 mm1 = _m_from_int(i);
2170  __m64 mm2 = _m_from_int(i);
2171  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2172  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2173  for (i = 0; i < SrcLength/8; i++) {
2174  *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2175  mSrc1++;
2176  mDest++;
2177  }
2178  _m_empty(); /* clean MMX state */
2179 #endif
2180  return (0);
2181 #else
2182  return (-1);
2183 #endif
2184 }
2185 
2196 int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2197 {
2198  unsigned int i, istart;
2199  int iC;
2200  unsigned char *cursrc1;
2201  unsigned char *curdest;
2202  int result;
2203 
2204  /* Validate input parameters */
2205  if ((Src1 == NULL) || (Dest == NULL))
2206  return(-1);
2207  if (length == 0)
2208  return(0);
2209 
2210  /* Special case: C==0 */
2211  if (C == 0) {
2212  memcpy(Dest, Src1, length);
2213  return (0);
2214  }
2215 
2216  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2217 
2218  /* MMX routine */
2219  SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
2220 
2221  /* Check for unaligned bytes */
2222  if ((length & 7) > 0) {
2223  /* Setup to process unaligned bytes */
2224  istart = length & 0xfffffff8;
2225  cursrc1 = &Src1[istart];
2226  curdest = &Dest[istart];
2227  } else {
2228  /* No unaligned bytes - we are done */
2229  return (0);
2230  }
2231  } else {
2232  /* Setup to process whole image */
2233  istart = 0;
2234  cursrc1 = Src1;
2235  curdest = Dest;
2236  }
2237 
2238  /* C routine to process image */
2239  iC = (int) C;
2240  for (i = istart; i < length; i++) {
2241  result = (int) *cursrc1 - iC;
2242  if (result < 0)
2243  result = 0;
2244  *curdest = (unsigned char) result;
2245  /* Advance pointers */
2246  cursrc1++;
2247  curdest++;
2248  }
2249  return (0);
2250 }
2251 
2263 static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
2264 {
2265 #ifdef USE_MMX
2266 #if !defined(GCC__)
2267  __asm
2268  {
2269  pusha
2270  /* ** Duplicate (int)C in 8 bytes of MM1 ** */
2271  mov eax, C /* load C into EAX */
2272  movd mm1, eax /* copy EAX into MM1 */
2273  mov eax, D /* load D into EAX */
2274  movd mm2, eax /* copy EAX into MM2 */
2275  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2276  mov eax, Src1 /* load Src1 address into eax */
2277  mov edi, Dest /* load Dest address into edi */
2278  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2279  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2280  align 16 /* 16 byte alignment of the loop entry */
2281 L11024:
2282  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2283  psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2284  movq [edi], mm0 /* store result in SrcDest */
2285  add eax, 8 /* increase Src1 register pointer by 8 */
2286  add edi, 8 /* increase Dest register pointer by 8 */
2287  dec ecx /* decrease loop counter */
2288  jnz L11024 /* check loop termination, proceed if required */
2289  emms /* exit MMX state */
2290  popa
2291  }
2292 #else
2293  /* i386 and x86_64 */
2294  __m64 *mSrc1 = (__m64*)Src1;
2295  __m64 *mDest = (__m64*)Dest;
2296  /* Duplicate (int)C in 8 bytes of MM1 */
2297  __m64 mm1 = _m_from_int(C);
2298  __m64 mm2 = _m_from_int(C);
2299  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2300  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2301  int i;
2302  for (i = 0; i < SrcLength/8; i++) {
2303  *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2304  mSrc1++;
2305  mDest++;
2306  }
2307  _m_empty(); /* clean MMX state */
2308 #endif
2309  return (0);
2310 #else
2311  return (-1);
2312 #endif
2313 }
2314 
2325 int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
2326 {
2327  unsigned int i, j, istart, D;
2328  int iC[4];
2329  unsigned char *cursrc1;
2330  unsigned char *curdest;
2331  int result;
2332 
2333  /* Validate input parameters */
2334  if ((Src1 == NULL) || (Dest == NULL))
2335  return(-1);
2336  if (length == 0)
2337  return(0);
2338 
2339  /* Special case: C==0 */
2340  if (C == 0) {
2341  memcpy(Dest, Src1, length);
2342  return (0);
2343  }
2344 
2345  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2346 
2347  /* MMX routine */
2348  D=SWAP_32(C);
2349  SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
2350 
2351  /* Check for unaligned bytes */
2352  if ((length & 7) > 0) {
2353  /* Setup to process unaligned bytes */
2354  istart = length & 0xfffffff8;
2355  cursrc1 = &Src1[istart];
2356  curdest = &Dest[istart];
2357  } else {
2358  /* No unaligned bytes - we are done */
2359  return (0);
2360  }
2361  } else {
2362  /* Setup to process whole image */
2363  istart = 0;
2364  cursrc1 = Src1;
2365  curdest = Dest;
2366  }
2367 
2368  /* C routine to process image */
2369  iC[3] = (int) ((C >> 24) & 0xff);
2370  iC[2] = (int) ((C >> 16) & 0xff);
2371  iC[1] = (int) ((C >> 8) & 0xff);
2372  iC[0] = (int) ((C >> 0) & 0xff);
2373  for (i = istart; i < length; i += 4) {
2374  for (j = 0; j < 4; j++) {
2375  if ((i+j)<length) {
2376  result = (int) *cursrc1 - iC[j];
2377  if (result < 0) result = 0;
2378  *curdest = (unsigned char) result;
2379  /* Advance pointers */
2380  cursrc1++;
2381  curdest++;
2382  }
2383  }
2384  }
2385  return (0);
2386 }
2387 
2399 static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2400  unsigned char *Mask)
2401 {
2402 #ifdef USE_MMX
2403 #if !defined(GCC__)
2404  __asm
2405  {
2406  pusha
2407  mov edx, Mask /* load Mask address into edx */
2408  movq mm0, [edx] /* load Mask into mm0 */
2409  xor ecx, ecx /* zero ECX */
2410  mov cl, N /* load loop counter (N) into CL */
2411  movd mm3, ecx /* copy (N) into MM3 */
2412  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
2413 L10240: /* ** Prepare proper bit-Mask in MM1 ** */
2414  psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */
2415  pand mm1, mm0 /* apply Mask to 8 BYTES of MM1 */
2416  /* byte 0x0f, 0xdb, 0xc8 */
2417  dec cl /* decrease loop counter */
2418  jnz L10240 /* check loop termination, proceed if required */
2419  /* ** Shift all bytes of the image ** */
2420  mov eax, Src1 /* load Src1 address into eax */
2421  mov edi, Dest /* load Dest address into edi */
2422  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2423  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2424  align 16 /* 16 byte alignment of the loop entry */
2425 L10241:
2426  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2427  psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */
2428  pand mm0, mm1 /* apply proper bit-Mask to 8 BYTES of MM0 */
2429  /* byte 0x0f, 0xdb, 0xc1 */
2430  movq [edi], mm0 /* store result in SrcDest */
2431  add eax, 8 /* increase Src1 register pointer by 8 */
2432  add edi, 8 /* increase Dest register pointer by 8 */
2433  dec ecx /* decrease loop counter */
2434  jnz L10241 /* check loop termination, proceed if required */
2435  emms /* exit MMX state */
2436  popa
2437  }
2438 #else
2439  /* i386 and x86_64 */
2440  __m64 *mSrc1 = (__m64*)Src1;
2441  __m64 *mDest = (__m64*)Dest;
2442  __m64 *mMask = (__m64*)Mask;
2443  __m64 mm1;
2444  int i;
2445  mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
2446  /* Prepare proper bit-Mask in MM1 */
2447  for (i = 0; i < N; i++) {
2448  mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the right */
2449  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
2450  }
2451  /* Shift all bytes of the image */
2452  for (i = 0; i < SrcLength/8; i++) {
2453  __m64 mm0 = _m_psrlwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the right */
2454  *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
2455  mSrc1++;
2456  mDest++;
2457  }
2458  _m_empty(); /* clean MMX state */
2459 #endif
2460  return (0);
2461 #else
2462  return (-1);
2463 #endif
2464 }
2465 
2476 int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2477 {
2478  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2479  unsigned int i, istart;
2480  unsigned char *cursrc1;
2481  unsigned char *curdest;
2482 
2483  /* Validate input parameters */
2484  if ((Src1 == NULL) || (Dest == NULL))
2485  return(-1);
2486  if (length == 0)
2487  return(0);
2488 
2489  /* Check shift */
2490  if (N > 8) {
2491  return (-1);
2492  }
2493 
2494  /* Special case: N==0 */
2495  if (N == 0) {
2496  memcpy(Dest, Src1, length);
2497  return (0);
2498  }
2499 
2500  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2501 
2502  /* MMX routine */
2503  SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
2504 
2505  /* Check for unaligned bytes */
2506  if ((length & 7) > 0) {
2507  /* Setup to process unaligned bytes */
2508  istart = length & 0xfffffff8;
2509  cursrc1 = &Src1[istart];
2510  curdest = &Dest[istart];
2511  } else {
2512  /* No unaligned bytes - we are done */
2513  return (0);
2514  }
2515  } else {
2516  /* Setup to process whole image */
2517  istart = 0;
2518  cursrc1 = Src1;
2519  curdest = Dest;
2520  }
2521 
2522  /* C routine to process image */
2523  for (i = istart; i < length; i++) {
2524  *curdest = (unsigned char) (*cursrc1 >> N);
2525  /* Advance pointers */
2526  cursrc1++;
2527  curdest++;
2528  }
2529 
2530  return (0);
2531 }
2532 
2543 static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
2544 {
2545 #ifdef USE_MMX
2546 #if !defined(GCC__)
2547  __asm
2548  {
2549  pusha
2550  mov eax, Src1 /* load Src1 address into eax */
2551  mov edi, Dest /* load Dest address into edi */
2552  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2553  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2554  align 16 /* 16 byte alignment of the loop entry */
2555 L13023:
2556  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2557  psrld mm0, N /* shift 2 DWORDS of MM0 (N) bits to the right */
2558  movq [edi], mm0 /* store result in SrcDest */
2559  add eax, 8 /* increase Src1 register pointer by 8 */
2560  add edi, 8 /* increase Dest register pointer by 8 */
2561  dec ecx /* decrease loop counter */
2562  jnz L13023 /* check loop termination, proceed if required */
2563  emms /* exit MMX state */
2564  popa
2565  }
2566 #else
2567  /* i386 and x86_64 */
2568  __m64 *mSrc1 = (__m64*)Src1;
2569  __m64 *mDest = (__m64*)Dest;
2570  int i;
2571  for (i = 0; i < SrcLength/8; i++) {
2572  *mDest = _m_psrldi(*mSrc1, N); /* shift 2 DWORDS of Src1 (N) bits to the right */
2573  mSrc1++;
2574  mDest++;
2575  }
2576  _m_empty(); /* clean MMX state */
2577 #endif
2578  return (0);
2579 #else
2580  return (-1);
2581 #endif
2582 }
2583 
2594 int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2595 {
2596  unsigned int i, istart;
2597  unsigned char *cursrc1, *curdest;
2598  unsigned int *icursrc1, *icurdest;
2599  unsigned int result;
2600 
2601  /* Validate input parameters */
2602  if ((Src1 == NULL) || (Dest == NULL))
2603  return(-1);
2604  if (length == 0)
2605  return(0);
2606 
2607  if (N > 32) {
2608  return (-1);
2609  }
2610 
2611  /* Special case: N==0 */
2612  if (N == 0) {
2613  memcpy(Dest, Src1, length);
2614  return (0);
2615  }
2616 
2617  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2618 
2619  SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
2620 
2621  /* Check for unaligned bytes */
2622  if ((length & 7) > 0) {
2623  /* Setup to process unaligned bytes */
2624  istart = length & 0xfffffff8;
2625  cursrc1 = &Src1[istart];
2626  curdest = &Dest[istart];
2627  } else {
2628  /* No unaligned bytes - we are done */
2629  return (0);
2630  }
2631  } else {
2632  /* Setup to process whole image */
2633  istart = 0;
2634  cursrc1 = Src1;
2635  curdest = Dest;
2636  }
2637 
2638  /* C routine to process image */
2639  icursrc1=(unsigned int *)cursrc1;
2640  icurdest=(unsigned int *)curdest;
2641  for (i = istart; i < length; i += 4) {
2642  if ((i+4)<length) {
2643  result = ((unsigned int)*icursrc1 >> N);
2644  *icurdest = result;
2645  }
2646  /* Advance pointers */
2647  icursrc1++;
2648  icurdest++;
2649  }
2650 
2651  return (0);
2652 }
2653 
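/* Note on semantics (illustrative sketch, not part of the original source):
 * unlike SDL_imageFilterShiftRight above, which shifts each byte separately,
 * this routine shifts whole little-endian 32-bit words, so bits cross byte
 * boundaries. Buffer contents below are hypothetical:
 *
 *     unsigned char buf[8] = { 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 };
 *     unsigned char out[8];
 *     SDL_imageFilterShiftRightUint(buf, out, 8, 4);
 *     // first word 0x00000100 >> 4 == 0x00000010,
 *     // so out begins { 0x10, 0x00, 0x00, 0x00, ... }
 */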
2664 static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2665 {
2666 #ifdef USE_MMX
2667 #if !defined(GCC__)
2668  __asm
2669  {
2670  pusha
2671  /* ** Duplicate C in 4 words of MM1 ** */
2672  mov al, C /* load C into AL */
2673  xor ah, ah /* zero AH */
2674  mov bx, ax /* copy AX into BX */
2675  shl eax, 16 /* shift 2 bytes of EAX left */
2676  mov ax, bx /* copy BX into AX */
2677  movd mm1, eax /* copy EAX into MM1 */
2678  movd mm2, eax /* copy EAX into MM2 */
2679  punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2680  pxor mm0, mm0 /* zero MM0 register */
2681  mov eax, Src1 /* load Src1 address into eax */
2682  mov edi, Dest /* load Dest address into edi */
2683  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2684  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2685  cmp al, 128 /* if (C <= 128) execute more efficient code */
2686  jg L10251
2687  align 16 /* 16 byte alignment of the loop entry */
2688 L10250:
2689  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2690  movq mm4, mm3 /* copy MM3 into MM4 */
2691  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2692  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2693  pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2694  pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2695  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2696  movq [edi], mm3 /* store result in Dest */
2697  add eax, 8 /* increase Src1 register pointer by 8 */
2698  add edi, 8 /* increase Dest register pointer by 8 */
2699  dec ecx /* decrease loop counter */
2700  jnz L10250 /* check loop termination, proceed if required */
2701  jmp L10252
2702  align 16 /* 16 byte alignment of the loop entry */
2703 L10251:
2704  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2705  movq mm4, mm3 /* copy MM3 into MM4 */
2706  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2707  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2708  pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2709  pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2710  /* ** Take abs value of the results (signed words) ** */
2711  movq mm5, mm3 /* copy mm3 into mm5 */
2712  movq mm6, mm4 /* copy mm4 into mm6 */
2713  psraw mm5, 15 /* fill mm5 words with word sign bit */
2714  psraw mm6, 15 /* fill mm6 words with word sign bit */
2715  pxor mm3, mm5 /* take 1's complement of only neg words */
2716  pxor mm4, mm6 /* take 1's complement of only neg words */
2717  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
2718  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
2719  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2720  movq [edi], mm3 /* store result in Dest */
2721  add eax, 8 /* increase Src1 register pointer by 8 */
2722  add edi, 8 /* increase Dest register pointer by 8 */
2723  dec ecx /* decrease loop counter */
2724  jnz L10251 /* check loop termination, proceed if required */
2725 L10252:
2726  emms /* exit MMX state */
2727  popa
2728  }
2729 #else
2730  /* i386 and x86_64 */
2731  __m64 *mSrc1 = (__m64*)Src1;
2732  __m64 *mDest = (__m64*)Dest;
2733  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2734  /* Duplicate C in 4 words of MM1 */
2735  int i;
2736  i = C | C<<16;
2737  __m64 mm1 = _m_from_int(i);
2738  __m64 mm2 = _m_from_int(i);
2739  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2740  // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
2741  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2742  if (C <= 128) { /* if (C <= 128) execute more efficient code */
2743  for (i = 0; i < SrcLength/8; i++) {
2744  __m64 mm3, mm4;
2745  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2746  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2747  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2748  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2749  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2750  mSrc1++;
2751  mDest++;
2752  }
2753  } else {
2754  for (i = 0; i < SrcLength/8; i++) {
2755  __m64 mm3, mm4, mm5, mm6;
2756  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2757  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2758  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2759  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2760  /* Take abs value of the results (signed words) */
2761  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
2762  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
2763  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
2764  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
2765  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
2766  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
2767  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2768  mSrc1++;
2769  mDest++;
2770  }
2771  }
2772  _m_empty(); /* clean MMX state */
2773 #endif
2774  return (0);
2775 #else
2776  return (-1);
2777 #endif
2778 }
2779 
2790 int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2791 {
2792  unsigned int i, istart;
2793  int iC;
2794  unsigned char *cursrc1;
2795  unsigned char *curdest;
2796  int result;
2797 
2798  /* Validate input parameters */
2799  if ((Src1 == NULL) || (Dest == NULL))
2800  return(-1);
2801  if (length == 0)
2802  return(0);
2803 
2804  /* Special case: C==1 */
2805  if (C == 1) {
2806  memcpy(Dest, Src1, length);
2807  return (0);
2808  }
2809 
2810  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2811 
2812  SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
2813 
2814  /* Check for unaligned bytes */
2815  if ((length & 7) > 0) {
2816  /* Setup to process unaligned bytes */
2817  istart = length & 0xfffffff8;
2818  cursrc1 = &Src1[istart];
2819  curdest = &Dest[istart];
2820  } else {
2821  /* No unaligned bytes - we are done */
2822  return (0);
2823  }
2824  } else {
2825  /* Setup to process whole image */
2826  istart = 0;
2827  cursrc1 = Src1;
2828  curdest = Dest;
2829  }
2830 
2831  /* C routine to process image */
2832  iC = (int) C;
2833  for (i = istart; i < length; i++) {
2834  result = (int) *cursrc1 * iC;
2835  if (result > 255)
2836  result = 255;
2837  *curdest = (unsigned char) result;
2838  /* Advance pointers */
2839  cursrc1++;
2840  curdest++;
2841  }
2842 
2843  return (0);
2844 }
2845 
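/* Worked example for the saturated multiply above (illustrative only, not part
 * of the original source): with C = 2, a source byte of 100 becomes 200 and a
 * source byte of 150 becomes 300, which saturates to 255. A hypothetical call
 * doubling the brightness of a grayscale buffer:
 *
 *     SDL_imageFilterMultByByte(gray, out, length, 2);
 */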
2857 static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2858  unsigned char C)
2859 {
2860 #ifdef USE_MMX
2861 #if !defined(GCC__)
2862  __asm
2863  {
2864  pusha
2865  /* ** Duplicate C in 4 words of MM1 ** */
2866  mov al, C /* load C into AL */
2867  xor ah, ah /* zero AH */
2868  mov bx, ax /* copy AX into BX */
2869  shl eax, 16 /* shift 2 bytes of EAX left */
2870  mov ax, bx /* copy BX into AX */
2871  movd mm1, eax /* copy EAX into MM1 */
2872  movd mm2, eax /* copy EAX into MM2 */
2873  punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2874  xor ecx, ecx /* zero ECX */
2875  mov cl, N /* load N into CL */
2876  movd mm7, ecx /* copy N into MM7 */
2877  pxor mm0, mm0 /* zero MM0 register */
2878  mov eax, Src1 /* load Src1 address into eax */
2879  mov edi, Dest /* load Dest address into edi */
2880  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2881  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2882  align 16 /* 16 byte alignment of the loop entry */
2883 L1026:
2884  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2885  movq mm4, mm3 /* copy MM3 into MM4 */
2886  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2887  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2888  psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
2889  psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
2890  pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */
2891  pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */
2892  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2893  movq [edi], mm3 /* store result in Dest */
2894  add eax, 8 /* increase Src1 register pointer by 8 */
2895  add edi, 8 /* increase Dest register pointer by 8 */
2896  dec ecx /* decrease loop counter */
2897  jnz L1026 /* check loop termination, proceed if required */
2898  emms /* exit MMX state */
2899  popa
2900  }
2901 #else
2902  /* i386 and x86_64 */
2903  __m64 *mSrc1 = (__m64*)Src1;
2904  __m64 *mDest = (__m64*)Dest;
2905  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2906  /* Duplicate C in 4 words of MM1 */
2907  int i;
2908  i = (C<<16)|C;
2909  __m64 mm1 = _m_from_int(i);
2910  __m64 mm2 = _m_from_int(i);
2911  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2912  for (i = 0; i < SrcLength/8; i++) {
2913  __m64 mm3, mm4, mm5, mm6;
2914  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2915  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2916  mm3 = _m_psrlwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the right */
2917  mm4 = _m_psrlwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the right */
2918  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2919  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2920  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2921  mSrc1++;
2922  mDest++;
2923  }
2924  _m_empty(); /* clean MMX state */
2925 #endif
2926  return (0);
2927 #else
2928  return (-1);
2929 #endif
2930 }
2931 
2943 int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
2944  unsigned char C)
2945 {
2946  unsigned int i, istart;
2947  int iC;
2948  unsigned char *cursrc1;
2949  unsigned char *curdest;
2950  int result;
2951 
2952  /* Validate input parameters */
2953  if ((Src1 == NULL) || (Dest == NULL))
2954  return(-1);
2955  if (length == 0)
2956  return(0);
2957 
2958  /* Check shift */
2959  if (N > 8) {
2960  return (-1);
2961  }
2962 
2963  /* Special case: N==0 && C==1 */
2964  if ((N == 0) && (C == 1)) {
2965  memcpy(Dest, Src1, length);
2966  return (0);
2967  }
2968 
2969  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2970 
2971  SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
2972 
2973  /* Check for unaligned bytes */
2974  if ((length & 7) > 0) {
2975  /* Setup to process unaligned bytes */
2976  istart = length & 0xfffffff8;
2977  cursrc1 = &Src1[istart];
2978  curdest = &Dest[istart];
2979  } else {
2980  /* No unaligned bytes - we are done */
2981  return (0);
2982  }
2983  } else {
2984  /* Setup to process whole image */
2985  istart = 0;
2986  cursrc1 = Src1;
2987  curdest = Dest;
2988  }
2989 
2990  /* C routine to process image */
2991  iC = (int) C;
2992  for (i = istart; i < length; i++) {
2993  result = (int) (*cursrc1 >> N) * iC;
2994  if (result > 255)
2995  result = 255;
2996  *curdest = (unsigned char) result;
2997  /* Advance pointers */
2998  cursrc1++;
2999  curdest++;
3000  }
3001 
3002  return (0);
3003 }
3004 
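/* Worked example for the shift-then-multiply semantics above (illustrative
 * only, not part of the original source): with N = 1 and C = 3, a source byte
 * of 100 becomes (100 >> 1) * 3 = 150 and a source byte of 200 becomes
 * 100 * 3 = 300, which saturates to 255. A hypothetical call:
 *
 *     SDL_imageFilterShiftRightAndMultByByte(src, dest, length, 1, 3);
 */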
3016 static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
3017  unsigned char *Mask)
3018 {
3019 #ifdef USE_MMX
3020 #if !defined(GCC__)
3021  __asm
3022  {
3023  pusha
3024  mov edx, Mask /* load Mask address into edx */
3025  movq mm0, [edx] /* load Mask into mm0 */
3026  xor ecx, ecx /* zero ECX */
3027  mov cl, N /* load loop counter (N) into CL */
3028  movd mm3, ecx /* copy (N) into MM3 */
3029  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3030 L10270: /* ** Prepare proper bit-Mask in MM1 ** */
3031  psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */
3032  pand mm1, mm0 /* apply Mask to 8 BYTES of MM1 */
3033  /* byte 0x0f, 0xdb, 0xc8 */
3034  dec cl /* decrease loop counter */
3035  jnz L10270 /* check loop termination, proceed if required */
3036  /* ** Shift all bytes of the image ** */
3037  mov eax, Src1 /* load Src1 address into eax */
3038  mov edi, Dest /* load SrcDest address into edi */
3039  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3040  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3041  align 16 /* 16 byte alignment of the loop entry */
3042 L10271:
3043  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3044  psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */
3045  pand mm0, mm1 /* apply proper bit-Mask to 8 BYTES of MM0 */
3046  /* byte 0x0f, 0xdb, 0xc1 */
3047  movq [edi], mm0 /* store result in Dest */
3048  add eax, 8 /* increase Src1 register pointer by 8 */
3049  add edi, 8 /* increase Dest register pointer by 8 */
3050  dec ecx /* decrease loop counter */
3051  jnz L10271 /* check loop termination, proceed if required */
3052  emms /* exit MMX state */
3053  popa
3054  }
3055 #else
3056  /* i386 and x86_64 */
3057  __m64 *mSrc1 = (__m64*)Src1;
3058  __m64 *mDest = (__m64*)Dest;
3059  __m64 *mMask = (__m64*)Mask;
3060  __m64 mm1;
3061  int i;
3062  mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3063  /* Prepare proper bit-Mask in MM1 */
3064  for (i = 0; i < N; i++) {
3065  mm1 = _m_psllwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the left */
3066  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
3067  }
3068  /* ** Shift all bytes of the image ** */
3069  for (i = 0; i < SrcLength/8; i++) {
3070  __m64 mm0 = _m_psllwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the left */
3071  *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
3072  mSrc1++;
3073  mDest++;
3074  }
3075  _m_empty(); /* clean MMX state */
3076 #endif
3077  return (0);
3078 #else
3079  return (-1);
3080 #endif
3081 }
3082 
3093 int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3094 {
3095  static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
3096  unsigned int i, istart;
3097  unsigned char *cursrc1, *curdest;
3098  int result;
3099 
3100  /* Validate input parameters */
3101  if ((Src1 == NULL) || (Dest == NULL))
3102  return(-1);
3103  if (length == 0)
3104  return(0);
3105 
3106  if (N > 8) {
3107  return (-1);
3108  }
3109 
3110  /* Special case: N==0 */
3111  if (N == 0) {
3112  memcpy(Dest, Src1, length);
3113  return (0);
3114  }
3115 
3116  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3117 
3118  SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
3119 
3120  /* Check for unaligned bytes */
3121  if ((length & 7) > 0) {
3122  /* Setup to process unaligned bytes */
3123  istart = length & 0xfffffff8;
3124  cursrc1 = &Src1[istart];
3125  curdest = &Dest[istart];
3126  } else {
3127  /* No unaligned bytes - we are done */
3128  return (0);
3129  }
3130  } else {
3131  /* Setup to process whole image */
3132  istart = 0;
3133  cursrc1 = Src1;
3134  curdest = Dest;
3135  }
3136 
3137  /* C routine to process image */
3138  for (i = istart; i < length; i++) {
3139  result = ((int) *cursrc1 << N) & 0xff;
3140  *curdest = (unsigned char) result;
3141  /* Advance pointers */
3142  cursrc1++;
3143  curdest++;
3144  }
3145 
3146  return (0);
3147 }
3148 
3159 static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3160 {
3161 #ifdef USE_MMX
3162 #if !defined(GCC__)
3163  __asm
3164  {
3165  pusha
3166  mov eax, Src1 /* load Src1 address into eax */
3167  mov edi, Dest /* load Dest address into edi */
3168  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3169  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3170  align 16 /* 16 byte alignment of the loop entry */
3171 L12023:
3172  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3173  pslld mm0, N /* shift 2 DWORDS of MM0 (N) bits to the left */
3174  movq [edi], mm0 /* store result in SrcDest */
3175  add eax, 8 /* increase Src1 register pointer by 8 */
3176  add edi, 8 /* increase Dest register pointer by 8 */
3177  dec ecx /* decrease loop counter */
3178  jnz L12023 /* check loop termination, proceed if required */
3179  emms /* exit MMX state */
3180  popa
3181  }
3182 #else
3183  /* i386 and x86_64 */
3184  __m64 *mSrc1 = (__m64*)Src1;
3185  __m64 *mDest = (__m64*)Dest;
3186  int i;
3187  for (i = 0; i < SrcLength/8; i++) {
3188  *mDest = _m_pslldi(*mSrc1, N); /* shift 2 DWORDS of Src1 (N) bits to the left */
3189  mSrc1++;
3190  mDest++;
3191  }
3192  _m_empty(); /* clean MMX state */
3193 #endif
3194  return (0);
3195 #else
3196  return (-1);
3197 #endif
3198 }
3199 
3210 int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3211 {
3212  unsigned int i, istart;
3213  unsigned char *cursrc1, *curdest;
3214  unsigned int *icursrc1, *icurdest;
3215  unsigned int result;
3216 
3217  /* Validate input parameters */
3218  if ((Src1 == NULL) || (Dest == NULL))
3219  return(-1);
3220  if (length == 0)
3221  return(0);
3222 
3223  if (N > 32) {
3224  return (-1);
3225  }
3226 
3227  /* Special case: N==0 */
3228  if (N == 0) {
3229  memcpy(Dest, Src1, length);
3230  return (0);
3231  }
3232 
3233  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3234 
3235  SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
3236 
3237  /* Check for unaligned bytes */
3238  if ((length & 7) > 0) {
3239  /* Setup to process unaligned bytes */
3240  istart = length & 0xfffffff8;
3241  cursrc1 = &Src1[istart];
3242  curdest = &Dest[istart];
3243  } else {
3244  /* No unaligned bytes - we are done */
3245  return (0);
3246  }
3247  } else {
3248  /* Setup to process whole image */
3249  istart = 0;
3250  cursrc1 = Src1;
3251  curdest = Dest;
3252  }
3253 
3254  /* C routine to process image */
3255  icursrc1=(unsigned int *)cursrc1;
3256  icurdest=(unsigned int *)curdest;
3257  for (i = istart; i < length; i += 4) {
3258  if ((i+4)<length) {
3259  result = ((unsigned int)*icursrc1 << N);
3260  *icurdest = result;
3261  }
3262  /* Advance pointers */
3263  icursrc1++;
3264  icurdest++;
3265  }
3266 
3267  return (0);
3268 }
3269 
3280 static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3281 {
3282 #ifdef USE_MMX
3283 #if !defined(GCC__)
3284  __asm
3285  {
3286  pusha
3287  xor eax, eax /* zero EAX */
3288  mov al, N /* load N into AL */
3289  movd mm7, eax /* copy N into MM7 */
3290  pxor mm0, mm0 /* zero MM0 register */
3291  mov eax, Src1 /* load Src1 address into eax */
3292  mov edi, Dest /* load Dest address into edi */
3293  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3294  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3295  cmp al, 7 /* if (N <= 7) execute more efficient code */
3296  jg L10281
3297  align 16 /* 16 byte alignment of the loop entry */
3298 L10280:
3299  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3300  movq mm4, mm3 /* copy MM3 into MM4 */
3301  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3302  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3303  psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3304  psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3305  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3306  movq [edi], mm3 /* store result in Dest */
3307  add eax, 8 /* increase Src1 register pointer by 8 */
3308  add edi, 8 /* increase Dest register pointer by 8 */
3309  dec ecx /* decrease loop counter */
3310  jnz L10280 /* check loop termination, proceed if required */
3311  jmp L10282
3312  align 16 /* 16 byte alignment of the loop entry */
3313 L10281:
3314  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3315  movq mm4, mm3 /* copy MM3 into MM4 */
3316  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3317  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3318  psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3319  psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3320  /* ** Take abs value of the signed words ** */
3321  movq mm5, mm3 /* copy mm3 into mm5 */
3322  movq mm6, mm4 /* copy mm4 into mm6 */
3323  psraw mm5, 15 /* fill mm5 words with word sign bit */
3324  psraw mm6, 15 /* fill mm6 words with word sign bit */
3325  pxor mm3, mm5 /* take 1's complement of only neg words */
3326  pxor mm4, mm6 /* take 1's complement of only neg words */
3327  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3328  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3329  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3330  movq [edi], mm3 /* store result in Dest */
3331  add eax, 8 /* increase Src1 register pointer by 8 */
3332  add edi, 8 /* increase Dest register pointer by 8 */
3333  dec ecx /* decrease loop counter */
3334  jnz L10281 /* check loop termination, proceed if required */
3335 L10282:
3336  emms /* exit MMX state */
3337  popa
3338  }
3339 #else
3340  /* i386 and x86_64 */
3341  __m64 *mSrc1 = (__m64*)Src1;
3342  __m64 *mDest = (__m64*)Dest;
3343  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
3344  int i;
3345  if (N <= 7) { /* if (N <= 7) execute more efficient code */
3346  for (i = 0; i < SrcLength/8; i++) {
3347  __m64 mm3, mm4;
3348  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3349  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3350  mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3351  mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3352  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3353  mSrc1++;
3354  mDest++;
3355  }
3356  } else {
3357  for (i = 0; i < SrcLength/8; i++) {
3358  __m64 mm3, mm4, mm5, mm6;
3359  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3360  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3361  mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3362  mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3363  /* Take abs value of the signed words */
3364  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3365  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3366  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
3367  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
3368  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3369  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3370  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3371  mSrc1++;
3372  mDest++;
3373  }
3374  }
3375  _m_empty(); /* clean MMX state */
3376 #endif
3377  return (0);
3378 #else
3379  return (-1);
3380 #endif
3381 }
3382 
3393 int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3394 {
3395  unsigned int i, istart;
3396  unsigned char *cursrc1, *curdest;
3397  int result;
3398 
3399  /* Validate input parameters */
3400  if ((Src1 == NULL) || (Dest == NULL))
3401  return(-1);
3402  if (length == 0)
3403  return(0);
3404 
3405  if (N > 8) {
3406  return (-1);
3407  }
3408 
3409  /* Special case: N==0 */
3410  if (N == 0) {
3411  memcpy(Dest, Src1, length);
3412  return (0);
3413  }
3414 
3415  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3416 
3417  SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
3418 
3419  /* Check for unaligned bytes */
3420  if ((length & 7) > 0) {
3421  /* Setup to process unaligned bytes */
3422  istart = length & 0xfffffff8;
3423  cursrc1 = &Src1[istart];
3424  curdest = &Dest[istart];
3425  } else {
3426  /* No unaligned bytes - we are done */
3427  return (0);
3428  }
3429  } else {
3430  /* Setup to process whole image */
3431  istart = 0;
3432  cursrc1 = Src1;
3433  curdest = Dest;
3434  }
3435 
3436  /* C routine to process image */
3437  for (i = istart; i < length; i++) {
3438  result = (int) *cursrc1 << N;
3439  if (result > 255)
3440  result = 255;
3441  *curdest = (unsigned char) result;
3442  /* Advance pointers */
3443  cursrc1++;
3444  curdest++;
3445  }
3446 
3447  return (0);
3448 }
3449 
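/* Note on semantics (illustrative comparison, not part of the original source):
 * SDL_imageFilterShiftLeftByte above masks each result to 8 bits, so bits
 * shifted out are simply lost, while SDL_imageFilterShiftLeft saturates at 255.
 * For a hypothetical source byte 0x90 (144) and N = 1:
 *
 *     // SDL_imageFilterShiftLeftByte: (0x90 << 1) & 0xff = 0x20 (32)
 *     // SDL_imageFilterShiftLeft:     min(144 * 2, 255)  = 255
 */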
3460 static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
3461 {
3462 #ifdef USE_MMX
3463 #if !defined(GCC__)
3464  __asm
3465  {
3466  pusha
3467  /* ** Duplicate T in 8 bytes of MM3 ** */
3468  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3469  pcmpeqb mm2, mm2 /* generate all 1's in mm2 */
3470  mov al, T /* load T into AL */
3471  mov ah, al /* copy AL into AH */
3472  mov bx, ax /* copy AX into BX */
3473  shl eax, 16 /* shift 2 bytes of EAX left */
3474  mov ax, bx /* copy BX into AX */
3475  movd mm3, eax /* copy EAX into MM3 */
3476  movd mm4, eax /* copy EAX into MM4 */
3477  punpckldq mm3, mm4 /* fill higher bytes of MM3 with T */
3478  psubusb mm2, mm3 /* store 0xFF - T in MM2 */
3479  mov eax, Src1 /* load Src1 address into eax */
3480  mov edi, Dest /* load Dest address into edi */
3481  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3482  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3483  align 16 /* 16 byte alignment of the loop entry */
3484 L1029:
3485  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3486  paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
3487  pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */
3488  movq [edi], mm0 /* store result in SrcDest */
3489  add eax, 8 /* increase Src1 register pointer by 8 */
3490  add edi, 8 /* increase Dest register pointer by 8 */
3491  dec ecx /* decrease loop counter */
3492  jnz L1029 /* check loop termination, proceed if required */
3493  emms /* exit MMX state */
3494  popa
3495  }
3496 #else
3497  /* i386 and x86_64 */
3498  __m64 *mSrc1 = (__m64*)Src1;
3499  __m64 *mDest = (__m64*)Dest;
3500  /* Duplicate T in 8 bytes of MM3 */
3501  __m64 mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3502  __m64 mm2 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm2 */
3503  int i;
3504  memset(&i, T, 4);
3505  __m64 mm3 = _m_from_int(i);
3506  __m64 mm4 = _m_from_int(i);
3507  mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with T */
3508  mm2 = _m_psubusb(mm2, mm3); /* store 0xFF - T in MM2 */
3509  //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3510  for (i = 0; i < SrcLength/8; i++) {
3511  __m64 mm0 = _m_paddusb(*mSrc1, mm2); /* Src1+(0xFF-T) (add 8 bytes with saturation) */
3512  *mDest = _m_pcmpeqb(mm0, mm1); /* binarize 255:0, comparing to 255 */
3513  mSrc1++;
3514  mDest++;
3515  }
3516  _m_empty(); /* clean MMX state */
3517 #endif
3518  return (0);
3519 #else
3520  return (-1);
3521 #endif
3522 }
3523 
3534 int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
3535 {
3536  unsigned int i, istart;
3537  unsigned char *cursrc1;
3538  unsigned char *curdest;
3539 
3540  /* Validate input parameters */
3541  if ((Src1 == NULL) || (Dest == NULL))
3542  return(-1);
3543  if (length == 0)
3544  return(0);
3545 
3546  /* Special case: T==0 */
3547  if (T == 0) {
3548  memset(Dest, 255, length);
3549  return (0);
3550  }
3551 
3552  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3553 
3554  SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
3555 
3556  /* Check for unaligned bytes */
3557  if ((length & 7) > 0) {
3558  /* Setup to process unaligned bytes */
3559  istart = length & 0xfffffff8;
3560  cursrc1 = &Src1[istart];
3561  curdest = &Dest[istart];
3562  } else {
3563  /* No unaligned bytes - we are done */
3564  return (0);
3565  }
3566  } else {
3567  /* Setup to process whole image */
3568  istart = 0;
3569  cursrc1 = Src1;
3570  curdest = Dest;
3571  }
3572 
3573  /* C routine to process image */
3574  for (i = istart; i < length; i++) {
3575  *curdest = (unsigned char)(((unsigned char)*cursrc1 >= T) ? 255 : 0);
3576  /* Advance pointers */
3577  cursrc1++;
3578  curdest++;
3579  }
3580 
3581  return (0);
3582 }
3583 
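/* Usage sketch (illustrative only, not part of the original source; buffer
 * contents are hypothetical): build a 255/0 mask with threshold 128, where
 * values >= 128 map to 255 and values below map to 0:
 *
 *     unsigned char gray[8] = { 5, 127, 128, 200, 0, 255, 64, 130 };
 *     unsigned char mask[8];
 *     SDL_imageFilterBinarizeUsingThreshold(gray, mask, 8, 128);
 *     // mask = { 0, 0, 255, 255, 0, 255, 0, 255 }
 */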
3595 static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
3596  unsigned char Tmax)
3597 {
3598 #ifdef USE_MMX
3599 #if !defined(GCC__)
3600  __asm
3601  {
3602  pusha
3603  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3604  /* ** Duplicate Tmax in 8 bytes of MM3 ** */
3605  mov al, Tmax /* load Tmax into AL */
3606  mov ah, al /* copy AL into AH */
3607  mov bx, ax /* copy AX into BX */
3608  shl eax, 16 /* shift 2 bytes of EAX left */
3609  mov ax, bx /* copy BX into AX */
3610  movd mm3, eax /* copy EAX into MM3 */
3611  movd mm4, eax /* copy EAX into MM4 */
3612  punpckldq mm3, mm4 /* fill higher bytes of MM3 with Tmax */
3613  psubusb mm1, mm3 /* store 0xFF - Tmax in MM1 */
3614  /* ** Duplicate Tmin in 8 bytes of MM5 ** */
3615  mov al, Tmin /* load Tmin into AL */
3616  mov ah, al /* copy AL into AH */
3617  mov bx, ax /* copy AX into BX */
3618  shl eax, 16 /* shift 2 bytes of EAX left */
3619  mov ax, bx /* copy BX into AX */
3620  movd mm5, eax /* copy EAX into MM5 */
3621  movd mm4, eax /* copy EAX into MM4 */
3622  punpckldq mm5, mm4 /* fill higher bytes of MM5 with Tmin */
3623  movq mm7, mm5 /* copy MM5 into MM7 */
3624  paddusb mm7, mm1 /* store 0xFF - Tmax + Tmin in MM7 */
3625  mov eax, Src1 /* load Src1 address into eax */
3626  mov edi, Dest /* load Dest address into edi */
3627  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3628  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3629  align 16 /* 16 byte alignment of the loop entry */
3630 L1030:
3631  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3632  paddusb mm0, mm1 /* MM0=SrcDest+(0xFF-Tmax) */
3633  psubusb mm0, mm7 /* MM0=MM0-(0xFF-Tmax+Tmin) */
3634  paddusb mm0, mm5 /* MM0=MM0+Tmin */
3635  movq [edi], mm0 /* store result in Dest */
3636  add eax, 8 /* increase Src1 register pointer by 8 */
3637  add edi, 8 /* increase Dest register pointer by 8 */
3638  dec ecx /* decrease loop counter */
3639  jnz L1030 /* check loop termination, proceed if required */
3640  emms /* exit MMX state */
3641  popa
3642  }
3643 #else
3644  /* i386 and x86_64 */
3645  __m64 *mSrc1 = (__m64*)Src1;
3646  __m64 *mDest = (__m64*)Dest;
3647  __m64 mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3648  int i;
3649  /* Duplicate Tmax in 8 bytes of MM3 */
3650  __m64 mm3, mm4;
3651  memset(&i, Tmax, 4);
3652  mm3 = _m_from_int(i);
3653  mm4 = _m_from_int(i);
3654  mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with Tmax */
3655  mm1 = _m_psubusb(mm1, mm3); /* store 0xFF - Tmax in MM1 */
3656  //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3657  /* Duplicate Tmin in 8 bytes of MM5 */
3658  __m64 mm5, mm7;
3659  memset(&i, Tmin, 4);
3660  mm5 = _m_from_int(i);
3661  mm4 = _m_from_int(i);
3662  mm5 = _m_punpckldq(mm5, mm4); /* fill higher bytes of MM5 with Tmin */
3663  mm7 = _m_paddusb(mm5, mm1); /* store 0xFF - Tmax + Tmin in MM7 */
3664  for (i = 0; i < SrcLength/8; i++) {
3665  __m64 mm0;
3666  mm0 = _m_paddusb(*mSrc1, mm1); /* MM0=Src1+(0xFF-Tmax) */
3667  mm0 = _m_psubusb(mm0, mm7); /* MM0=MM0-(0xFF-Tmax+Tmin) */
3668  *mDest = _m_paddusb(mm0, mm5); /* MM0+Tmin */
3669  mSrc1++;
3670  mDest++;
3671  }
3672  _m_empty(); /* clean MMX state */
3673 #endif
3674  return (0);
3675 #else
3676  return (-1);
3677 #endif
3678 }
3679 
3691 int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
3692  unsigned char Tmax)
3693 {
3694  unsigned int i, istart;
3695  unsigned char *cursrc1;
3696  unsigned char *curdest;
3697 
3698  /* Validate input parameters */
3699  if ((Src1 == NULL) || (Dest == NULL))
3700  return(-1);
3701  if (length == 0)
3702  return(0);
3703 
3704  /* Special case: Tmin==0 && Tmax==255 */
3705  if ((Tmin == 0) && (Tmax == 255)) {
3706  memcpy(Dest, Src1, length);
3707  return (0);
3708  }
3709 
3710  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3711 
3712  SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
3713 
3714  /* Check for unaligned bytes */
3715  if ((length & 7) > 0) {
3716  /* Setup to process unaligned bytes */
3717  istart = length & 0xfffffff8;
3718  cursrc1 = &Src1[istart];
3719  curdest = &Dest[istart];
3720  } else {
3721  /* No unaligned bytes - we are done */
3722  return (0);
3723  }
3724  } else {
3725  /* Setup to process whole image */
3726  istart = 0;
3727  cursrc1 = Src1;
3728  curdest = Dest;
3729  }
3730 
3731  /* C routine to process image */
3732  for (i = istart; i < length; i++) {
3733  if (*cursrc1 < Tmin) {
3734  *curdest = Tmin;
3735  } else if (*cursrc1 > Tmax) {
3736  *curdest = Tmax;
3737  } else {
3738  *curdest = *cursrc1;
3739  }
3740  /* Advance pointers */
3741  cursrc1++;
3742  curdest++;
3743  }
3744 
3745  return (0);
3746 }
3747 
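/* Usage sketch (illustrative only, not part of the original source): clamp
 * every byte into the range [16, 240], e.g. for studio-swing video levels;
 * the buffer names are hypothetical:
 *
 *     SDL_imageFilterClipToRange(src, dest, length, 16, 240);
 *     // 5 -> 16, 100 -> 100, 250 -> 240
 */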
3761 static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
3762  int Nmin, int Nmax)
3763 {
3764 #ifdef USE_MMX
3765 #if !defined(GCC__)
3766  __asm
3767  {
3768  pusha
3769  mov ax, WORD PTR Nmax /* load Nmax in AX */
3770  mov bx, WORD PTR Cmax /* load Cmax in BX */
3771  sub ax, WORD PTR Nmin /* AX = Nmax - Nmin */
3772  sub bx, WORD PTR Cmin /* BX = Cmax - Cmin */
3773  jz L10311 /* check division by zero */
3774  xor dx, dx /* prepare for division, zero DX */
3775  div bx /* AX = AX/BX */
3776  jmp L10312
3777 L10311:
3778  mov ax, 255 /* if div by zero, assume result max byte value */
3779 L10312: /* ** Duplicate AX in 4 words of MM0 ** */
3780  mov bx, ax /* copy AX into BX */
3781  shl eax, 16 /* shift 2 bytes of EAX left */
3782  mov ax, bx /* copy BX into AX */
3783  movd mm0, eax /* copy EAX into MM0 */
3784  movd mm1, eax /* copy EAX into MM1 */
3785  punpckldq mm0, mm1 /* fill higher words of MM0 with AX */
3786  /* ** Duplicate Cmin in 4 words of MM1 ** */
3787  mov ax, WORD PTR Cmin /* load Cmin into AX */
3788  mov bx, ax /* copy AX into BX */
3789  shl eax, 16 /* shift 2 bytes of EAX left */
3790  mov ax, bx /* copy BX into AX */
3791  movd mm1, eax /* copy EAX into MM1 */
3792  movd mm2, eax /* copy EAX into MM2 */
3793  punpckldq mm1, mm2 /* fill higher words of MM1 with Cmin */
3794  /* ** Duplicate Nmin in 4 words of MM2 ** */
3795  mov ax, WORD PTR Nmin /* load Nmin into AX */
3796  mov bx, ax /* copy AX into BX */
3797  shl eax, 16 /* shift 2 bytes of EAX left */
3798  mov ax, bx /* copy BX into AX */
3799  movd mm2, eax /* copy EAX into MM2 */
3800  movd mm3, eax /* copy EAX into MM3 */
3801  punpckldq mm2, mm3 /* fill higher words of MM2 with Nmin */
3802  pxor mm7, mm7 /* zero MM7 register */
3803  mov eax, Src1 /* load Src1 address into eax */
3804  mov edi, Dest /* load Dest address into edi */
3805  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3806  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3807  align 16 /* 16 byte alignment of the loop entry */
3808 L1031:
3809  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3810  movq mm4, mm3 /* copy MM3 into MM4 */
3811  punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */
3812  punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */
3813  psubusb mm3, mm1 /* S-Cmin, low bytes */
3814  psubusb mm4, mm1 /* S-Cmin, high bytes */
3815  pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */
3816  pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */
3817  paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */
3818  paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */
3819  /* ** Take abs value of the signed words ** */
3820  movq mm5, mm3 /* copy mm3 into mm5 */
3821  movq mm6, mm4 /* copy mm4 into mm6 */
3822  psraw mm5, 15 /* fill mm5 words with word sign bit */
3823  psraw mm6, 15 /* fill mm6 words with word sign bit */
3824  pxor mm3, mm5 /* take 1's complement of only neg words */
3825  pxor mm4, mm6 /* take 1's complement of only neg words */
3826  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3827  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3828  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3829  movq [edi], mm3 /* store result in Dest */
3830  add eax, 8 /* increase Src1 register pointer by 8 */
3831  add edi, 8 /* increase Dest register pointer by 8 */
3832  dec ecx /* decrease loop counter */
3833  jnz L1031 /* check loop termination, proceed if required */
3834  emms /* exit MMX state */
3835  popa
3836  }
3837 #else
3838  /* i386 and x86_64 */
3839  __m64 *mSrc1 = (__m64*)Src1;
3840  __m64 *mDest = (__m64*)Dest;
3841  __m64 mm0, mm1, mm2, mm3;
3842 
3843  int i;
3844  /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
3845  unsigned short a = Nmax - Nmin;
3846  unsigned short b = Cmax - Cmin;
3847  if (b == 0) {
3848  a = 255;
3849  } else {
3850  a /= b;
3851  }
3852  i = (a<<16)|a;
3853  mm0 = _m_from_int(i);
3854  mm1 = _m_from_int(i);
3855  mm0 = _m_punpckldq(mm0, mm1); /* fill higher words of MM0 with AX */
3856  /* Duplicate Cmin in 4 words of MM1 */
3857  i = (Cmin<<16)|(short)Cmin;
3858  mm1 = _m_from_int(i);
3859  mm2 = _m_from_int(i);
3860  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with Cmin */
3861  /* Duplicate Nmin in 4 words of MM2 */
3862  i = (Nmin<<16)|(short)Nmin;
3863  mm2 = _m_from_int(i);
3864  mm3 = _m_from_int(i);
3865  mm2 = _m_punpckldq(mm2, mm3); /* fill higher words of MM2 with Nmin */
3866  __m64 mm7 = _m_from_int(0); /* zero MM7 register */
3867  for (i = 0; i < SrcLength/8; i++) {
3868  __m64 mm3, mm4, mm5, mm6;
3869  mm3 = _m_punpcklbw(*mSrc1, mm7); /* unpack low bytes of Src1 into words */
3870  mm4 = _m_punpckhbw(*mSrc1, mm7); /* unpack high bytes of Src1 into words */
3871  mm3 = _m_psubusb(mm3, mm1); /* S-Cmin, low bytes */
3872  mm4 = _m_psubusb(mm4, mm1); /* S-Cmin, high bytes */
3873  mm3 = _m_pmullw(mm3, mm0); /* MM0*(S-Cmin), low bytes */
3874  mm4 = _m_pmullw(mm4, mm0); /* MM0*(S-Cmin), high bytes */
3875  mm3 = _m_paddusb(mm3, mm2); /* MM0*(S-Cmin)+Nmin, low bytes */
3876  mm4 = _m_paddusb(mm4, mm2); /* MM0*(S-Cmin)+Nmin, high bytes */
3877  /* Take abs value of the signed words */
3878  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3879  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3880  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
3881  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
3882  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3883  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3884  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3885  mSrc1++;
3886  mDest++;
3887  }
3888  _m_empty(); /* clean MMX state */
3889 #endif
3890  return (0);
3891 #else
3892  return (-1);
3893 #endif
3894 }
3895 
3909 int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
3910  int Nmax)
3911 {
3912  unsigned int i, istart;
3913  unsigned char *cursrc;
3914  unsigned char *curdest;
3915  int dN, dC, factor;
3916  int result;
3917 
3918  /* Validate input parameters */
3919  if ((Src == NULL) || (Dest == NULL))
3920  return(-1);
3921  if (length == 0)
3922  return(0);
3923 
3924  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3925 
3926  SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);
3927 
3928  /* Check for unaligned bytes */
3929  if ((length & 7) > 0) {
3930  /* Setup to process unaligned bytes */
3931  istart = length & 0xfffffff8;
3932  cursrc = &Src[istart];
3933  curdest = &Dest[istart];
3934  } else {
3935  /* No unaligned bytes - we are done */
3936  return (0);
3937  }
3938  } else {
3939  /* Setup to process whole image */
3940  istart = 0;
3941  cursrc = Src;
3942  curdest = Dest;
3943  }
3944 
3945  /* C routine to process image */
3946  dC = Cmax - Cmin;
3947  if (dC == 0)
3948  return (0);
3949  dN = Nmax - Nmin;
3950  factor = dN / dC;
3951  for (i = istart; i < length; i++) {
3952  result = factor * ((int) (*cursrc) - Cmin) + Nmin;
3953  if (result > 255)
3954  result = 255;
3955  *curdest = (unsigned char) result;
3956  /* Advance pointers */
3957  cursrc++;
3958  curdest++;
3959  }
3960 
3961  return (0);
3962 }
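
A hedged usage sketch for the wrapper above: stretching a grayscale buffer whose measured range is 32..200 toward 0..255 (the buffer names and the measured range are illustrative, not from the library):

    /* Illustrative only: stretch an 8-bit buffer toward the full 0..255 range.
       Returns 0 on success, -1 if Src/Dest is NULL (per the checks above).
       Note that the scale factor is the integer quotient (Nmax-Nmin)/(Cmax-Cmin),
       so it truncates; here 255/168 == 1. */
    static int stretch_contrast(unsigned char *pixels, unsigned char *out,
                                unsigned int n)
    {
        return SDL_imageFilterNormalizeLinear(pixels, out, n, 32, 200, 0, 255);
    }
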
3963 
3964 /* ------------------------------------------------------------------------------------ */
3965 
3980 int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
3981  signed short *Kernel, unsigned char Divisor)
3982 {
3983  /* Validate input parameters */
3984  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
3985  return(-1);
3986 
3987  if ((columns < 3) || (rows < 3) || (Divisor == 0))
3988  return (-1);
3989 
3990  if ((SDL_imageFilterMMXdetect())) {
3991 //#ifdef USE_MMX
3992 #if defined(USE_MMX) && defined(i386)
3993 #if !defined(GCC__)
3994  __asm
3995  {
3996  pusha
3997  pxor mm0, mm0 /* zero MM0 */
3998  xor ebx, ebx /* zero EBX */
3999  mov bl, Divisor /* load Divisor into BL */
4000  mov edx, Kernel /* load Kernel address into EDX */
4001  movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
4002  add edx, 8 /* second row |K0 K1 K2 0| */
4003  movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4004  add edx, 8 /* third row |K6 K7 K8 0| */
4005  movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
4006  /* ---, */
4007  mov eax, columns /* load columns into EAX */
4008  mov esi, Src /* ESI = Src row 0 address */
4009  mov edi, Dest /* load Dest address to EDI */
4010  add edi, eax /* EDI = EDI + columns */
4011  inc edi /* 1 byte offset from the left edge */
4012  mov edx, rows /* initialize ROWS counter */
4013  sub edx, 2 /* do not use first and last row */
4014  /* ---, */
4015 L10320:
4016  mov ecx, eax /* initialize COLUMNS counter */
4017  sub ecx, 2 /* do not use first and last column */
4018  align 16 /* 16 byte alignment of the loop entry */
4019 L10322:
4020  /* ---, */
4021  movq mm1, [esi] /* load 8 bytes of the image first row */
4022  add esi, eax /* move one row below */
4023  movq mm2, [esi] /* load 8 bytes of the image second row */
4024  add esi, eax /* move one row below */
4025  movq mm3, [esi] /* load 8 bytes of the image third row */
4026  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4027  punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
4028  punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
4029  pmullw mm1, mm5 /* multiply words first row image*Kernel */
4030  pmullw mm2, mm6 /* multiply words second row image*Kernel */
4031  pmullw mm3, mm7 /* multiply words third row image*Kernel */
4032  paddsw mm1, mm2 /* add 4 words of the first and second rows */
4033  paddsw mm1, mm3 /* add 4 words of the third row and result */
4034  movq mm2, mm1 /* copy MM1 into MM2 */
4035  psrlq mm1, 32 /* shift 2 left words to the right */
4036  paddsw mm1, mm2 /* add 2 left and 2 right result words */
4037  movq mm3, mm1 /* copy MM1 into MM3 */
4038  psrlq mm1, 16 /* shift 1 left word to the right */
4039  paddsw mm1, mm3 /* add 1 left and 1 right result words */
4040  /* --, */
4041  movd mm2, eax /* save EAX in MM2 */
4042  movd mm3, edx /* save EDX in MM3 */
4043  movd eax, mm1 /* copy MM1 into EAX */
4044  psraw mm1, 15 /* spread sign bit of the result */
4045  movd edx, mm1 /* fill EDX with a sign bit */
4046  idiv bx /* IDIV - VERY EXPENSIVE */
4047  movd mm1, eax /* move result of division into MM1 */
4048  packuswb mm1, mm0 /* pack division result with saturation */
4049  movd eax, mm1 /* copy saturated result into EAX */
4050  mov [edi], al /* copy a byte result into Dest */
4051  movd edx, mm3 /* restore saved EDX */
4052  movd eax, mm2 /* restore saved EAX */
4053  /* --, */
4054  sub esi, eax /* move two rows up */
4055  sub esi, eax /* */
4056  inc esi /* move Src pointer to the next pixel */
4057  inc edi /* move Dest pointer to the next pixel */
4058  /* ---, */
4059  dec ecx /* decrease loop counter COLUMNS */
4060  jnz L10322 /* check loop termination, proceed if required */
4061  add esi, 2 /* move to the next row in Src */
4062  add edi, 2 /* move to the next row in Dest */
4063  dec edx /* decrease loop counter ROWS */
4064  jnz L10320 /* check loop termination, proceed if required */
4065  /* ---, */
4066  emms /* exit MMX state */
4067  popa
4068  }
4069 #else
4070  asm volatile
4071  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4072  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4073  "mov %5, %%bl \n\t" /* load Divisor into BL */
4074  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4075  "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
4076  "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
4077  "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4078  "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
4079  "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
4080  /* --- */
4081  "mov %3, %%eax \n\t" /* load columns into EAX */
4082  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
4083  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4084  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
4085  "inc %%edi \n\t" /* 1 byte offset from the left edge */
4086  "mov %2, %%edx \n\t" /* initialize ROWS counter */
4087  "sub $2, %%edx \n\t" /* do not use first and last row */
4088  /* --- */
4089  ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
4090  "sub $2, %%ecx \n\t" /* do not use first and last column */
4091  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4092  ".L10322: \n\t"
4093  /* --- */
4094  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
4095  "add %%eax, %%esi \n\t" /* move one row below */
4096  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
4097  "add %%eax, %%esi \n\t" /* move one row below */
4098  "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
4099  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4100  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
4101  "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
4102  "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
4103  "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
4104  "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
4105  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
4106  "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
4107  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4108  "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
4109  "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
4110  "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
4111  "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
4112  "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
4113  /* -- */
4114  "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */
4115  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4116  "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */
4117  "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */
4118  "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */
4119  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4120  "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */
4121  "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */
4122  "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */
4123  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4124  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4125  "movd %%mm2, %%eax \n\t" /* restore saved EAX */
4126  /* -- */
4127  "sub %%eax, %%esi \n\t" /* move two rows up */
4128  "sub %%eax, %%esi \n\t" /* */
4129  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4130  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4131  /* --- */
4132  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4133  "jnz .L10322 \n\t" /* check loop termination, proceed if required */
4134  "add $2, %%esi \n\t" /* move to the next row in Src */
4135  "add $2, %%edi \n\t" /* move to the next row in Dest */
4136  "dec %%edx \n\t" /* decrease loop counter ROWS */
4137  "jnz .L10320 \n\t" /* check loop termination, proceed if required */
4138  /* --- */
4139  "emms \n\t" /* exit MMX state */
4140  "popa \n\t":"=m" (Dest) /* %0 */
4141  :"m"(Src), /* %1 */
4142  "m"(rows), /* %2 */
4143  "m"(columns), /* %3 */
4144  "m"(Kernel), /* %4 */
4145  "m"(Divisor) /* %5 */
4146  );
4147 #endif
4148 #endif
4149  return (0);
4150  } else {
4151  /* No non-MMX implementation yet */
4152  return (-1);
4153  }
4154 }
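
The register comments above (MM5 = {0,K2,K1,K0}, each kernel row advanced by 8 bytes) indicate that a 3x3 kernel row occupies four int16 values, the fourth being zero padding. A hedged usage sketch under that reading, applying a 3x3 box blur with Divisor = 9 (buffer and size names are illustrative):

    /* Hedged sketch: 3x3 box blur via the MMX convolution above.
       Kernel layout inferred from the assembly: 3 rows x 4 int16, the last
       entry of each row is zero padding.  Border pixels are left untouched. */
    static int box_blur_3x3(unsigned char *img, unsigned char *out,
                            int rows, int columns)
    {
        signed short kernel[3 * 4] = {
            1, 1, 1, 0,
            1, 1, 1, 0,
            1, 1, 1, 0
        };
        return SDL_imageFilterConvolveKernel3x3Divide(img, out, rows, columns,
                                                      kernel, 9);
    }
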
4155 
4170 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4171  signed short *Kernel, unsigned char Divisor)
4172 {
4173  /* Validate input parameters */
4174  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4175  return(-1);
4176 
4177  if ((columns < 5) || (rows < 5) || (Divisor == 0))
4178  return (-1);
4179 
4180  if ((SDL_imageFilterMMXdetect())) {
4181 //#ifdef USE_MMX
4182 #if defined(USE_MMX) && defined(i386)
4183 #if !defined(GCC__)
4184  __asm
4185  {
4186  pusha
4187  pxor mm0, mm0 /* zero MM0 */
4188  xor ebx, ebx /* zero EBX */
4189  mov bl, Divisor /* load Divisor into BL */
4190  movd mm5, ebx /* copy Divisor into MM5 */
4191  mov edx, Kernel /* load Kernel address into EDX */
4192  mov esi, Src /* load Src address to ESI */
4193  mov edi, Dest /* load Dest address to EDI */
4194  add edi, 2 /* 2 column offset from the left edge */
4195  mov eax, columns /* load columns into EAX */
4196  shl eax, 1 /* EAX = columns * 2 */
4197  add edi, eax /* 2 row offset from the top edge */
4198  shr eax, 1 /* EAX = columns */
4199  mov ebx, rows /* initialize ROWS counter */
4200  sub ebx, 4 /* do not use first 2 and last 2 rows */
4201  /* ---, */
4202 L10330:
4203  mov ecx, eax /* initialize COLUMNS counter */
4204  sub ecx, 4 /* do not use first 2 and last 2 columns */
4205  align 16 /* 16 byte alignment of the loop entry */
4206 L10332:
4207  pxor mm7, mm7 /* zero MM7 (accumulator) */
4208  movd mm6, esi /* save ESI in MM6 */
4209  /* --- 1 */
4210  movq mm1, [esi] /* load 8 bytes of the Src */
4211  movq mm2, mm1 /* copy MM1 into MM2 */
4212  add esi, eax /* move Src pointer 1 row below */
4213  movq mm3, [edx] /* load 4 words of Kernel */
4214  add edx, 8 /* move pointer to other 4 words */
4215  movq mm4, [edx] /* load 4 words of Kernel */
4216  add edx, 8 /* move pointer to other 4 words */
4217  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4218  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4219  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4220  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4221  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4222  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4223  /* --- 2 */
4224  movq mm1, [esi] /* load 8 bytes of the Src */
4225  movq mm2, mm1 /* copy MM1 into MM2 */
4226  add esi, eax /* move Src pointer 1 row below */
4227  movq mm3, [edx] /* load 4 words of Kernel */
4228  add edx, 8 /* move pointer to other 4 words */
4229  movq mm4, [edx] /* load 4 words of Kernel */
4230  add edx, 8 /* move pointer to other 4 words */
4231  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4232  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4233  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4234  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4235  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4236  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4237  /* --- 3 */
4238  movq mm1, [esi] /* load 8 bytes of the Src */
4239  movq mm2, mm1 /* copy MM1 into MM2 */
4240  add esi, eax /* move Src pointer 1 row below */
4241  movq mm3, [edx] /* load 4 words of Kernel */
4242  add edx, 8 /* move pointer to other 4 words */
4243  movq mm4, [edx] /* load 4 words of Kernel */
4244  add edx, 8 /* move pointer to other 4 words */
4245  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4246  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4247  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4248  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4249  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4250  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4251  /* --- 4 */
4252  movq mm1, [esi] /* load 8 bytes of the Src */
4253  movq mm2, mm1 /* copy MM1 into MM2 */
4254  add esi, eax /* move Src pointer 1 row below */
4255  movq mm3, [edx] /* load 4 words of Kernel */
4256  add edx, 8 /* move pointer to other 4 words */
4257  movq mm4, [edx] /* load 4 words of Kernel */
4258  add edx, 8 /* move pointer to other 4 words */
4259  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4260  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4261  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4262  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4263  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4264  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4265  /* --- 5 */
4266  movq mm1, [esi] /* load 8 bytes of the Src */
4267  movq mm2, mm1 /* copy MM1 into MM2 */
4268  movq mm3, [edx] /* load 4 words of Kernel */
4269  add edx, 8 /* move pointer to other 4 words */
4270  movq mm4, [edx] /* load 4 words of Kernel */
4271  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4272  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4273  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4274  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4275  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4276  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4277  /* ---, */
4278  movq mm3, mm7 /* copy MM7 into MM3 */
4279  psrlq mm7, 32 /* shift 2 left words to the right */
4280  paddsw mm7, mm3 /* add 2 left and 2 right result words */
4281  movq mm2, mm7 /* copy MM7 into MM2 */
4282  psrlq mm7, 16 /* shift 1 left word to the right */
4283  paddsw mm7, mm2 /* add 1 left and 1 right result words */
4284  /* ---, */
4285  movd mm1, eax /* save EAX in MM1 */
4286  movd mm2, ebx /* save EBX in MM2 */
4287  movd mm3, edx /* save EDX in MM3 */
4288  movd eax, mm7 /* load summation result into EAX */
4289  psraw mm7, 15 /* spread sign bit of the result */
4290  movd ebx, mm5 /* load Divisor into EBX */
4291  movd edx, mm7 /* fill EDX with a sign bit */
4292  idiv bx /* IDIV - VERY EXPENSIVE */
4293  movd mm7, eax /* move result of division into MM7 */
4294  packuswb mm7, mm0 /* pack division result with saturation */
4295  movd eax, mm7 /* copy saturated result into EAX */
4296  mov [edi], al /* copy a byte result into Dest */
4297  movd edx, mm3 /* restore saved EDX */
4298  movd ebx, mm2 /* restore saved EBX */
4299  movd eax, mm1 /* restore saved EAX */
4300  /* --, */
4301  movd esi, mm6 /* move Src pointer to the top pixel */
4302  sub edx, 72 /* EDX = Kernel address */
4303  inc esi /* move Src pointer to the next pixel */
4304  inc edi /* move Dest pointer to the next pixel */
4305  /* ---, */
4306  dec ecx /* decrease loop counter COLUMNS */
4307  jnz L10332 /* check loop termination, proceed if required */
4308  add esi, 4 /* move to the next row in Src */
4309  add edi, 4 /* move to the next row in Dest */
4310  dec ebx /* decrease loop counter ROWS */
4311  jnz L10330 /* check loop termination, proceed if required */
4312  /* ---, */
4313  emms /* exit MMX state */
4314  popa
4315  }
4316 #else
4317  asm volatile
4318  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4319  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4320  "mov %5, %%bl \n\t" /* load Divisor into BL */
4321  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4322  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4323  "mov %1, %%esi \n\t" /* load Src address to ESI */
4324  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4325  "add $2, %%edi \n\t" /* 2 column offset from the left edge */
4326  "mov %3, %%eax \n\t" /* load columns into EAX */
4327  "shl $1, %%eax \n\t" /* EAX = columns * 2 */
4328  "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
4329  "shr $1, %%eax \n\t" /* EAX = columns */
4330  "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4331  "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
4332  /* --- */
4333  ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4334  "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
4335  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4336  ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4337  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4338  /* --- 1 */
4339  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4340  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4341  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4342  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4343  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4344  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4345  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4346  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4347  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4348  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4349  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4350  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4351  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4352  /* --- 2 */
4353  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4354  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4355  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4356  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4357  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4358  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4359  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4360  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4361  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4362  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4363  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4364  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4365  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4366  /* --- 3 */
4367  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4368  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4369  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4370  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4371  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4372  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4373  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4374  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4375  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4376  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4377  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4378  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4379  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4380  /* --- 4 */
4381  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4382  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4383  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4384  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4385  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4386  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4387  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4388  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4389  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4390  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4391  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4392  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4393  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4394  /* --- 5 */
4395  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4396  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4397  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4398  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4399  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4400  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4401  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4402  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4403  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4404  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4405  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4406  /* --- */
4407  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4408  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4409  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4410  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4411  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4412  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4413  /* --- */
4414  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4415  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4416  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4417  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4418  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4419  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4420  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4421  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4422  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4423  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4424  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4425  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4426  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4427  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4428  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4429  /* -- */
4430  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4431  "sub $72, %%edx \n\t" /* EDX = Kernel address */
4432  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4433  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4434  /* --- */
4435  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4436  "jnz .L10332 \n\t" /* check loop termination, proceed if required */
4437  "add $4, %%esi \n\t" /* move to the next row in Src */
4438  "add $4, %%edi \n\t" /* move to the next row in Dest */
4439  "dec %%ebx \n\t" /* decrease loop counter ROWS */
4440  "jnz .L10330 \n\t" /* check loop termination, proceed if required */
4441  /* --- */
4442  "emms \n\t" /* exit MMX state */
4443  "popa \n\t":"=m" (Dest) /* %0 */
4444  :"m"(Src), /* %1 */
4445  "m"(rows), /* %2 */
4446  "m"(columns), /* %3 */
4447  "m"(Kernel), /* %4 */
4448  "m"(Divisor) /* %5 */
4449  );
4450 #endif
4451 #endif
4452  return (0);
4453  } else {
4454  /* No non-MMX implementation yet */
4455  return (-1);
4456  }
4457 }
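
Each kernel row above is consumed as two movq loads (8 int16), and the 72-byte rewind (five 16-byte rows minus the final un-advanced load) implies a 5x5 kernel is stored as 5 rows of 8 int16 with the last three entries of each row as zero padding. A hedged layout sketch under that assumption:

    /* Hedged sketch: 5x5 averaging kernel for the routine above.
       Layout inferred from the assembly: 5 rows x 8 int16; entries 5..7 of
       each row must be zero so the padded taps do not contribute. */
    static signed short box5x5[5 * 8] = {
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0
    };
    /* e.g.: SDL_imageFilterConvolveKernel5x5Divide(src, dst, rows, columns, box5x5, 25); */
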
4458 
4473 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4474  signed short *Kernel, unsigned char Divisor)
4475 {
4476  /* Validate input parameters */
4477  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4478  return(-1);
4479 
4480  if ((columns < 7) || (rows < 7) || (Divisor == 0))
4481  return (-1);
4482 
4483  if ((SDL_imageFilterMMXdetect())) {
4484 //#ifdef USE_MMX
4485 #if defined(USE_MMX) && defined(i386)
4486 #if !defined(GCC__)
4487  __asm
4488  {
4489  pusha
4490  pxor mm0, mm0 /* zero MM0 */
4491  xor ebx, ebx /* zero EBX */
4492  mov bl, Divisor /* load Divisor into BL */
4493  movd mm5, ebx /* copy Divisor into MM5 */
4494  mov edx, Kernel /* load Kernel address into EDX */
4495  mov esi, Src /* load Src address to ESI */
4496  mov edi, Dest /* load Dest address to EDI */
4497  add edi, 3 /* 3 column offset from the left edge */
4498  mov eax, columns /* load columns into EAX */
4499  add edi, eax /* 3 row offset from the top edge */
4500  add edi, eax
4501  add edi, eax
4502  mov ebx, rows /* initialize ROWS counter */
4503  sub ebx, 6 /* do not use first 3 and last 3 rows */
4504  /* ---, */
4505 L10340:
4506  mov ecx, eax /* initialize COLUMNS counter */
4507  sub ecx, 6 /* do not use first 3 and last 3 columns */
4508  align 16 /* 16 byte alignment of the loop entry */
4509 L10342:
4510  pxor mm7, mm7 /* zero MM7 (accumulator) */
4511  movd mm6, esi /* save ESI in MM6 */
4512  /* --- 1 */
4513  movq mm1, [esi] /* load 8 bytes of the Src */
4514  movq mm2, mm1 /* copy MM1 into MM2 */
4515  add esi, eax /* move Src pointer 1 row below */
4516  movq mm3, [edx] /* load 4 words of Kernel */
4517  add edx, 8 /* move pointer to other 4 words */
4518  movq mm4, [edx] /* load 4 words of Kernel */
4519  add edx, 8 /* move pointer to other 4 words */
4520  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4521  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4522  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4523  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4524  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4525  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4526  /* --- 2 */
4527  movq mm1, [esi] /* load 8 bytes of the Src */
4528  movq mm2, mm1 /* copy MM1 into MM2 */
4529  add esi, eax /* move Src pointer 1 row below */
4530  movq mm3, [edx] /* load 4 words of Kernel */
4531  add edx, 8 /* move pointer to other 4 words */
4532  movq mm4, [edx] /* load 4 words of Kernel */
4533  add edx, 8 /* move pointer to other 4 words */
4534  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4535  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4536  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4537  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4538  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4539  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4540  /* --- 3 */
4541  movq mm1, [esi] /* load 8 bytes of the Src */
4542  movq mm2, mm1 /* copy MM1 into MM2 */
4543  add esi, eax /* move Src pointer 1 row below */
4544  movq mm3, [edx] /* load 4 words of Kernel */
4545  add edx, 8 /* move pointer to other 4 words */
4546  movq mm4, [edx] /* load 4 words of Kernel */
4547  add edx, 8 /* move pointer to other 4 words */
4548  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4549  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4550  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4551  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4552  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4553  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4554  /* --- 4 */
4555  movq mm1, [esi] /* load 8 bytes of the Src */
4556  movq mm2, mm1 /* copy MM1 into MM2 */
4557  add esi, eax /* move Src pointer 1 row below */
4558  movq mm3, [edx] /* load 4 words of Kernel */
4559  add edx, 8 /* move pointer to other 4 words */
4560  movq mm4, [edx] /* load 4 words of Kernel */
4561  add edx, 8 /* move pointer to other 4 words */
4562  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4563  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4564  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4565  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4566  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4567  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4568  /* --- 5 */
4569  movq mm1, [esi] /* load 8 bytes of the Src */
4570  movq mm2, mm1 /* copy MM1 into MM2 */
4571  add esi, eax /* move Src pointer 1 row below */
4572  movq mm3, [edx] /* load 4 words of Kernel */
4573  add edx, 8 /* move pointer to other 4 words */
4574  movq mm4, [edx] /* load 4 words of Kernel */
4575  add edx, 8 /* move pointer to other 4 words */
4576  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4577  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4578  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4579  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4580  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4581  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4582  /* --- 6 */
4583  movq mm1, [esi] /* load 8 bytes of the Src */
4584  movq mm2, mm1 /* copy MM1 into MM2 */
4585  add esi, eax /* move Src pointer 1 row below */
4586  movq mm3, [edx] /* load 4 words of Kernel */
4587  add edx, 8 /* move pointer to other 4 words */
4588  movq mm4, [edx] /* load 4 words of Kernel */
4589  add edx, 8 /* move pointer to other 4 words */
4590  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4591  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4592  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4593  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4594  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4595  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4596  /* --- 7 */
4597  movq mm1, [esi] /* load 8 bytes of the Src */
4598  movq mm2, mm1 /* copy MM1 into MM2 */
4599  movq mm3, [edx] /* load 4 words of Kernel */
4600  add edx, 8 /* move pointer to other 4 words */
4601  movq mm4, [edx] /* load 4 words of Kernel */
4602  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4603  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4604  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4605  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4606  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4607  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4608  /* ---, */
4609  movq mm3, mm7 /* copy MM7 into MM3 */
4610  psrlq mm7, 32 /* shift 2 left words to the right */
4611  paddsw mm7, mm3 /* add 2 left and 2 right result words */
4612  movq mm2, mm7 /* copy MM7 into MM2 */
4613  psrlq mm7, 16 /* shift 1 left word to the right */
4614  paddsw mm7, mm2 /* add 1 left and 1 right result words */
4615  /* ---, */
4616  movd mm1, eax /* save EAX in MM1 */
4617  movd mm2, ebx /* save EBX in MM2 */
4618  movd mm3, edx /* save EDX in MM3 */
4619  movd eax, mm7 /* load summation result into EAX */
4620  psraw mm7, 15 /* spread sign bit of the result */
4621  movd ebx, mm5 /* load Divisor into EBX */
4622  movd edx, mm7 /* fill EDX with a sign bit */
4623  idiv bx /* IDIV - VERY EXPENSIVE */
4624  movd mm7, eax /* move result of division into MM7 */
4625  packuswb mm7, mm0 /* pack division result with saturation */
4626  movd eax, mm7 /* copy saturated result into EAX */
4627  mov [edi], al /* copy a byte result into Dest */
4628  movd edx, mm3 /* restore saved EDX */
4629  movd ebx, mm2 /* restore saved EBX */
4630  movd eax, mm1 /* restore saved EAX */
4631  /* --, */
4632  movd esi, mm6 /* move Src pointer to the top pixel */
4633  sub edx, 104 /* EDX = Kernel address */
4634  inc esi /* move Src pointer to the next pixel */
4635  inc edi /* move Dest pointer to the next pixel */
4636  /* ---, */
4637  dec ecx /* decrease loop counter COLUMNS */
4638  jnz L10342 /* check loop termination, proceed if required */
4639  add esi, 6 /* move to the next row in Src */
4640  add edi, 6 /* move to the next row in Dest */
4641  dec ebx /* decrease loop counter ROWS */
4642  jnz L10340 /* check loop termination, proceed if required */
4643  /* ---, */
4644  emms /* exit MMX state */
4645  popa
4646  }
4647 #else
4648  asm volatile
4649  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4650  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4651  "mov %5, %%bl \n\t" /* load Divisor into BL */
4652  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4653  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4654  "mov %1, %%esi \n\t" /* load Src address to ESI */
4655  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4656  "add $3, %%edi \n\t" /* 3 column offset from the left edge */
4657  "mov %3, %%eax \n\t" /* load columns into EAX */
4658  "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
4659  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4660  "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
4661  /* --- */
4662  ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4663  "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
4664  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4665  ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4666  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4667  /* --- 1 */
4668  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4669  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4670  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4671  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4672  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4673  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4674  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4675  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4676  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4677  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4678  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4679  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4680  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4681  /* --- 2 */
4682  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4683  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4684  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4685  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4686  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4687  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4688  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4689  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4690  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4691  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4692  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4693  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4694  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4695  /* --- 3 */
4696  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4697  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4698  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4699  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4700  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4701  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4702  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4703  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4704  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4705  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4706  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4707  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4708  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4709  /* --- 4 */
4710  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4711  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4712  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4713  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4714  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4715  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4716  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4717  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4718  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4719  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4720  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4721  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4722  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4723  /* --- 5 */
4724  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4725  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4726  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4727  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4728  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4729  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4730  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4731  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4732  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4733  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4734  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4735  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4736  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4737  /* --- 6 */
4738  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4739  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4740  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4741  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4742  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4743  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4744  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4745  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4746  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4747  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4748  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4749  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4750  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4751  /* --- 7 */
4752  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4753  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4754  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4755  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4756  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4757  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4758  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4759  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4760  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4761  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4762  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4763  /* --- */
4764  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4765  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4766  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4767  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4768  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4769  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4770  /* --- */
4771  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4772  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4773  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4774  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4775  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4776  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4777  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4778  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4779  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4780  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4781  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4782  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4783  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4784  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4785  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4786  /* -- */
4787  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4788  "sub $104, %%edx \n\t" /* EDX = Kernel address */
4789  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4790  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4791  /* --- */
4792  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4793  "jnz .L10342 \n\t" /* check loop termination, proceed if required */
4794  "add $6, %%esi \n\t" /* move to the next row in Src */
4795  "add $6, %%edi \n\t" /* move to the next row in Dest */
4796  "dec %%ebx \n\t" /* decrease loop counter ROWS */
4797  "jnz .L10340 \n\t" /* check loop termination, proceed if required */
4798  /* --- */
4799  "emms \n\t" /* exit MMX state */
4800  "popa \n\t":"=m" (Dest) /* %0 */
4801  :"m"(Src), /* %1 */
4802  "m"(rows), /* %2 */
4803  "m"(columns), /* %3 */
4804  "m"(Kernel), /* %4 */
4805  "m"(Divisor) /* %5 */
4806  );
4807 #endif
4808 #endif
4809  return (0);
4810  } else {
4811  /* No non-MMX implementation yet */
4812  return (-1);
4813  }
4814 }
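
As with the 5x5 case, the 104-byte rewind (seven 16-byte rows minus the final un-advanced load) suggests a 7x7 kernel occupies 7 rows of 8 int16 with one zero-padding entry per row, and the Dest setup (3-column plus 3-row offset) leaves a 3-pixel border untouched. A small hypothetical helper for filling such a padded kernel:

    /* Hypothetical helper: set tap (r, c) of a 7x7 kernel whose rows appear
       to be padded to 8 int16 entries; the 8th entry of each row stays zero. */
    static void set_7x7_tap(signed short *kernel, int r, int c, signed short v)
    {
        kernel[r * 8 + c] = v;   /* 0 <= r < 7, 0 <= c < 7 */
    }
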
4815 
4830 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4831  signed short *Kernel, unsigned char Divisor)
4832 {
4833  /* Validate input parameters */
4834  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4835  return(-1);
4836 
4837  if ((columns < 9) || (rows < 9) || (Divisor == 0))
4838  return (-1);
4839 
4840  if ((SDL_imageFilterMMXdetect())) {
4841 //#ifdef USE_MMX
4842 #if defined(USE_MMX) && defined(i386)
4843 #if !defined(GCC__)
4844  __asm
4845  {
4846  pusha
4847  pxor mm0, mm0 /* zero MM0 */
4848  xor ebx, ebx /* zero EBX */
4849  mov bl, Divisor /* load Divisor into BL */
4850  movd mm5, ebx /* copy Divisor into MM5 */
4851  mov edx, Kernel /* load Kernel address into EDX */
4852  mov esi, Src /* load Src address to ESI */
4853  mov edi, Dest /* load Dest address to EDI */
4854  add edi, 4 /* 4 column offset from the left edge */
4855  mov eax, columns /* load columns into EAX */
4856  add edi, eax /* 4 row offset from the top edge */
4857  add edi, eax
4858  add edi, eax
4859  add edi, eax
4860  mov ebx, rows /* initialize ROWS counter */
4861  sub ebx, 8 /* do not use first 4 and last 4 rows */
4862  /* ---, */
4863 L10350:
4864  mov ecx, eax /* initialize COLUMNS counter */
4865  sub ecx, 8 /* do not use first 4 and last 4 columns */
4866  align 16 /* 16 byte alignment of the loop entry */
4867 L10352:
4868  pxor mm7, mm7 /* zero MM7 (accumulator) */
4869  movd mm6, esi /* save ESI in MM6 */
4870  /* --- 1 */
4871  movq mm1, [esi] /* load 8 bytes of the Src */
4872  movq mm2, mm1 /* copy MM1 into MM2 */
4873  inc esi /* move pointer to the next 8 bytes of Src */
4874  movq mm3, [edx] /* load 4 words of Kernel */
4875  add edx, 8 /* move pointer to other 4 words */
4876  movq mm4, [edx] /* load 4 words of Kernel */
4877  add edx, 8 /* move pointer to other 4 words */
4878  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4879  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4880  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4881  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4882  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4883  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4884  movq mm1, [esi] /* load 8 bytes of the Src */
4885  dec esi
4886  add esi, eax /* move Src pointer 1 row below */
4887  movq mm3, [edx] /* load 4 words of Kernel */
4888  add edx, 8 /* move pointer to other 4 words */
4889  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4890  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4891  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4892  /* --- 2 */
4893  movq mm1, [esi] /* load 8 bytes of the Src */
4894  movq mm2, mm1 /* copy MM1 into MM2 */
4895  inc esi /* move pointer to the next 8 bytes of Src */
4896  movq mm3, [edx] /* load 4 words of Kernel */
4897  add edx, 8 /* move pointer to other 4 words */
4898  movq mm4, [edx] /* load 4 words of Kernel */
4899  add edx, 8 /* move pointer to other 4 words */
4900  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4901  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4902  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4903  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4904  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4905  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4906  movq mm1, [esi] /* load 8 bytes of the Src */
4907  dec esi
4908  add esi, eax /* move Src pointer 1 row below */
4909  movq mm3, [edx] /* load 4 words of Kernel */
4910  add edx, 8 /* move pointer to other 4 words */
4911  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4912  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4913  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4914  /* --- 3 */
4915  movq mm1, [esi] /* load 8 bytes of the Src */
4916  movq mm2, mm1 /* copy MM1 into MM2 */
4917  inc esi /* move pointer to the next 8 bytes of Src */
4918  movq mm3, [edx] /* load 4 words of Kernel */
4919  add edx, 8 /* move pointer to other 4 words */
4920  movq mm4, [edx] /* load 4 words of Kernel */
4921  add edx, 8 /* move pointer to other 4 words */
4922  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4923  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4924  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4925  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4926  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4927  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4928  movq mm1, [esi] /* load 8 bytes of the Src */
4929  dec esi
4930  add esi, eax /* move Src pointer 1 row below */
4931  movq mm3, [edx] /* load 4 words of Kernel */
4932  add edx, 8 /* move pointer to other 4 words */
4933  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4934  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4935  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4936  /* --- 4 */
4937  movq mm1, [esi] /* load 8 bytes of the Src */
4938  movq mm2, mm1 /* copy MM1 into MM2 */
4939  inc esi /* move pointer to the next 8 bytes of Src */
4940  movq mm3, [edx] /* load 4 words of Kernel */
4941  add edx, 8 /* move pointer to other 4 words */
4942  movq mm4, [edx] /* load 4 words of Kernel */
4943  add edx, 8 /* move pointer to other 4 words */
4944  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4945  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4946  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4947  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4948  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4949  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4950  movq mm1, [esi] /* load 8 bytes of the Src */
4951  dec esi
4952  add esi, eax /* move Src pointer 1 row below */
4953  movq mm3, [edx] /* load 4 words of Kernel */
4954  add edx, 8 /* move pointer to other 4 words */
4955  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4956  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4957  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4958  /* --- 5 */
4959  movq mm1, [esi] /* load 8 bytes of the Src */
4960  movq mm2, mm1 /* copy MM1 into MM2 */
4961  inc esi /* move pointer to the next 8 bytes of Src */
4962  movq mm3, [edx] /* load 4 words of Kernel */
4963  add edx, 8 /* move pointer to other 4 words */
4964  movq mm4, [edx] /* load 4 words of Kernel */
4965  add edx, 8 /* move pointer to other 4 words */
4966  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4967  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4968  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4969  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4970  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4971  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4972  movq mm1, [esi] /* load 8 bytes of the Src */
4973  dec esi
4974  add esi, eax /* move Src pointer 1 row below */
4975  movq mm3, [edx] /* load 4 words of Kernel */
4976  add edx, 8 /* move pointer to other 4 words */
4977  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4978  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4979  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4980  /* --- 6 */
4981  movq mm1, [esi] /* load 8 bytes of the Src */
4982  movq mm2, mm1 /* copy MM1 into MM2 */
4983  inc esi /* move pointer to the next 8 bytes of Src */
4984  movq mm3, [edx] /* load 4 words of Kernel */
4985  add edx, 8 /* move pointer to other 4 words */
4986  movq mm4, [edx] /* load 4 words of Kernel */
4987  add edx, 8 /* move pointer to other 4 words */
4988  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4989  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4990  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4991  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4992  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4993  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4994  movq mm1, [esi] /* load 8 bytes of the Src */
4995  dec esi
4996  add esi, eax /* move Src pointer 1 row below */
4997  movq mm3, [edx] /* load 4 words of Kernel */
4998  add edx, 8 /* move pointer to other 4 words */
4999  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5000  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5001  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5002  /* --- 7 */
5003  movq mm1, [esi] /* load 8 bytes of the Src */
5004  movq mm2, mm1 /* copy MM1 into MM2 */
5005  inc esi /* move pointer to the next 8 bytes of Src */
5006  movq mm3, [edx] /* load 4 words of Kernel */
5007  add edx, 8 /* move pointer to other 4 words */
5008  movq mm4, [edx] /* load 4 words of Kernel */
5009  add edx, 8 /* move pointer to other 4 words */
5010  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5011  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5012  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5013  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5014  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5015  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5016  movq mm1, [esi] /* load 8 bytes of the Src */
5017  dec esi
5018  add esi, eax /* move Src pointer 1 row below */
5019  movq mm3, [edx] /* load 4 words of Kernel */
5020  add edx, 8 /* move pointer to other 4 words */
5021  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5022  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5023  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5024  /* --- 8 */
5025  movq mm1, [esi] /* load 8 bytes of the Src */
5026  movq mm2, mm1 /* copy MM1 into MM2 */
5027  inc esi /* move pointer to the next 8 bytes of Src */
5028  movq mm3, [edx] /* load 4 words of Kernel */
5029  add edx, 8 /* move pointer to other 4 words */
5030  movq mm4, [edx] /* load 4 words of Kernel */
5031  add edx, 8 /* move pointer to other 4 words */
5032  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5033  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5034  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5035  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5036  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5037  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5038  movq mm1, [esi] /* load 8 bytes of the Src */
5039  dec esi
5040  add esi, eax /* move Src pointer 1 row below */
5041  movq mm3, [edx] /* load 4 words of Kernel */
5042  add edx, 8 /* move pointer to other 4 words */
5043  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5044  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5045  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5046  /* --- 9 */
5047  movq mm1, [esi] /* load 8 bytes of the Src */
5048  movq mm2, mm1 /* copy MM1 into MM2 */
5049  inc esi /* move pointer to the next 8 bytes of Src */
5050  movq mm3, [edx] /* load 4 words of Kernel */
5051  add edx, 8 /* move pointer to other 4 words */
5052  movq mm4, [edx] /* load 4 words of Kernel */
5053  add edx, 8 /* move pointer to other 4 words */
5054  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5055  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5056  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5057  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5058  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5059  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5060  movq mm1, [esi] /* load 8 bytes of the Src */
5061  movq mm3, [edx] /* load 4 words of Kernel */
5062  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5063  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5064  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5065  /* ---, */
5066  movq mm3, mm7 /* copy MM7 into MM3 */
5067  psrlq mm7, 32 /* shift 2 left words to the right */
5068  paddsw mm7, mm3 /* add 2 left and 2 right result words */
5069  movq mm2, mm7 /* copy MM7 into MM2 */
5070  psrlq mm7, 16 /* shift 1 left word to the right */
5071  paddsw mm7, mm2 /* add 1 left and 1 right result words */
5072  /* ---, */
5073  movd mm1, eax /* save EAX in MM1 */
5074  movd mm2, ebx /* save EBX in MM2 */
5075  movd mm3, edx /* save EDX in MM3 */
5076  movd eax, mm7 /* load summation result into EAX */
5077  psraw mm7, 15 /* spread sign bit of the result */
5078  movd ebx, mm5 /* load Divisor into EBX */
5079  movd edx, mm7 /* fill EDX with a sign bit */
5080  idiv bx /* IDIV - VERY EXPENSIVE */
5081  movd mm7, eax /* move result of division into MM7 */
5082  packuswb mm7, mm0 /* pack division result with saturation */
5083  movd eax, mm7 /* copy saturated result into EAX */
5084  mov [edi], al /* copy a byte result into Dest */
5085  movd edx, mm3 /* restore saved EDX */
5086  movd ebx, mm2 /* restore saved EBX */
5087  movd eax, mm1 /* restore saved EAX */
5088  /* --, */
5089  movd esi, mm6 /* move Src pointer to the top pixel */
5090  sub edx, 208 /* EDX = Kernel address */
5091  inc esi /* move Src pointer to the next pixel */
5092  inc edi /* move Dest pointer to the next pixel */
5093  /* ---, */
5094  dec ecx /* decrease loop counter COLUMNS */
5095  jnz L10352 /* check loop termination, proceed if required */
5096  add esi, 8 /* move to the next row in Src */
5097  add edi, 8 /* move to the next row in Dest */
5098  dec ebx /* decrease loop counter ROWS */
5099  jnz L10350 /* check loop termination, proceed if required */
5100  /* ---, */
5101  emms /* exit MMX state */
5102  popa
5103  }
5104 #else
5105  asm volatile
5106  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5107  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5108  "mov %5, %%bl \n\t" /* load Divisor into BL */
5109  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
5110  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5111  "mov %1, %%esi \n\t" /* load Src address to ESI */
5112  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5113  "add $4, %%edi \n\t" /* 4 column offset from the left edge */
5114  "mov %3, %%eax \n\t" /* load columns into EAX */
5115  "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
5116  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5117  "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
5118  /* --- */
5119  ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5120  "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
5121  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5122  ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5123  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5124  /* --- 1 */
5125  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5126  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5127  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5128  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5129  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5130  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5131  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5132  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5133  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5134  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5135  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5136  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5137  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5138  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5139  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5140  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5141  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5142  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5143  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5144  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5145  /* --- 2 */
5146  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5147  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5148  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5149  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5150  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5151  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5152  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5153  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5154  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5155  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5156  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5157  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5158  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5159  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5160  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5161  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5162  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5163  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5164  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5165  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5166  /* --- 3 */
5167  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5168  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5169  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5170  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5171  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5172  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5173  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5174  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5175  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5176  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5177  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5178  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5179  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5180  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5181  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5182  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5183  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5184  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5185  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5186  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5187  /* --- 4 */
5188  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5189  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5190  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5191  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5192  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5193  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5194  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5195  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5196  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5197  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5198  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5199  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5200  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5201  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5202  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5203  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5204  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5205  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5206  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5207  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5208  /* --- 5 */
5209  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5210  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5211  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5212  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5213  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5214  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5215  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5216  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5217  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5218  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5219  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5220  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5221  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5222  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5223  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5224  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5225  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5226  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5227  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5228  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5229  /* --- 6 */
5230  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5231  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5232  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5233  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5234  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5235  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5236  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5237  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5238  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5239  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5240  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5241  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5242  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5243  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5244  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5245  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5246  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5247  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5248  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5249  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5250  /* --- 7 */
5251  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5252  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5253  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5254  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5255  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5256  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5257  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5258  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5259  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5260  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5261  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5262  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5263  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5264  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5265  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5266  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5267  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5268  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5269  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5270  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5271  /* --- 8 */
5272  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5273  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5274  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5275  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5276  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5277  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5278  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5279  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5280  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5281  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5282  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5283  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5284  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5285  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5286  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5287  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5288  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5289  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5290  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5291  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5292  /* --- 9 */
5293  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5294  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5295  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5296  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5297  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5298  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5299  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5300  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5301  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5302  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5303  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5304  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5305  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5306  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5307  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5308  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5309  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5310  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5311  /* --- */
5312  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5313  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5314  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5315  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5316  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5317  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5318  /* --- */
5319  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
5320  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
5321  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
5322  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
5323  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
5324  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
5325  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
5326  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
5327  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
5328  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5329  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5330  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5331  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
5332  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
5333  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5334  /* -- */
5335  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5336  "sub $208, %%edx \n\t" /* EDX = Kernel address */
5337  "inc %%esi \n\t" /* move Src pointer to the next pixel */
5338  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5339  /* --- */
5340  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5341  "jnz .L10352 \n\t" /* check loop termination, proceed if required */
5342  "add $8, %%esi \n\t" /* move to the next row in Src */
5343  "add $8, %%edi \n\t" /* move to the next row in Dest */
5344  "dec %%ebx \n\t" /* decrease loop counter ROWS */
5345  "jnz .L10350 \n\t" /* check loop termination, proceed if required */
5346  /* --- */
5347  "emms \n\t" /* exit MMX state */
5348  "popa \n\t":"=m" (Dest) /* %0 */
5349  :"m"(Src), /* %1 */
5350  "m"(rows), /* %2 */
5351  "m"(columns), /* %3 */
5352  "m"(Kernel), /* %4 */
5353  "m"(Divisor) /* %5 */
5354  );
5355 #endif
5356 #endif
5357  return (0);
5358  } else {
5359  /* No non-MMX implementation yet */
5360  return (-1);
5361  }
5362 }
5363 
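The ShiftRight variants that follow trade the per-pixel IDIV used above (flagged "VERY EXPENSIVE" in the comments) for a cheap pre-shift: every unpacked source word is shifted right by NRightShift bits before it is multiplied by the kernel, so no division of the accumulated sum is needed. A minimal scalar sketch of the two normalisation schemes (an illustration only, not part of the library, and simplified: the MMX code additionally saturates the intermediate word sums with paddsw):

static unsigned char clamp_to_byte(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Divide variant: accumulate first, divide the sum once. */
static unsigned char convolve_divide(const unsigned char *src, const signed short *k,
                                     int n, int divisor)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += k[i] * src[i];
    return clamp_to_byte(sum / divisor);
}

/* ShiftRight variant: pre-shift each sample, no division at the end. */
static unsigned char convolve_shift(const unsigned char *src, const signed short *k,
                                    int n, unsigned char nrightshift)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += k[i] * (src[i] >> nrightshift);
    return clamp_to_byte(sum);
}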
5378 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5379  signed short *Kernel, unsigned char NRightShift)
5380 {
5381  /* Validate input parameters */
5382  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5383  return(-1);
5384 
5385  if ((columns < 3) || (rows < 3) || (NRightShift > 7))
5386  return (-1);
5387 
5388  if ((SDL_imageFilterMMXdetect())) {
5389 //#ifdef USE_MMX
5390 #if defined(USE_MMX) && defined(i386)
5391 #if !defined(GCC__)
5392  __asm
5393  {
5394  pusha
5395  pxor mm0, mm0 /* zero MM0 */
5396  xor ebx, ebx /* zero EBX */
5397  mov bl, NRightShift /* load NRightShift into BL */
5398  movd mm4, ebx /* copy NRightShift into MM4 */
5399  mov edx, Kernel /* load Kernel address into EDX */
5400  movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
5401  add edx, 8 /* second row |K0 K1 K2 0| */
5402  movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5403  add edx, 8 /* third row |K6 K7 K8 0| */
5404  movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
5405  /* ---, */
5406  mov eax, columns /* load columns into EAX */
5407  mov esi, Src /* ESI = Src row 0 address */
5408  mov edi, Dest /* load Dest address to EDI */
5409  add edi, eax /* EDI = EDI + columns */
5410  inc edi /* 1 byte offset from the left edge */
5411  mov edx, rows /* initialize ROWS counter */
5412  sub edx, 2 /* do not use first and last row */
5413  /* ---, */
5414 L10360:
5415  mov ecx, eax /* initialize COLUMNS counter */
5416  sub ecx, 2 /* do not use first and last column */
5417  align 16 /* 16 byte alignment of the loop entry */
5418 L10362:
5419  /* ---, */
5420  movq mm1, [esi] /* load 8 bytes of the image first row */
5421  add esi, eax /* move one row below */
5422  movq mm2, [esi] /* load 8 bytes of the image second row */
5423  add esi, eax /* move one row below */
5424  movq mm3, [esi] /* load 8 bytes of the image third row */
5425  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5426  punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
5427  punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
5428  psrlw mm1, mm4 /* shift right each pixel NshiftRight times */
5429  psrlw mm2, mm4 /* shift right each pixel NshiftRight times */
5430  psrlw mm3, mm4 /* shift right each pixel NshiftRight times */
5431  pmullw mm1, mm5 /* multiply words first row image*Kernel */
5432  pmullw mm2, mm6 /* multiply words second row image*Kernel */
5433  pmullw mm3, mm7 /* multiply words third row image*Kernel */
5434  paddsw mm1, mm2 /* add 4 words of the first and second rows */
5435  paddsw mm1, mm3 /* add 4 words of the third row and result */
5436  movq mm2, mm1 /* copy MM1 into MM2 */
5437  psrlq mm1, 32 /* shift 2 left words to the right */
5438  paddsw mm1, mm2 /* add 2 left and 2 right result words */
5439  movq mm3, mm1 /* copy MM1 into MM3 */
5440  psrlq mm1, 16 /* shift 1 left word to the right */
5441  paddsw mm1, mm3 /* add 1 left and 1 right result words */
5442  packuswb mm1, mm0 /* pack shift result with saturation */
5443  movd ebx, mm1 /* copy saturated result into EBX */
5444  mov [edi], bl /* copy a byte result into Dest */
5445  /* --, */
5446  sub esi, eax /* move two rows up */
5447  sub esi, eax
5448  inc esi /* move Src pointer to the next pixel */
5449  inc edi /* move Dest pointer to the next pixel */
5450  /* ---, */
5451  dec ecx /* decrease loop counter COLUMNS */
5452  jnz L10362 /* check loop termination, proceed if required */
5453  add esi, 2 /* move to the next row in Src */
5454  add edi, 2 /* move to the next row in Dest */
5455  dec edx /* decrease loop counter ROWS */
5456  jnz L10360 /* check loop termination, proceed if required */
5457  /* ---, */
5458  emms /* exit MMX state */
5459  popa
5460  }
5461 #else
5462  asm volatile
5463  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5464  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5465  "mov %5, %%bl \n\t" /* load NRightShift into BL */
5466  "movd %%ebx, %%mm4 \n\t" /* copy NRightShift into MM4 */
5467  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5468  "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
5469  "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
5470  "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5471  "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
5472  "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
5473  /* --- */
5474  "mov %3, %%eax \n\t" /* load columns into EAX */
5475  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
5476  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5477  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
5478  "inc %%edi \n\t" /* 1 byte offset from the left edge */
5479  "mov %2, %%edx \n\t" /* initialize ROWS counter */
5480  "sub $2, %%edx \n\t" /* do not use first and last row */
5481  /* --- */
5482  ".L10360: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
5483  "sub $2, %%ecx \n\t" /* do not use first and last column */
5484  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5485  ".L10362: \n\t"
5486  /* --- */
5487  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
5488  "add %%eax, %%esi \n\t" /* move one row below */
5489  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
5490  "add %%eax, %%esi \n\t" /* move one row below */
5491  "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
5492  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5493  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
5494  "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
5495  "psrlw %%mm4, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5496  "psrlw %%mm4, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5497  "psrlw %%mm4, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
5498  "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
5499  "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
5500  "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
5501  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
5502  "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
5503  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5504  "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
5505  "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
5506  "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
5507  "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
5508  "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
5509  "packuswb %%mm0, %%mm1 \n\t" /* pack shift result with saturation */
5510  "movd %%mm1, %%ebx \n\t" /* copy saturated result into EBX */
5511  "mov %%bl, (%%edi) \n\t" /* copy a byte result into Dest */
5512  /* -- */
5513  "sub %%eax, %%esi \n\t" /* move two rows up */
5514  "sub %%eax, %%esi \n\t" "inc %%esi \n\t" /* move Src pointer to the next pixel */
5515  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5516  /* --- */
5517  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5518  "jnz .L10362 \n\t" /* check loop termination, proceed if required */
5519  "add $2, %%esi \n\t" /* move to the next row in Src */
5520  "add $2, %%edi \n\t" /* move to the next row in Dest */
5521  "dec %%edx \n\t" /* decrease loop counter ROWS */
5522  "jnz .L10360 \n\t" /* check loop termination, proceed if required */
5523  /* --- */
5524  "emms \n\t" /* exit MMX state */
5525  "popa \n\t":"=m" (Dest) /* %0 */
5526  :"m"(Src), /* %1 */
5527  "m"(rows), /* %2 */
5528  "m"(columns), /* %3 */
5529  "m"(Kernel), /* %4 */
5530  "m"(NRightShift) /* %5 */
5531  );
5532 #endif
5533 #endif
5534  return (0);
5535  } else {
5536  /* No non-MMX implementation yet */
5537  return (-1);
5538  }
5539 }
5540 
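A hedged usage sketch for the 3x3 variant above. From the assembly, each kernel row occupies one quadword: three signed 16-bit coefficients followed by one unused padding word (memory order |K0 K1 K2 0|), i.e. 12 shorts in total. An all-ones kernel with NRightShift = 3 approximates a box blur: each sample is pre-divided by 8, so the sum of nine samples is about 9/8 of the true average, clamped to 255. The grayscale buffer layout is an assumption made for the example.

#include "SDL2_imageFilter.h"

/* Approximate 3x3 box blur on an 8-bit image of rows x columns bytes.
   Kernel layout derived from the MMX code: 3 rows of {K0, K1, K2, pad}. */
static int blur3x3(unsigned char *src, unsigned char *dst, int rows, int columns)
{
    signed short kernel[12] = {
        1, 1, 1, 0,
        1, 1, 1, 0,
        1, 1, 1, 0
    };
    /* NRightShift = 3: every sample is divided by 8 before summation. */
    return SDL_imageFilterConvolveKernel3x3ShiftRight(src, dst, rows, columns, kernel, 3);
}

The call returns 0 on success and -1 when MMX is unavailable or the parameters are invalid; note that the outermost one-pixel border of dst is never written by the routine.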
5555 int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5556  signed short *Kernel, unsigned char NRightShift)
5557 {
5558  /* Validate input parameters */
5559  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5560  return(-1);
5561 
5562  if ((columns < 5) || (rows < 5) || (NRightShift > 7))
5563  return (-1);
5564 
5565  if ((SDL_imageFilterMMXdetect())) {
5566 //#ifdef USE_MMX
5567 #if defined(USE_MMX) && defined(i386)
5568 #if !defined(GCC__)
5569  __asm
5570  {
5571  pusha
5572  pxor mm0, mm0 /* zero MM0 */
5573  xor ebx, ebx /* zero EBX */
5574  mov bl, NRightShift /* load NRightShift into BL */
5575  movd mm5, ebx /* copy NRightShift into MM5 */
5576  mov edx, Kernel /* load Kernel address into EDX */
5577  mov esi, Src /* load Src address to ESI */
5578  mov edi, Dest /* load Dest address to EDI */
5579  add edi, 2 /* 2 column offset from the left edge */
5580  mov eax, columns /* load columns into EAX */
5581  shl eax, 1 /* EAX = columns * 2 */
5582  add edi, eax /* 2 row offset from the top edge */
5583  shr eax, 1 /* EAX = columns */
5584  mov ebx, rows /* initialize ROWS counter */
5585  sub ebx, 4 /* do not use first 2 and last 2 rows */
5586  /* ---, */
5587 L10370:
5588  mov ecx, eax /* initialize COLUMNS counter */
5589  sub ecx, 4 /* do not use first 2 and last 2 columns */
5590  align 16 /* 16 byte alignment of the loop entry */
5591 L10372:
5592  pxor mm7, mm7 /* zero MM7 (accumulator) */
5593  movd mm6, esi /* save ESI in MM6 */
5594  /* --- 1 */
5595  movq mm1, [esi] /* load 8 bytes of the Src */
5596  movq mm2, mm1 /* copy MM1 into MM2 */
5597  add esi, eax /* move Src pointer 1 row below */
5598  movq mm3, [edx] /* load 4 words of Kernel */
5599  add edx, 8 /* move pointer to other 4 words */
5600  movq mm4, [edx] /* load 4 words of Kernel */
5601  add edx, 8 /* move pointer to other 4 words */
5602  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5603  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5604  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5605  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5606  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5607  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5608  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5609  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5610  /* --- 2 */
5611  movq mm1, [esi] /* load 8 bytes of the Src */
5612  movq mm2, mm1 /* copy MM1 into MM2 */
5613  add esi, eax /* move Src pointer 1 row below */
5614  movq mm3, [edx] /* load 4 words of Kernel */
5615  add edx, 8 /* move pointer to other 4 words */
5616  movq mm4, [edx] /* load 4 words of Kernel */
5617  add edx, 8 /* move pointer to other 4 words */
5618  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5619  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5620  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5621  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5622  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5623  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5624  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5625  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5626  /* --- 3 */
5627  movq mm1, [esi] /* load 8 bytes of the Src */
5628  movq mm2, mm1 /* copy MM1 into MM2 */
5629  add esi, eax /* move Src pointer 1 row below */
5630  movq mm3, [edx] /* load 4 words of Kernel */
5631  add edx, 8 /* move pointer to other 4 words */
5632  movq mm4, [edx] /* load 4 words of Kernel */
5633  add edx, 8 /* move pointer to other 4 words */
5634  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5635  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5636  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5637  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5638  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5639  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5640  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5641  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5642  /* --- 4 */
5643  movq mm1, [esi] /* load 8 bytes of the Src */
5644  movq mm2, mm1 /* copy MM1 into MM2 */
5645  add esi, eax /* move Src pointer 1 row below */
5646  movq mm3, [edx] /* load 4 words of Kernel */
5647  add edx, 8 /* move pointer to other 4 words */
5648  movq mm4, [edx] /* load 4 words of Kernel */
5649  add edx, 8 /* move pointer to other 4 words */
5650  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5651  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5652  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5653  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5654  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5655  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5656  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5657  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5658  /* --- 5 */
5659  movq mm1, [esi] /* load 8 bytes of the Src */
5660  movq mm2, mm1 /* copy MM1 into MM2 */
5661  movq mm3, [edx] /* load 4 words of Kernel */
5662  add edx, 8 /* move pointer to other 4 words */
5663  movq mm4, [edx] /* load 4 words of Kernel */
5664  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5665  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5666  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5667  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5668  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5669  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5670  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5671  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5672  /* ---, */
5673  movq mm3, mm7 /* copy MM7 into MM3 */
5674  psrlq mm7, 32 /* shift 2 left words to the right */
5675  paddsw mm7, mm3 /* add 2 left and 2 right result words */
5676  movq mm2, mm7 /* copy MM7 into MM2 */
5677  psrlq mm7, 16 /* shift 1 left word to the right */
5678  paddsw mm7, mm2 /* add 1 left and 1 right result words */
5679  movd mm1, eax /* save EAX in MM1 */
5680  packuswb mm7, mm0 /* pack division result with saturation */
5681  movd eax, mm7 /* copy saturated result into EAX */
5682  mov [edi], al /* copy a byte result into Dest */
5683  movd eax, mm1 /* restore saved EAX */
5684  /* --, */
5685  movd esi, mm6 /* move Src pointer to the top pixel */
5686  sub edx, 72 /* EDX = Kernel address */
5687  inc esi /* move Src pointer to the next pixel */
5688  inc edi /* move Dest pointer to the next pixel */
5689  /* ---, */
5690  dec ecx /* decrease loop counter COLUMNS */
5691  jnz L10372 /* check loop termination, proceed if required */
5692  add esi, 4 /* move to the next row in Src */
5693  add edi, 4 /* move to the next row in Dest */
5694  dec ebx /* decrease loop counter ROWS */
5695  jnz L10370 /* check loop termination, proceed if required */
5696  /* ---, */
5697  emms /* exit MMX state */
5698  popa
5699  }
5700 #else
5701  asm volatile
5702  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5703  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5704  "mov %5, %%bl \n\t" /* load NRightShift into BL */
5705  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
5706  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5707  "mov %1, %%esi \n\t" /* load Src address to ESI */
5708  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5709  "add $2, %%edi \n\t" /* 2 column offset from the left edge */
5710  "mov %3, %%eax \n\t" /* load columns into EAX */
5711  "shl $1, %%eax \n\t" /* EAX = columns * 2 */
5712  "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
5713  "shr $1, %%eax \n\t" /* EAX = columns */
5714  "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5715  "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
5716  /* --- */
5717  ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5718  "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
5719  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5720  ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5721  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5722  /* --- 1 */
5723  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5724  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5725  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5726  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5727  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5728  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5729  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5730  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5731  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5732  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5733  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5734  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5735  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5736  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5737  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5738  /* --- 2 */
5739  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5740  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5741  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5742  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5743  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5744  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5745  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5746  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5747  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5748  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5749  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5750  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5751  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5752  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5753  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5754  /* --- 3 */
5755  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5756  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5757  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5758  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5759  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5760  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5761  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5762  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5763  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5764  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5765  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5766  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5767  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5768  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5769  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5770  /* --- 4 */
5771  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5772  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5773  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5774  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5775  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5776  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5777  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5778  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5779  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5780  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5781  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5782  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5783  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5784  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5785  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5786  /* --- 5 */
5787  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5788  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5789  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5790  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5791  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5792  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5793  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5794  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5795  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5796  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5797  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5798  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5799  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5800  /* --- */
5801  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5802  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5803  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5804  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5805  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5806  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5807  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
5808  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5809  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5810  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5811  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5812  /* -- */
5813  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5814  "sub $72, %%edx \n\t" /* EDX = Kernel address */
5815  "inc %%esi \n\t" /* move Src pointer to the next pixel */
5816  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5817  /* --- */
5818  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5819  "jnz .L10372 \n\t" /* check loop termination, proceed if required */
5820  "add $4, %%esi \n\t" /* move to the next row in Src */
5821  "add $4, %%edi \n\t" /* move to the next row in Dest */
5822  "dec %%ebx \n\t" /* decrease loop counter ROWS */
5823  "jnz .L10370 \n\t" /* check loop termination, proceed if required */
5824  /* --- */
5825  "emms \n\t" /* exit MMX state */
5826  "popa \n\t":"=m" (Dest) /* %0 */
5827  :"m"(Src), /* %1 */
5828  "m"(rows), /* %2 */
5829  "m"(columns), /* %3 */
5830  "m"(Kernel), /* %4 */
5831  "m"(NRightShift) /* %5 */
5832  );
5833 #endif
5834 #endif
5835  return (0);
5836  } else {
5837  /* No non-MMX implementation yet */
5838  return (-1);
5839  }
5840 }
5841 
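For the 5x5 variant the assembly reads two quadwords per kernel row and rewinds the kernel pointer by 72 bytes per output pixel, which implies a padded layout of 5 rows x 8 signed shorts: the five coefficients of a row sit in its first five words and the remaining three words are unused. A hedged helper for a caller holding the kernel in plain row-major 5x5 form (derived from the assembly above rather than from documented API guarantees):

/* Repack a plain row-major 5x5 kernel (25 shorts) into the padded
   5 x 8 layout the MMX routine appears to expect. */
static void pack_kernel_5x5(const signed short plain[25], signed short packed[40])
{
    int r, c;
    for (r = 0; r < 5; r++) {
        for (c = 0; c < 8; c++)
            packed[r * 8 + c] = (c < 5) ? plain[r * 5 + c] : 0;  /* zero the 3 pad words */
    }
}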
5856 int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5857  signed short *Kernel, unsigned char NRightShift)
5858 {
5859  /* Validate input parameters */
5860  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5861  return(-1);
5862 
5863  if ((columns < 7) || (rows < 7) || (NRightShift > 7))
5864  return (-1);
5865 
5866  if ((SDL_imageFilterMMXdetect())) {
5867 //#ifdef USE_MMX
5868 #if defined(USE_MMX) && defined(i386)
5869 #if !defined(GCC__)
5870  __asm
5871  {
5872  pusha
5873  pxor mm0, mm0 /* zero MM0 */
5874  xor ebx, ebx /* zero EBX */
5875  mov bl, NRightShift /* load NRightShift into BL */
5876  movd mm5, ebx /* copy NRightShift into MM5 */
5877  mov edx, Kernel /* load Kernel address into EDX */
5878  mov esi, Src /* load Src address to ESI */
5879  mov edi, Dest /* load Dest address to EDI */
5880  add edi, 3 /* 3 column offset from the left edge */
5881  mov eax, columns /* load columns into EAX */
5882  add edi, eax /* 3 row offset from the top edge */
5883  add edi, eax
5884  add edi, eax
5885  mov ebx, rows /* initialize ROWS counter */
5886  sub ebx, 6 /* do not use first 3 and last 3 rows */
5887  /* ---, */
5888 L10380:
5889  mov ecx, eax /* initialize COLUMNS counter */
5890  sub ecx, 6 /* do not use first 3 and last 3 columns */
5891  align 16 /* 16 byte alignment of the loop entry */
5892 L10382:
5893  pxor mm7, mm7 /* zero MM7 (accumulator) */
5894  movd mm6, esi /* save ESI in MM6 */
5895  /* --- 1 */
5896  movq mm1, [esi] /* load 8 bytes of the Src */
5897  movq mm2, mm1 /* copy MM1 into MM2 */
5898  add esi, eax /* move Src pointer 1 row below */
5899  movq mm3, [edx] /* load 4 words of Kernel */
5900  add edx, 8 /* move pointer to other 4 words */
5901  movq mm4, [edx] /* load 4 words of Kernel */
5902  add edx, 8 /* move pointer to other 4 words */
5903  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5904  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5905  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5906  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5907  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5908  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5909  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5910  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5911  /* --- 2 */
5912  movq mm1, [esi] /* load 8 bytes of the Src */
5913  movq mm2, mm1 /* copy MM1 into MM2 */
5914  add esi, eax /* move Src pointer 1 row below */
5915  movq mm3, [edx] /* load 4 words of Kernel */
5916  add edx, 8 /* move pointer to other 4 words */
5917  movq mm4, [edx] /* load 4 words of Kernel */
5918  add edx, 8 /* move pointer to other 4 words */
5919  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5920  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5921  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5922  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5923  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5924  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5925  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5926  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5927  /* --- 3 */
5928  movq mm1, [esi] /* load 8 bytes of the Src */
5929  movq mm2, mm1 /* copy MM1 into MM2 */
5930  add esi, eax /* move Src pointer 1 row below */
5931  movq mm3, [edx] /* load 4 words of Kernel */
5932  add edx, 8 /* move pointer to other 4 words */
5933  movq mm4, [edx] /* load 4 words of Kernel */
5934  add edx, 8 /* move pointer to other 4 words */
5935  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5936  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5937  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5938  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5939  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5940  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5941  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5942  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5943  /* --- 4 */
5944  movq mm1, [esi] /* load 8 bytes of the Src */
5945  movq mm2, mm1 /* copy MM1 into MM2 */
5946  add esi, eax /* move Src pointer 1 row below */
5947  movq mm3, [edx] /* load 4 words of Kernel */
5948  add edx, 8 /* move pointer to other 4 words */
5949  movq mm4, [edx] /* load 4 words of Kernel */
5950  add edx, 8 /* move pointer to other 4 words */
5951  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5952  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5953  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5954  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5955  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5956  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5957  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5958  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5959  /* --- 5 */
5960  movq mm1, [esi] /* load 8 bytes of the Src */
5961  movq mm2, mm1 /* copy MM1 into MM2 */
5962  add esi, eax /* move Src pointer 1 row below */
5963  movq mm3, [edx] /* load 4 words of Kernel */
5964  add edx, 8 /* move pointer to other 4 words */
5965  movq mm4, [edx] /* load 4 words of Kernel */
5966  add edx, 8 /* move pointer to other 4 words */
5967  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5968  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5969  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5970  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5971  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5972  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5973  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5974  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5975  /* --- 6 */
5976  movq mm1, [esi] /* load 8 bytes of the Src */
5977  movq mm2, mm1 /* copy MM1 into MM2 */
5978  add esi, eax /* move Src pointer 1 row below */
5979  movq mm3, [edx] /* load 4 words of Kernel */
5980  add edx, 8 /* move pointer to other 4 words */
5981  movq mm4, [edx] /* load 4 words of Kernel */
5982  add edx, 8 /* move pointer to other 4 words */
5983  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5984  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5985  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5986  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5987  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5988  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5989  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5990  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5991  /* --- 7 */
5992  movq mm1, [esi] /* load 8 bytes of the Src */
5993  movq mm2, mm1 /* copy MM1 into MM2 */
5994  movq mm3, [edx] /* load 4 words of Kernel */
5995  add edx, 8 /* move pointer to other 4 words */
5996  movq mm4, [edx] /* load 4 words of Kernel */
5997  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5998  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5999  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6000  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6001  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6002  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6003  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6004  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6005  /* ---, */
6006  movq mm3, mm7 /* copy MM7 into MM3 */
6007  psrlq mm7, 32 /* shift 2 left words to the right */
6008  paddsw mm7, mm3 /* add 2 left and 2 right result words */
6009  movq mm2, mm7 /* copy MM7 into MM2 */
6010  psrlq mm7, 16 /* shift 1 left word to the right */
6011  paddsw mm7, mm2 /* add 1 left and 1 right result words */
6012  movd mm1, eax /* save EAX in MM1 */
6013  packuswb mm7, mm0 /* pack division result with saturation */
6014  movd eax, mm7 /* copy saturated result into EAX */
6015  mov [edi], al /* copy a byte result into Dest */
6016  movd eax, mm1 /* restore saved EAX */
6017  /* --, */
6018  movd esi, mm6 /* move Src pointer to the top pixel */
6019  sub edx, 104 /* EDX = Kernel address */
6020  inc esi /* move Src pointer to the next pixel */
6021  inc edi /* move Dest pointer to the next pixel */
6022  /* ---, */
6023  dec ecx /* decrease loop counter COLUMNS */
6024  jnz L10382 /* check loop termination, proceed if required */
6025  add esi, 6 /* move to the next row in Src */
6026  add edi, 6 /* move to the next row in Dest */
6027  dec ebx /* decrease loop counter ROWS */
6028  jnz L10380 /* check loop termination, proceed if required */
6029  /* ---, */
6030  emms /* exit MMX state */
6031  popa
6032  }
6033 #else
6034  asm volatile
6035  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6036  "xor %%ebx, %%ebx \n\t" /* zero EBX */
6037  "mov %5, %%bl \n\t" /* load NRightShift into BL */
6038  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6039  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6040  "mov %1, %%esi \n\t" /* load Src address to ESI */
6041  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6042  "add $3, %%edi \n\t" /* 3 column offset from the left edge */
6043  "mov %3, %%eax \n\t" /* load columns into EAX */
6044  "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
6045  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6046  "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
6047  /* --- */
6048  ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6049  "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
6050  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6051  ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6052  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6053  /* --- 1 */
6054  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6055  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6056  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6057  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6058  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6059  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6060  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6061  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6062  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6063  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6064  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6065  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6066  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6067  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6068  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6069  /* --- 2 */
6070  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6071  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6072  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6073  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6074  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6075  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6076  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6077  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6078  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6079  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6080  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6081  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6082  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6083  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6084  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6085  /* --- 3 */
6086  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6087  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6088  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6089  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6090  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6091  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6092  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6093  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6094  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6095  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6096  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6097  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6098  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6099  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6100  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6101  /* --- 4 */
6102  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6103  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6104  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6105  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6106  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6107  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6108  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6109  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6110  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6111  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6112  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6113  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6114  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6115  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6116  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6117  /* --- 5 */
6118  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6119  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6120  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6121  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6122  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6123  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6124  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6125  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6126  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6127  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6128  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6129  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6130  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6131  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6132  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6133  /* --- 6 */
6134  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6135  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6136  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6137  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6138  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6139  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6140  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6141  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6142  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6143  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6144  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6145  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6146  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6147  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6148  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6149  /* --- 7 */
6150  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6151  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6152  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6153  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6154  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6155  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6156  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6157  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6158  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6159  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6160  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6161  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6162  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6163  /* --- */
6164  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6165  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6166  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6167  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6168  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6169  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6170  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6171  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
6172  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6173  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6174  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6175  /* -- */
6176  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6177  "sub $104, %%edx \n\t" /* EDX = Kernel address */
6178  "inc %%esi \n\t" /* move Src pointer to the next pixel */
6179  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6180  /* --- */
6181  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6182  "jnz .L10382 \n\t" /* check loop termination, proceed if required */
6183  "add $6, %%esi \n\t" /* move to the next row in Src */
6184  "add $6, %%edi \n\t" /* move to the next row in Dest */
6185  "dec %%ebx \n\t" /* decrease loop counter ROWS */
6186  "jnz .L10380 \n\t" /* check loop termination, proceed if required */
6187  /* --- */
6188  "emms \n\t" /* exit MMX state */
6189  "popa \n\t":"=m" (Dest) /* %0 */
6190  :"m"(Src), /* %1 */
6191  "m"(rows), /* %2 */
6192  "m"(columns), /* %3 */
6193  "m"(Kernel), /* %4 */
6194  "m"(NRightShift) /* %5 */
6195  );
6196 #endif
6197 #endif
6198  return (0);
6199  } else {
6200  /* No non-MMX implementation yet */
6201  return (-1);
6202  }
6203 }
6204 
6219 int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
6220  signed short *Kernel, unsigned char NRightShift)
6221 {
6222  /* Validate input parameters */
6223  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
6224  return(-1);
6225 
6226  if ((columns < 9) || (rows < 9) || (NRightShift > 7))
6227  return (-1);
6228 
6229  if ((SDL_imageFilterMMXdetect())) {
6230 //#ifdef USE_MMX
6231 #if defined(USE_MMX) && defined(i386)
6232 #if !defined(GCC__)
6233  __asm
6234  {
6235  pusha
6236  pxor mm0, mm0 /* zero MM0 */
6237  xor ebx, ebx /* zero EBX */
6238  mov bl, NRightShift /* load NRightShift into BL */
6239  movd mm5, ebx /* copy NRightShift into MM5 */
6240  mov edx, Kernel /* load Kernel address into EDX */
6241  mov esi, Src /* load Src address to ESI */
6242  mov edi, Dest /* load Dest address to EDI */
6243  add edi, 4 /* 4 column offset from the left edge */
6244  mov eax, columns /* load columns into EAX */
6245  add edi, eax /* 4 row offset from the top edge */
6246  add edi, eax
6247  add edi, eax
6248  add edi, eax
6249  mov ebx, rows /* initialize ROWS counter */
6250  sub ebx, 8 /* do not use first 4 and last 4 rows */
6251  /* ---, */
6252 L10390:
6253  mov ecx, eax /* initialize COLUMNS counter */
6254  sub ecx, 8 /* do not use first 4 and last 4 columns */
6255  align 16 /* 16 byte alignment of the loop entry */
6256 L10392:
6257  pxor mm7, mm7 /* zero MM7 (accumulator) */
6258  movd mm6, esi /* save ESI in MM6 */
6259  /* --- 1 */
6260  movq mm1, [esi] /* load 8 bytes of the Src */
6261  movq mm2, mm1 /* copy MM1 into MM2 */
6262  inc esi /* move pointer to the next 8 bytes of Src */
6263  movq mm3, [edx] /* load 4 words of Kernel */
6264  add edx, 8 /* move pointer to other 4 words */
6265  movq mm4, [edx] /* load 4 words of Kernel */
6266  add edx, 8 /* move pointer to other 4 words */
6267  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6268  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6269  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6270  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6271  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6272  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6273  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6274  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6275  movq mm1, [esi] /* load 8 bytes of the Src */
6276  dec esi
6277  add esi, eax /* move Src pointer 1 row below */
6278  movq mm3, [edx] /* load 4 words of Kernel */
6279  add edx, 8 /* move pointer to other 4 words */
6280  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6281  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6282  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6283  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6284  /* --- 2 */
6285  movq mm1, [esi] /* load 8 bytes of the Src */
6286  movq mm2, mm1 /* copy MM1 into MM2 */
6287  inc esi /* move pointer to the next 8 bytes of Src */
6288  movq mm3, [edx] /* load 4 words of Kernel */
6289  add edx, 8 /* move pointer to other 4 words */
6290  movq mm4, [edx] /* load 4 words of Kernel */
6291  add edx, 8 /* move pointer to other 4 words */
6292  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6293  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6294  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6295  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6296  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6297  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6298  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6299  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6300  movq mm1, [esi] /* load 8 bytes of the Src */
6301  dec esi
6302  add esi, eax /* move Src pointer 1 row below */
6303  movq mm3, [edx] /* load 4 words of Kernel */
6304  add edx, 8 /* move pointer to other 4 words */
6305  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6306  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6307  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6308  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6309  /* --- 3 */
6310  movq mm1, [esi] /* load 8 bytes of the Src */
6311  movq mm2, mm1 /* copy MM1 into MM2 */
6312  inc esi /* move pointer to the next 8 bytes of Src */
6313  movq mm3, [edx] /* load 4 words of Kernel */
6314  add edx, 8 /* move pointer to other 4 words */
6315  movq mm4, [edx] /* load 4 words of Kernel */
6316  add edx, 8 /* move pointer to other 4 words */
6317  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6318  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6319  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6320  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6321  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6322  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6323  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6324  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6325  movq mm1, [esi] /* load 8 bytes of the Src */
6326  dec esi
6327  add esi, eax /* move Src pointer 1 row below */
6328  movq mm3, [edx] /* load 4 words of Kernel */
6329  add edx, 8 /* move pointer to other 4 words */
6330  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6331  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6332  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6333  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6334  /* --- 4 */
6335  movq mm1, [esi] /* load 8 bytes of the Src */
6336  movq mm2, mm1 /* copy MM1 into MM2 */
6337  inc esi /* move pointer to the next 8 bytes of Src */
6338  movq mm3, [edx] /* load 4 words of Kernel */
6339  add edx, 8 /* move pointer to other 4 words */
6340  movq mm4, [edx] /* load 4 words of Kernel */
6341  add edx, 8 /* move pointer to other 4 words */
6342  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6343  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6344  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6345  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6346  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6347  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6348  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6349  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6350  movq mm1, [esi] /* load 8 bytes of the Src */
6351  dec esi
6352  add esi, eax /* move Src pointer 1 row below */
6353  movq mm3, [edx] /* load 4 words of Kernel */
6354  add edx, 8 /* move pointer to other 4 words */
6355  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6356  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6357  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6358  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6359  /* --- 5 */
6360  movq mm1, [esi] /* load 8 bytes of the Src */
6361  movq mm2, mm1 /* copy MM1 into MM2 */
6362  inc esi /* move pointer to the next 8 bytes of Src */
6363  movq mm3, [edx] /* load 4 words of Kernel */
6364  add edx, 8 /* move pointer to other 4 words */
6365  movq mm4, [edx] /* load 4 words of Kernel */
6366  add edx, 8 /* move pointer to other 4 words */
6367  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6368  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6369  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6370  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6371  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6372  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6373  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6374  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6375  movq mm1, [esi] /* load 8 bytes of the Src */
6376  dec esi
6377  add esi, eax /* move Src pointer 1 row below */
6378  movq mm3, [edx] /* load 4 words of Kernel */
6379  add edx, 8 /* move pointer to other 4 words */
6380  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6381  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6382  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6383  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6384  /* --- 6 */
6385  movq mm1, [esi] /* load 8 bytes of the Src */
6386  movq mm2, mm1 /* copy MM1 into MM2 */
6387  inc esi /* move pointer to the next 8 bytes of Src */
6388  movq mm3, [edx] /* load 4 words of Kernel */
6389  add edx, 8 /* move pointer to other 4 words */
6390  movq mm4, [edx] /* load 4 words of Kernel */
6391  add edx, 8 /* move pointer to other 4 words */
6392  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6393  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6394  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6395  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6396  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6397  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6398  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6399  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6400  movq mm1, [esi] /* load 8 bytes of the Src */
6401  dec esi
6402  add esi, eax /* move Src pointer 1 row below */
6403  movq mm3, [edx] /* load 4 words of Kernel */
6404  add edx, 8 /* move pointer to other 4 words */
6405  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6406  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6407  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6408  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6409  /* --- 7 */
6410  movq mm1, [esi] /* load 8 bytes of the Src */
6411  movq mm2, mm1 /* copy MM1 into MM2 */
6412  inc esi /* move pointer to the next 8 bytes of Src */
6413  movq mm3, [edx] /* load 4 words of Kernel */
6414  add edx, 8 /* move pointer to other 4 words */
6415  movq mm4, [edx] /* load 4 words of Kernel */
6416  add edx, 8 /* move pointer to other 4 words */
6417  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6418  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6419  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6420  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6421  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6422  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6423  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6424  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6425  movq mm1, [esi] /* load 8 bytes of the Src */
6426  dec esi
6427  add esi, eax /* move Src pointer 1 row below */
6428  movq mm3, [edx] /* load 4 words of Kernel */
6429  add edx, 8 /* move pointer to other 4 words */
6430  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6431  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6432  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6433  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6434  /* --- 8 */
6435  movq mm1, [esi] /* load 8 bytes of the Src */
6436  movq mm2, mm1 /* copy MM1 into MM2 */
6437  inc esi /* move pointer to the next 8 bytes of Src */
6438  movq mm3, [edx] /* load 4 words of Kernel */
6439  add edx, 8 /* move pointer to other 4 words */
6440  movq mm4, [edx] /* load 4 words of Kernel */
6441  add edx, 8 /* move pointer to other 4 words */
6442  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6443  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6444  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6445  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6446  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6447  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6448  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6449  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6450  movq mm1, [esi] /* load 8 bytes of the Src */
6451  dec esi
6452  add esi, eax /* move Src pointer 1 row below */
6453  movq mm3, [edx] /* load 4 words of Kernel */
6454  add edx, 8 /* move pointer to other 4 words */
6455  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6456  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6457  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6458  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6459  /* --- 9 */
6460  movq mm1, [esi] /* load 8 bytes of the Src */
6461  movq mm2, mm1 /* copy MM1 into MM2 */
6462  inc esi /* move pointer to the next 8 bytes of Src */
6463  movq mm3, [edx] /* load 4 words of Kernel */
6464  add edx, 8 /* move pointer to other 4 words */
6465  movq mm4, [edx] /* load 4 words of Kernel */
6466  add edx, 8 /* move pointer to other 4 words */
6467  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6468  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6469  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6470  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6471  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6472  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6473  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6474  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6475  movq mm1, [esi] /* load 8 bytes of the Src */
6476  movq mm3, [edx] /* load 4 words of Kernel */
6477  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6478  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6479  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6480  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6481  /* ---, */
6482  movq mm3, mm7 /* copy MM7 into MM3 */
6483  psrlq mm7, 32 /* shift 2 left words to the right */
6484  paddsw mm7, mm3 /* add 2 left and 2 right result words */
6485  movq mm2, mm7 /* copy MM7 into MM2 */
6486  psrlq mm7, 16 /* shift 1 left word to the right */
6487  paddsw mm7, mm2 /* add 1 left and 1 right result words */
6488  movd mm1, eax /* save EAX in MM1 */
6489  packuswb mm7, mm0 /* pack division result with saturation */
6490  movd eax, mm7 /* copy saturated result into EAX */
6491  mov [edi], al /* copy a byte result into Dest */
6492  movd eax, mm1 /* restore saved EAX */
6493  /* --, */
6494  movd esi, mm6 /* move Src pointer to the top pixel */
6495  sub edx, 208 /* EDX = Kernel address */
6496  inc esi /* move Src pointer to the next pixel */
6497  inc edi /* move Dest pointer to the next pixel */
6498  /* ---, */
6499  dec ecx /* decrease loop counter COLUMNS */
6500  jnz L10392 /* check loop termination, proceed if required */
6501  add esi, 8 /* move to the next row in Src */
6502  add edi, 8 /* move to the next row in Dest */
6503  dec ebx /* decrease loop counter ROWS */
6504  jnz L10390 /* check loop termination, proceed if required */
6505  /* ---, */
6506  emms /* exit MMX state */
6507  popa
6508  }
6509 #else
6510  asm volatile
6511  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6512  "xor %%ebx, %%ebx \n\t" /* zero EBX */
6513  "mov %5, %%bl \n\t" /* load NRightShift into BL */
6514  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6515  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6516  "mov %1, %%esi \n\t" /* load Src address to ESI */
6517  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6518  "add $4, %%edi \n\t" /* 4 column offset from the left edge */
6519  "mov %3, %%eax \n\t" /* load columns into EAX */
6520  "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
6521  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6522  "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
6523  /* --- */
6524  ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6525  "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
6526  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6527  ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6528  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6529  /* --- 1 */
6530  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6531  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6532  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6533  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6534  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6535  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6536  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6537  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6538  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6539  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6540  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6541  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6542  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6543  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6544  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6545  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6546  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6547  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6548  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6549  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6550  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6551  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6552  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6553  /* --- 2 */
6554  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6555  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6556  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6557  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6558  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6559  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6560  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6561  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6562  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6563  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6564  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6565  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6566  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6567  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6568  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6569  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6570  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6571  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6572  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6573  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6574  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6575  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6576  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6577  /* --- 3 */
6578  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6579  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6580  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6581  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6582  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6583  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6584  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6585  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6586  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6587  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6588  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6589  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6590  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6591  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6592  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6593  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6594  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6595  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6596  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6597  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6598  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6599  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6600  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6601  /* --- 4 */
6602  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6603  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6604  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6605  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6606  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6607  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6608  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6609  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6610  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6611  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6612  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6613  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6614  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6615  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6616  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6617  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6618  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6619  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6620  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6621  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6622  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6623  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6624  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6625  /* --- 5 */
6626  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6627  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6628  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6629  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6630  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6631  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6632  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6633  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6634  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6635  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6636  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6637  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6638  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6639  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6640  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6641  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6642  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6643  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6644  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6645  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6646  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6647  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6648  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6649  /* --- 6 */
6650  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6651  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6652  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6653  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6654  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6655  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6656  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6657  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6658  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6659  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6660  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6661  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6662  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6663  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6664  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6665  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6666  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6667  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6668  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6669  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6670  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6671  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6672  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6673  /* --- 7 */
6674  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6675  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6676  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6677  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6678  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6679  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6680  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6681  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6682  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6683  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6684  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6685  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6686  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6687  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6688  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6689  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6690  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6691  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6692  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6693  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6694  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6695  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6696  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6697  /* --- 8 */
6698  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6699  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6700  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6701  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6702  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6703  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6704  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6705  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6706  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6707  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6708  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6709  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6710  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6711  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6712  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6713  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6714  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6715  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6716  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6717  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6718  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6719  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6720  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6721  /* --- 9 */
6722  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6723  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6724  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6725  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6726  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6727  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6728  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6729  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6730  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6731  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6732  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6733  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6734  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6735  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6736  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6737  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6738  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6739  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6740  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6741  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6742  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6743  /* --- */
6744  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6745  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6746  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6747  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6748  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6749  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6750  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6751  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
6752  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6753  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6754  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6755  /* -- */
6756  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6757  "sub $208, %%edx \n\t" /* EDX = Kernel address */
6758  "inc %%esi \n\t" /* move Src pointer to the next pixel */
6759  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6760  /* --- */
6761  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6762  "jnz .L10392 \n\t" /* check loop termination, proceed if required */
6763  "add $8, %%esi \n\t" /* move to the next row in Src */
6764  "add $8, %%edi \n\t" /* move to the next row in Dest */
6765  "dec %%ebx \n\t" /* decrease loop counter ROWS */
6766  "jnz .L10390 \n\t" /* check loop termination, proceed if required */
6767  /* --- */
6768  "emms \n\t" /* exit MMX state */
6769  "popa \n\t":"=m" (Dest) /* %0 */
6770  :"m"(Src), /* %1 */
6771  "m"(rows), /* %2 */
6772  "m"(columns), /* %3 */
6773  "m"(Kernel), /* %4 */
6774  "m"(NRightShift) /* %5 */
6775  );
6776 #endif
6777 #endif
6778  return (0);
6779  } else {
6780  /* No non-MMX implementation yet */
6781  return (-1);
6782  }
6783 }
6784 
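/*
 * Reference sketch (plain C, illustrative only) of the per-pixel arithmetic the
 * MMX path above implements: every interior pixel (a 4-pixel border is skipped)
 * is shifted right by NRightShift, weighted by the 9x9 kernel, summed, and
 * stored with unsigned-byte saturation. The sketch assumes a dense 81-entry
 * kernel array and clamps once at the end, whereas the MMX code reads the
 * kernel in its own padded layout and accumulates with 16-bit saturating adds,
 * so results can differ when intermediate sums overflow.
 */
static void convolve9x9_shift_reference(const unsigned char *Src, unsigned char *Dest,
                                        int rows, int columns,
                                        const signed short *Kernel, unsigned char NRightShift)
{
    int x, y, kx, ky;

    for (y = 4; y < rows - 4; y++) {
        for (x = 4; x < columns - 4; x++) {
            int sum = 0;
            for (ky = 0; ky < 9; ky++) {
                for (kx = 0; kx < 9; kx++) {
                    int pixel = Src[(y + ky - 4) * columns + (x + kx - 4)] >> NRightShift;
                    sum += pixel * Kernel[ky * 9 + kx];        /* weight and accumulate */
                }
            }
            if (sum < 0) sum = 0;                              /* saturate to 0..255 */
            if (sum > 255) sum = 255;
            Dest[y * columns + x] = (unsigned char)sum;
        }
    }
}
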
6785 /* ------------------------------------------------------------------------------------ */
6786 
6799 int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
6800 {
6801  /* Validate input parameters */
6802  if ((Src == NULL) || (Dest == NULL))
6803  return(-1);
6804 
6805  if ((columns < 8) || (rows < 3))
6806  return (-1);
6807 
6808  if ((SDL_imageFilterMMXdetect())) {
6809 //#ifdef USE_MMX
6810 #if defined(USE_MMX) && defined(i386)
6811 #if !defined(GCC__)
6812  __asm
6813  {
6814  pusha
6815  pxor mm0, mm0 /* zero MM0 */
6816  mov eax, columns /* load columns into EAX */
6817  /* ---, */
6818  mov esi, Src /* ESI = Src row 0 address */
6819  mov edi, Dest /* load Dest address to EDI */
6820  add edi, eax /* EDI = EDI + columns */
6821  inc edi /* 1 byte offset from the left edge */
6822  mov edx, rows /* initialize ROWS counter */
6823  sub edx, 2 /* do not use first and last rows */
6824  /* ---, */
6825 L10400:
6826  mov ecx, eax /* initialize COLUMNS counter */
6827  shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
6828  mov ebx, esi /* save ESI in EBX */
6829  movd mm1, edi /* save EDI in MM1 */
6830  align 16 /* 16 byte alignment of the loop entry */
6831 L10402:
6832  /* ---, */
6833  movq mm4, [esi] /* load 8 bytes from Src */
6834  movq mm5, mm4 /* save MM4 in MM5 */
6835  add esi, 2 /* move ESI pointer 2 bytes right */
6836  punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
6837  punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
6838  movq mm6, [esi] /* load 8 bytes from Src */
6839  movq mm7, mm6 /* save MM6 in MM7 */
6840  sub esi, 2 /* move ESI pointer back 2 bytes left */
6841  punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
6842  punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
6843  add esi, eax /* move to the next row of Src */
6844  movq mm2, [esi] /* load 8 bytes from Src */
6845  movq mm3, mm2 /* save MM2 in MM3 */
6846  add esi, 2 /* move ESI pointer 2 bytes right */
6847  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6848  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6849  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6850  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6851  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6852  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6853  movq mm2, [esi] /* load 8 bytes from Src */
6854  movq mm3, mm2 /* save MM2 in MM3 */
6855  sub esi, 2 /* move ESI pointer back 2 bytes left */
6856  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6857  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6858  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6859  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6860  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6861  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6862  add esi, eax /* move to the next row of Src */
6863  movq mm2, [esi] /* load 8 bytes from Src */
6864  movq mm3, mm2 /* save MM2 in MM3 */
6865  add esi, 2 /* move ESI pointer 2 bytes right */
6866  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6867  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6868  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6869  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6870  movq mm2, [esi] /* load 8 bytes from Src */
6871  movq mm3, mm2 /* save MM2 in MM3 */
6872  sub esi, 2 /* move ESI pointer back 2 bytes left */
6873  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6874  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6875  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6876  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6877  /* ---, */
6878  movq mm2, mm4 /* copy MM4 into MM2 */
6879  psrlq mm4, 32 /* shift 2 left words to the right */
6880  psubw mm4, mm2 /* MM4 = MM4 - MM2 */
6881  movq mm3, mm6 /* copy MM6 into MM3 */
6882  psrlq mm6, 32 /* shift 2 left words to the right */
6883  psubw mm6, mm3 /* MM6 = MM6 - MM3 */
6884  punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
6885  movq mm2, mm5 /* copy MM5 into MM2 */
6886  psrlq mm5, 32 /* shift 2 left words to the right */
6887  psubw mm5, mm2 /* MM5 = MM5 - MM2 */
6888  movq mm3, mm7 /* copy MM7 into MM3 */
6889  psrlq mm7, 32 /* shift 2 left words to the right */
6890  psubw mm7, mm3 /* MM7 = MM7 - MM3 */
6891  punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
6892  /* Take abs values of MM4 and MM5 */
6893  movq mm6, mm4 /* copy MM4 into MM6 */
6894  movq mm7, mm5 /* copy MM5 into MM7 */
6895  psraw mm6, 15 /* fill MM6 words with word sign bit */
6896  psraw mm7, 15 /* fill MM7 words with word sign bit */
6897  pxor mm4, mm6 /* take 1's complement of only neg words */
6898  pxor mm5, mm7 /* take 1's complement of only neg words */
6899  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
6900  psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
6901  packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
6902  movq [edi], mm4 /* store result in Dest */
6903  /* ---, */
6904  sub esi, eax /* move to the current top row in Src */
6905  sub esi, eax
6906  add esi, 8 /* move Src pointer to the next 8 pixels */
6907  add edi, 8 /* move Dest pointer to the next 8 pixels */
6908  /* ---, */
6909  dec ecx /* decrease loop counter COLUMNS */
6910  jnz L10402 /* check loop termination, proceed if required */
6911  mov esi, ebx /* restore leftmost address of the current Src row */
6912  movd edi, mm1 /* restore leftmost address of the current Dest row */
6913  add esi, eax /* move to the next row in Src */
6914  add edi, eax /* move to the next row in Dest */
6915  dec edx /* decrease loop counter ROWS */
6916  jnz L10400 /* check loop termination, proceed if required */
6917  /* ---, */
6918  emms /* exit MMX state */
6919  popa
6920  }
6921 #else
6922  asm volatile
6923  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6924  "mov %3, %%eax \n\t" /* load columns into EAX */
6925  /* --- */
6926  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
6927  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6928  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
6929  "inc %%edi \n\t" /* 1 byte offset from the left edge */
6930  "mov %2, %%edx \n\t" /* initialize ROWS counter */
6931  "sub $2, %%edx \n\t" /* do not use first and last rows */
6932  /* --- */
6933  ".L10400: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6934  "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
6935  "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
6936  "movd %%edi, %%mm1 \n\t" /* save EDI in MM1 */
6937  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6938  ".L10402: \n\t"
6939  /* --- */
6940  "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
6941  "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
6942  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6943  "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
6944  "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
6945  "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
6946  "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
6947  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6948  "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
6949  "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
6950  "add %%eax, %%esi \n\t" /* move to the next row of Src */
6951  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6952  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6953  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6954  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6955  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6956  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6957  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6958  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6959  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6960  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6961  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6962  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6963  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6964  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6965  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6966  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6967  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6968  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6969  "add %%eax, %%esi \n\t" /* move to the next row of Src */
6970  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6971  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6972  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6973  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6974  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6975  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6976  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6977  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6978  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6979  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6980  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6981  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6982  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6983  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6984  /* --- */
6985  "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
6986  "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
6987  "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
6988  "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
6989  "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
6990  "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
6991  "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
6992  "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
6993  "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
6994  "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
6995  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6996  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6997  "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
6998  "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
6999  /* Take abs values of MM4 and MM5 */
7000  "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7001  "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7002  "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7003  "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7004  "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7005  "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7006  "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7007  "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7008  "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7009  "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7010  /* --- */
7011  "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7012  "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7013  "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7014  /* --- */
7015  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7016  "jnz .L10402 \n\t" /* check loop termination, proceed if required */
7017  "mov %%ebx, %%esi \n\t" /* restore leftmost address of the current Src row */
7018  "movd %%mm1, %%edi \n\t" /* restore leftmost address of the current Dest row */
7019  "add %%eax, %%esi \n\t" /* move to the next row in Src */
7020  "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7021  "dec %%edx \n\t" /* decrease loop counter ROWS */
7022  "jnz .L10400 \n\t" /* check loop termination, proceed if required */
7023  /* --- */
7024  "emms \n\t" /* exit MMX state */
7025  "popa \n\t":"=m" (Dest) /* %0 */
7026  :"m"(Src), /* %1 */
7027  "m"(rows), /* %2 */
7028  "m"(columns) /* %3 */
7029  );
7030 #endif
7031 #endif
7032  return (0);
7033  } else {
7034  /* No non-MMX implementation yet */
7035  return (-1);
7036  }
7037 }
7038 
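/*
 * Reference sketch (plain C, illustrative only) of the result the MMX loop in
 * SDL_imageFilterSobelX produces: for each interior pixel, the absolute
 * horizontal Sobel response (column sums weighted 1-2-1 vertically, right
 * column minus left column), saturated to 255. The MMX path processes 8 output
 * pixels per iteration and leaves the 1-pixel image border untouched, so edge
 * handling of the two versions is not byte-for-byte identical.
 */
static void sobelx_reference(const unsigned char *Src, unsigned char *Dest,
                             int rows, int columns)
{
    int x, y;

    for (y = 1; y < rows - 1; y++) {
        for (x = 1; x < columns - 1; x++) {
            int left  = Src[(y - 1) * columns + (x - 1)]
                      + 2 * Src[y * columns + (x - 1)]
                      + Src[(y + 1) * columns + (x - 1)];
            int right = Src[(y - 1) * columns + (x + 1)]
                      + 2 * Src[y * columns + (x + 1)]
                      + Src[(y + 1) * columns + (x + 1)];
            int g = right - left;                    /* horizontal Sobel response */
            if (g < 0) g = -g;                       /* absolute value */
            if (g > 255) g = 255;                    /* saturate to an unsigned byte */
            Dest[y * columns + x] = (unsigned char)g;
        }
    }
}
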
7052 int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
7053  unsigned char NRightShift)
7054 {
7055  /* Validate input parameters */
7056  if ((Src == NULL) || (Dest == NULL))
7057  return(-1);
7058  if ((columns < 8) || (rows < 3) || (NRightShift > 7))
7059  return (-1);
7060 
7061  if ((SDL_imageFilterMMXdetect())) {
7062 //#ifdef USE_MMX
7063 #if defined(USE_MMX) && defined(i386)
7064 #if !defined(GCC__)
7065  __asm
7066  {
7067  pusha
7068  pxor mm0, mm0 /* zero MM0 */
7069  mov eax, columns /* load columns into EAX */
7070  xor ebx, ebx /* zero EBX */
7071  mov bl, NRightShift /* load NRightShift into BL */
7072  movd mm1, ebx /* copy NRightShift into MM1 */
7073  /* ---, */
7074  mov esi, Src /* ESI = Src row 0 address */
7075  mov edi, Dest /* load Dest address to EDI */
7076  add edi, eax /* EDI = EDI + columns */
7077  inc edi /* 1 byte offset from the left edge */
7078  /* initialize ROWS counter */
7079  sub rows, 2 /* do not use first and last rows */
7080  /* ---, */
7081 L10410:
7082  mov ecx, eax /* initialize COLUMNS counter */
7083  shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
7084  mov ebx, esi /* save ESI in EBX */
7085  mov edx, edi /* save EDI in EDX */
7086  align 16 /* 16 byte alignment of the loop entry */
7087 L10412:
7088  /* ---, */
7089  movq mm4, [esi] /* load 8 bytes from Src */
7090  movq mm5, mm4 /* save MM4 in MM5 */
7091  add esi, 2 /* move ESI pointer 2 bytes right */
7092  punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
7093  punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
7094  psrlw mm4, mm1 /* shift right each pixel NshiftRight times */
7095  psrlw mm5, mm1 /* shift right each pixel NshiftRight times */
7096  movq mm6, [esi] /* load 8 bytes from Src */
7097  movq mm7, mm6 /* save MM6 in MM7 */
7098  sub esi, 2 /* move ESI pointer back 2 bytes left */
7099  punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
7100  punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
7101  psrlw mm6, mm1 /* shift right each pixel NshiftRight times */
7102  psrlw mm7, mm1 /* shift right each pixel NshiftRight times */
7103  add esi, eax /* move to the next row of Src */
7104  movq mm2, [esi] /* load 8 bytes from Src */
7105  movq mm3, mm2 /* save MM2 in MM3 */
7106  add esi, 2 /* move ESI pointer 2 bytes right */
7107  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7108  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7109  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7110  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7111  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7112  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7113  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7114  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7115  movq mm2, [esi] /* load 8 bytes from Src */
7116  movq mm3, mm2 /* save MM2 in MM3 */
7117  sub esi, 2 /* move ESI pointer back 2 bytes left */
7118  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7119  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7120  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7121  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7122  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7123  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7124  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7125  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7126  add esi, eax /* move to the next row of Src */
7127  movq mm2, [esi] /* load 8 bytes from Src */
7128  movq mm3, mm2 /* save MM2 in MM3 */
7129  add esi, 2 /* move ESI pointer 2 bytes right */
7130  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7131  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7132  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7133  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7134  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7135  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7136  movq mm2, [esi] /* load 8 bytes from Src */
7137  movq mm3, mm2 /* save MM2 in MM3 */
7138  sub esi, 2 /* move ESI pointer back 2 bytes left */
7139  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7140  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7141  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7142  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7143  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7144  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7145  /* ---, */
7146  movq mm2, mm4 /* copy MM4 into MM2 */
7147  psrlq mm4, 32 /* shift 2 left words to the right */
7148  psubw mm4, mm2 /* MM4 = MM4 - MM2 */
7149  movq mm3, mm6 /* copy MM6 into MM3 */
7150  psrlq mm6, 32 /* shift 2 left words to the right */
7151  psubw mm6, mm3 /* MM6 = MM6 - MM3 */
7152  punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
7153  movq mm2, mm5 /* copy MM5 into MM2 */
7154  psrlq mm5, 32 /* shift 2 left words to the right */
7155  psubw mm5, mm2 /* MM5 = MM5 - MM2 */
7156  movq mm3, mm7 /* copy MM7 into MM3 */
7157  psrlq mm7, 32 /* shift 2 left words to the right */
7158  psubw mm7, mm3 /* MM7 = MM7 - MM3 */
7159  punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
7160  /* Take abs values of MM4 and MM5 */
7161  movq mm6, mm4 /* copy MM4 into MM6 */
7162  movq mm7, mm5 /* copy MM5 into MM7 */
7163  psraw mm6, 15 /* fill MM6 words with word sign bit */
7164  psraw mm7, 15 /* fill MM7 words with word sign bit */
7165  pxor mm4, mm6 /* take 1's complement of only neg words */
7166  pxor mm5, mm7 /* take 1's complement of only neg words */
7167  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
7168  psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
7169  packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
7170  movq [edi], mm4 /* store result in Dest */
7171  /* ---, */
7172  sub esi, eax /* move to the current top row in Src */
7173  sub esi, eax
7174  add esi, 8 /* move Src pointer to the next 8 pixels */
7175  add edi, 8 /* move Dest pointer to the next 8 pixels */
7176  /* ---, */
7177  dec ecx /* decrease loop counter COLUMNS */
7178  jnz L10412 /* check loop termination, proceed if required */
7179  mov esi, ebx /* restore leftmost address of the current Src row */
7180  mov edi, edx /* restore leftmost address of the current Dest row */
7181  add esi, eax /* move to the next row in Src */
7182  add edi, eax /* move to the next row in Dest */
7183  dec rows /* decrease loop counter ROWS */
7184  jnz L10410 /* check loop termination, proceed if required */
7185  /* ---, */
7186  emms /* exit MMX state */
7187  popa
7188  }
7189 #else
7190  asm volatile
7191  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
7192  "mov %3, %%eax \n\t" /* load columns into EAX */
7193  "xor %%ebx, %%ebx \n\t" /* zero EBX */
7194  "mov %4, %%bl \n\t" /* load NRightShift into BL */
7195  "movd %%ebx, %%mm1 \n\t" /* copy NRightShift into MM1 */
7196  /* --- */
7197  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
7198  "mov %0, %%edi \n\t" /* load Dest address to EDI */
7199  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
7200  "inc %%edi \n\t" /* 1 byte offset from the left edge */
7201  /* initialize ROWS counter */
7202  "subl $2, %2 \n\t" /* do not use first and last rows */
7203  /* --- */
7204  ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
7205  "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
7206  "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
7207  "mov %%edi, %%edx \n\t" /* save EDI in EDX */
7208  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
7209  ".L10412: \n\t"
7210  /* --- */
7211  "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
7212  "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
7213  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7214  "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
7215  "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
7216  "psrlw %%mm1, %%mm4 \n\t" /* shift right each pixel NshiftRight times */
7217  "psrlw %%mm1, %%mm5 \n\t" /* shift right each pixel NshiftRight times */
7218  "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
7219  "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
7220  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7221  "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
7222  "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
7223  "psrlw %%mm1, %%mm6 \n\t" /* shift right each pixel NshiftRight times */
7224  "psrlw %%mm1, %%mm7 \n\t" /* shift right each pixel NshiftRight times */
7225  "add %%eax, %%esi \n\t" /* move to the next row of Src */
7226  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7227  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7228  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7229  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7230  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7231  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7232  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7233  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7234  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7235  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7236  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7237  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7238  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7239  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7240  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7241  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7242  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7243  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7244  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7245  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7246  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7247  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7248  "add %%eax, %%esi \n\t" /* move to the next row of Src */
7249  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7250  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7251  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7252  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7253  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7254  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7255  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7256  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7257  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7258  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7259  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7260  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7261  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7262  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7263  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7264  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7265  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7266  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7267  /* --- */
7268  "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
7269  "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
7270  "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
7271  "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
7272  "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
7273  "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
7274  "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
7275  "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
7276  "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
7277  "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
7278  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
7279  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
7280  "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
7281  "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
7282  /* Take abs values of MM4 and MM5 */
7283  "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7284  "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7285  "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7286  "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7287  "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7288  "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7289  "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7290  "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7291  "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7292  "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7293  /* --- */
7294  "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7295  "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7296  "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7297  /* --- */
7298  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7299  "jnz .L10412 \n\t" /* check loop termination, proceed if required */
7300  "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
7301  "mov %%edx, %%edi \n\t" /* restore most left current row Dest address */
7302  "add %%eax, %%esi \n\t" /* move to the next row in Src */
7303  "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7304  "decl %2 \n\t" /* decrease loop counter ROWS */
7305  "jnz .L10410 \n\t" /* check loop termination, proceed if required */
7306  /* --- */
7307  "emms \n\t" /* exit MMX state */
7308  "popa \n\t":"=m" (Dest) /* %0 */
7309  :"m"(Src), /* %1 */
7310  "m"(rows), /* %2 */
7311  "m"(columns), /* %3 */
7312  "m"(NRightShift) /* %4 */
7313  );
7314 #endif
7315 #endif
7316  return (0);
7317  } else {
7318  /* No non-MMX implementation yet */
7319  return (-1);
7320  }
7321 }
7322 
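As a side note (not part of the library source): the function above has no non-MMX fallback and simply returns -1, so a rough plain-C sketch of what the MMX loop computes per pixel may help. It assumes the standard Sobel-X column weights (1, 2, 1), with every source byte right-shifted by NRightShift before the sums are formed, matching the psrlw instructions above.

/* Hedged sketch only -- not library code.  Plain-C per-pixel equivalent of the
   SobelXShiftRight MMX loop above, assuming standard Sobel-X weights. */
static void sobelx_shiftright_sketch(const unsigned char *Src, unsigned char *Dest,
                                     int rows, int columns, unsigned char NRightShift)
{
    int r, c;
    for (r = 1; r < rows - 1; r++) {
        for (c = 1; c < columns - 1; c++) {
            const unsigned char *p = Src + (long)r * columns + c;
            /* Each source byte is shifted right first, as psrlw does above. */
            int right = (p[-columns + 1] >> NRightShift)
                      + 2 * (p[1] >> NRightShift)
                      + (p[columns + 1] >> NRightShift);
            int left  = (p[-columns - 1] >> NRightShift)
                      + 2 * (p[-1] >> NRightShift)
                      + (p[columns - 1] >> NRightShift);
            int d = right - left;
            if (d < 0) d = -d;    /* absolute value, like the sign-bit trick above */
            Dest[(long)r * columns + c] = (unsigned char)(d > 255 ? 255 : d);
        }
    }
}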
7326 void SDL_imageFilterAlignStack(void)
7327 {
7328 #ifdef USE_MMX
7329 #if !defined(GCC__)
7330  __asm
7331  { /* --- stack alignment --- */
7332  mov ebx, esp /* load ESP into EBX */
7333  sub ebx, 4 /* reserve space on stack for old value of ESP */
7334  and ebx, -32 /* align EBX along a 32 byte boundary */
7335  mov [ebx], esp /* save old value of ESP in stack, behind the bndry */
7336  mov esp, ebx /* align ESP along a 32 byte boundary */
7337  }
7338 #else
7339  asm volatile
7340  ( /* --- stack alignment --- */
7341  "mov %%esp, %%ebx \n\t" /* load ESP into EBX */
7342  "sub $4, %%ebx \n\t" /* reserve space on stack for old value of ESP */
7343  "and $-32, %%ebx \n\t" /* align EBX along a 32 byte boundary */
7344  "mov %%esp, (%%ebx) \n\t" /* save old value of ESP in stack, behind the bndry */
7345  "mov %%ebx, %%esp \n\t" /* align ESP along a 32 byte boundary */
7346  ::);
7347 #endif
7348 #endif
7349 }
7350 
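The inline assembly above rounds ESP down with "and $-32", the classic mask-to-power-of-two alignment trick. For illustration only (this helper is not part of the library), the same rounding can be expressed in portable C for an ordinary pointer:

/* Illustrative sketch of the '& -32' rounding used above, applied to a plain
   pointer instead of the stack pointer (not library code). */
#include <stdint.h>

static void *round_down_to_32(void *p)
{
    uintptr_t addr = (uintptr_t)p;
    return (void *)(addr & ~(uintptr_t)31);   /* clear the low 5 bits: 32-byte boundary */
}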
7354 void SDL_imageFilterRestoreStack(void)
7355 {
7356 #ifdef USE_MMX
7357 #if !defined(GCC__)
7358  __asm
7359  { /* --- restoring old stack --- */
7360  mov ebx, [esp] /* load old value of ESP */
7361  mov esp, ebx /* restore old value of ESP */
7362  }
7363 #else
7364  asm volatile
7365  ( /* --- restoring old stack --- */
7366  "mov (%%esp), %%ebx \n\t" /* load old value of ESP */
7367  "mov %%ebx, %%esp \n\t" /* restore old value of ESP */
7368  ::);
7369 #endif
7370 #endif
7371 }
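Assumption, not stated in this file: the two helpers above appear intended to be used as a pair, aligning the stack before an MMX filter call and restoring it afterwards. A minimal sketch of that pairing, using signatures from the reference list below:

/* Hedged usage sketch (assumed pairing; verify against the library docs). */
static void convolve_with_aligned_stack(unsigned char *src, unsigned char *dst,
                                        int rows, int columns, signed short *kernel)
{
    SDL_imageFilterAlignStack();                 /* align ESP to a 32-byte boundary */
    (void)SDL_imageFilterConvolveKernel3x3Divide(src, dst, rows, columns, kernel, 9);
    SDL_imageFilterRestoreStack();               /* restore the saved ESP */
}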
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
Filter using SobelX: Dij = saturation255( ... )
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using MultByByte: D = saturation255(S * C)
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N, unsigned char C)
Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, unsigned char NRightShift)
Filter using SobelXShiftRight: Dij = saturation255( ... )
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Div: D = S1 / S2.
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftUint: D = ((uint)S << N)
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
void SDL_imageFilterRestoreStack(void)
Restore previously aligned stack.
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
Internal ASM Filter using MultNor: D = S1 * S2.
void SDL_imageFilterMMXon()
Enable MMX check for filter functions and use MMX code if available.
int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... )
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Sub: D = saturation0(S1 - S2)
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftByte: D = (S << N)
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using AbsDiff: D = | S1 - S2 |.
int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitOr: D = S1 | S2.
void SDL_imageFilterMMXoff()
Disable MMX check for filter functions and force use of non-MMX C-based code.
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultNor: D = S1 * S2.
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
Internal MMX Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRight: D = saturation0(S >> N)
int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... )
#define SWAP_32(x)
Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.).
int SDL_imageFilterMMXdetect(void)
MMX detection routine (with override flag).
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby2: D = saturation255(S1/2 * S2)
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByte: D = saturation255(S + C)
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitAnd: D = S1 & S2.
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0.
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter ShiftLeft: D = saturation255(S << N)
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Add: D = saturation255(S1 + S2)
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin, int Nmax)
Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin, unsigned char Tmax)
Filter using ClipToRange: D = (S >= Tmin) && (S <= Tmax) ? S : (S < Tmin ? Tmin : Tmax).
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByteToHalf: D = saturation255(S/2 + C)
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
Filter using BitNegation: D = ~S (bitwise NOT).
int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mean: D = S1/2 + S2/2.
int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mult: D = saturation255(S1 * S2)
void SDL_imageFilterAlignStack(void)
Align stack to a 32-byte boundary.
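To close, a minimal usage sketch of the byte-filter API summarized above; the buffer names are illustrative only, and error handling is reduced to the documented 0 / -1 return codes.

/* Minimal, hedged usage sketch of the filters listed above (illustrative only). */
#include "SDL2_imageFilter.h"

int blend_add(unsigned char *a, unsigned char *b, unsigned char *out, unsigned int len)
{
    if (!SDL_imageFilterMMXdetect()) {
        /* MMX unavailable or disabled via SDL_imageFilterMMXoff(); filters
           that have a C fallback still work and return 0. */
    }
    /* D = saturation255(S1 + S2), as documented for SDL_imageFilterAdd above */
    return SDL_imageFilterAdd(a, b, out, len);   /* 0 on success, -1 on error */
}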