SDL2_gfx  1.0.2
Graphics primitives and surface functions for SDL2
SDL2_imageFilter.c
1 /*
2 
3 SDL2_imageFilter.c: byte-image "filter" routines
4 
5 Copyright (C) 2012-2014 Andreas Schiffler
6 Copyright (C) 2013 Sylvain Beucler
7 
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the authors be held liable for any damages
10 arising from the use of this software.
11 
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
15 
16  1. The origin of this software must not be misrepresented; you must not
17  claim that you wrote the original software. If you use this software
18  in a product, an acknowledgment in the product documentation would be
19  appreciated but is not required.
20 
21  2. Altered source versions must be plainly marked as such, and must not be
22  misrepresented as being the original software.
23 
24  3. This notice may not be removed or altered from any source
25  distribution.
26 
27 Andreas Schiffler -- aschiffler at ferzkopp dot net
28 
29 */
30 
31 /*
32 
33 Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
34 
35 Note: Most of the MMX code is based on published routines
36 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
37 him for his work.
38 
39 */
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 
45 #include "SDL.h"
46 
47 /* Use GCC intrinsics if available: they support both i386 and x86_64,
48  provide ASM-grade performance, and avoid the PUSHA/POPA issues. */
49 #ifdef __GNUC__
50 # ifdef USE_MMX
51 # include <mmintrin.h>
52 # endif
53 # include <SDL_cpuinfo.h>
54 #endif
55 
56 #include "SDL2_imageFilter.h"
57 
61 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24))
62 
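For illustration only (editor's sketch, not part of the library source): SWAP_32 reverses the byte order of a 32-bit value; SDL_imageFilterAddUint below uses it to build the second half of its MMX constant. The variable names are arbitrary.

unsigned int v = 0x11223344;
unsigned int w = SWAP_32(v);   /* w == 0x44332211 */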
63 /* ------ Static variables ----- */
64 
68 static int SDL_imageFilterUseMMX = 1;
69 
70 /* Detect GCC */
71 #if defined(__GNUC__)
72 #define GCC__
73 #endif
74 
80 int SDL_imageFilterMMXdetect(void)
81 {
82  /* Check override flag */
83  if (SDL_imageFilterUseMMX == 0) {
84  return (0);
85  }
86 
87  return SDL_HasMMX();
88 }
89 
93 void SDL_imageFilterMMXoff()
94 {
95  SDL_imageFilterUseMMX = 0;
96 }
97 
101 void SDL_imageFilterMMXon()
102 {
103  SDL_imageFilterUseMMX = 1;
104 }
105 
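A minimal usage sketch (editor's illustration, not part of this file): the two override functions above let a caller force the portable C loops even when SDL_HasMMX() reports MMX support. The wrapper name and buffer parameters are arbitrary.

#include "SDL2_imageFilter.h"

void add_without_mmx(unsigned char *src1, unsigned char *src2,
                     unsigned char *dest, unsigned int length)
{
	SDL_imageFilterMMXoff();                      /* force the C fallback path */
	SDL_imageFilterAdd(src1, src2, dest, length);
	SDL_imageFilterMMXon();                       /* restore MMX autodetection */
}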
106 /* ------------------------------------------------------------------------------------ */
107 
118 static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
119 {
120 #ifdef USE_MMX
121 #if !defined(GCC__)
122  __asm
123  {
124  pusha
125  mov eax, Src1 /* load Src1 address into eax */
126  mov ebx, Src2 /* load Src2 address into ebx */
127  mov edi, Dest /* load Dest address into edi */
128  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
129  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
130  align 16 /* 16 byte alignment of the loop entry */
131 L1010:
132  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
133  paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */
134  movq [edi], mm1 /* store result in Dest */
135  add eax, 8 /* increase Src1, Src2 and Dest */
136  add ebx, 8 /* register pointers by 8 */
137  add edi, 8
138  dec ecx /* decrease loop counter */
139  jnz L1010 /* check loop termination, proceed if required */
140  emms /* exit MMX state */
141  popa
142  }
143 #else
144  /* i386 and x86_64 */
145  __m64 *mSrc1 = (__m64*)Src1;
146  __m64 *mSrc2 = (__m64*)Src2;
147  __m64 *mDest = (__m64*)Dest;
148  int i;
149  for (i = 0; i < SrcLength/8; i++) {
150  *mDest = _m_paddusb(*mSrc1, *mSrc2); /* Src1+Src2 (add 8 bytes with saturation) */
151  mSrc1++;
152  mSrc2++;
153  mDest++;
154  }
155  _m_empty(); /* clean MMX state */
156 #endif
157  return (0);
158 #else
159  return (-1);
160 #endif
161 }
162 
173 int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
174 {
175  unsigned int i, istart;
176  unsigned char *cursrc1, *cursrc2, *curdst;
177  int result;
178 
179  /* Validate input parameters */
180  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
181  return(-1);
182  if (length == 0)
183  return(0);
184 
185  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
186 
187  /* Use MMX assembly routine */
188  SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
189 
190  /* Check for unaligned bytes */
191  if ((length & 7) > 0) {
192  /* Setup to process unaligned bytes */
193  istart = length & 0xfffffff8;
194  cursrc1 = &Src1[istart];
195  cursrc2 = &Src2[istart];
196  curdst = &Dest[istart];
197  } else {
198  /* No unaligned bytes - we are done */
199  return (0);
200  }
201  } else {
202  /* Setup to process whole image */
203  istart = 0;
204  cursrc1 = Src1;
205  cursrc2 = Src2;
206  curdst = Dest;
207  }
208 
209  /* C routine to process image */
210  for (i = istart; i < length; i++) {
211  result = (int) *cursrc1 + (int) *cursrc2;
212  if (result > 255)
213  result = 255;
214  *curdst = (unsigned char) result;
215  /* Advance pointers */
216  cursrc1++;
217  cursrc2++;
218  curdst++;
219  }
220 
221  return (0);
222 }
223 
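A short, hedged example of calling SDL_imageFilterAdd (illustrative values only; the header is already included above): every destination byte is the saturated sum of the two source bytes.

void example_add(void)
{
	unsigned char a[8]    = { 10, 200, 255, 0, 128, 60, 90, 250 };
	unsigned char b[8]    = { 20, 100,   1, 0, 200, 60, 90,  10 };
	unsigned char dest[8];

	SDL_imageFilterAdd(a, b, dest, 8);
	/* dest == { 30, 255, 255, 0, 255, 120, 180, 255 } */
}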
235 static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
236  unsigned char *Mask)
237 {
238 #ifdef USE_MMX
239 #if !defined(GCC__)
240  __asm
241  {
242  pusha
243  mov edx, Mask /* load Mask address into edx */
244  movq mm0, [edx] /* load Mask into mm0 */
245  mov eax, Src1 /* load Src1 address into eax */
246  mov ebx, Src2 /* load Src2 address into ebx */
247  mov edi, Dest /* load Dest address into edi */
248  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
249  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
250  align 16 /* 16 byte alignment of the loop entry */
251 L21011:
252  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
253  movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
254  /* --- Byte shift via Word shift --- */
255  psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */
256  psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */
257  pand mm1, mm0 /* apply Mask to 8 BYTES of mm1 */
258  /* byte 0x0f, 0xdb, 0xc8 */
259  pand mm2, mm0 /* apply Mask to 8 BYTES of mm2 */
260  /* byte 0x0f, 0xdb, 0xd0 */
261  paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */
262  movq [edi], mm1 /* store result in Dest */
263  add eax, 8 /* increase Src1, Src2 and Dest */
264  add ebx, 8 /* register pointers by 8 */
265  add edi, 8
266  dec ecx /* decrease loop counter */
267  jnz L21011 /* check loop termination, proceed if required */
268  emms /* exit MMX state */
269  popa
270  }
271 #else
272  /* i386 and x86_64 */
273  __m64 *mSrc1 = (__m64*)Src1;
274  __m64 *mSrc2 = (__m64*)Src2;
275  __m64 *mDest = (__m64*)Dest;
276  __m64 *mMask = (__m64*)Mask;
277  int i;
278  for (i = 0; i < SrcLength/8; i++) {
279  __m64 mm1 = *mSrc1,
280  mm2 = *mSrc2;
281  mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of mm1 1 bit to the right */
282  mm2 = _m_psrlwi(mm2, 1); /* shift 4 WORDS of mm2 1 bit to the right */
283  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of mm1 */
284  mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of mm2 */
285  *mDest = _m_paddusb(mm1, mm2); /* mm1+mm2 (add 8 bytes with saturation) */
286  mSrc1++;
287  mSrc2++;
288  mDest++;
289  }
290  _m_empty(); /* clean MMX state */
291 #endif
292  return (0);
293 #else
294  return (-1);
295 #endif
296 }
297 
308 int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
309 {
310  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
311  unsigned int i, istart;
312  unsigned char *cursrc1, *cursrc2, *curdst;
313  int result;
314 
315  /* Validate input parameters */
316  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
317  return(-1);
318  if (length == 0)
319  return(0);
320 
321  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
322  /* MMX routine */
323  SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
324 
325  /* Check for unaligned bytes */
326  if ((length & 7) > 0) {
327  /* Setup to process unaligned bytes */
328  istart = length & 0xfffffff8;
329  cursrc1 = &Src1[istart];
330  cursrc2 = &Src2[istart];
331  curdst = &Dest[istart];
332  } else {
333  /* No unaligned bytes - we are done */
334  return (0);
335  }
336  } else {
337  /* Setup to process whole image */
338  istart = 0;
339  cursrc1 = Src1;
340  cursrc2 = Src2;
341  curdst = Dest;
342  }
343 
344  /* C routine to process image */
345  for (i = istart; i < length; i++) {
346  result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
347  *curdst = (unsigned char) result;
348  /* Advance pointers */
349  cursrc1++;
350  cursrc2++;
351  curdst++;
352  }
353 
354  return (0);
355 }
356 
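One detail worth noting (editor's note, values illustrative): both the MMX path and the C path above compute Src1/2 + Src2/2, so each operand is truncated before the addition and the result can be one less than the exact rounded mean.

unsigned char a = 3, b = 3;
unsigned char mean = (unsigned char)(a / 2 + b / 2);   /* == 2, not 3 */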
367 static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
368 {
369 #ifdef USE_MMX
370 #if !defined(GCC__)
371  __asm
372  {
373  pusha
374  mov eax, Src1 /* load Src1 address into eax */
375  mov ebx, Src2 /* load Src2 address into ebx */
376  mov edi, Dest /* load Dest address into edi */
377  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
378  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
379  align 16 /* 16 byte alignment of the loop entry */
380 L1012:
381  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
382  psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
383  movq [edi], mm1 /* store result in Dest */
384  add eax, 8 /* increase Src1, Src2 and Dest */
385  add ebx, 8 /* register pointers by 8 */
386  add edi, 8
387  dec ecx /* decrease loop counter */
388  jnz L1012 /* check loop termination, proceed if required */
389  emms /* exit MMX state */
390  popa
391  }
392 #else
393  /* i386 and x86_64 */
394  __m64 *mSrc1 = (__m64*)Src1;
395  __m64 *mSrc2 = (__m64*)Src2;
396  __m64 *mDest = (__m64*)Dest;
397  int i;
398  for (i = 0; i < SrcLength/8; i++) {
399  *mDest = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */
400  mSrc1++;
401  mSrc2++;
402  mDest++;
403  }
404  _m_empty(); /* clean MMX state */
405 #endif
406  return (0);
407 #else
408  return (-1);
409 #endif
410 }
411 
422 int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
423 {
424  unsigned int i, istart;
425  unsigned char *cursrc1, *cursrc2, *curdst;
426  int result;
427 
428  /* Validate input parameters */
429  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
430  return(-1);
431  if (length == 0)
432  return(0);
433 
434  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
435  /* MMX routine */
436  SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
437 
438  /* Check for unaligned bytes */
439  if ((length & 7) > 0) {
440  /* Setup to process unaligned bytes */
441  istart = length & 0xfffffff8;
442  cursrc1 = &Src1[istart];
443  cursrc2 = &Src2[istart];
444  curdst = &Dest[istart];
445  } else {
446  /* No unaligned bytes - we are done */
447  return (0);
448  }
449  } else {
450  /* Setup to process whole image */
451  istart = 0;
452  cursrc1 = Src1;
453  cursrc2 = Src2;
454  curdst = Dest;
455  }
456 
457  /* C routine to process image */
458  for (i = istart; i < length; i++) {
459  result = (int) *cursrc1 - (int) *cursrc2;
460  if (result < 0)
461  result = 0;
462  *curdst = (unsigned char) result;
463  /* Advance pointers */
464  cursrc1++;
465  cursrc2++;
466  curdst++;
467  }
468 
469  return (0);
470 }
471 
482 static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
483 {
484 #ifdef USE_MMX
485 #if !defined(GCC__)
486  __asm
487  {
488  pusha
489  mov eax, Src1 /* load Src1 address into eax */
490  mov ebx, Src2 /* load Src2 address into ebx */
491  mov edi, Dest /* load Dest address into edi */
492  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
493  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
494  align 16 /* 16 byte alignment of the loop entry */
495 L1013:
496  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
497  movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
498  psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
499  psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
500  por mm1, mm2 /* combine both mm2 and mm1 results */
501  movq [edi], mm1 /* store result in Dest */
502  add eax, 8 /* increase Src1, Src2 and Dest */
503  add ebx, 8 /* register pointers by 8 */
504  add edi, 8
505  dec ecx /* decrease loop counter */
506  jnz L1013 /* check loop termination, proceed if required */
507  emms /* exit MMX state */
508  popa
509  }
510 #else
511  /* i386 and x86_64 */
512  __m64 *mSrc1 = (__m64*)Src1;
513  __m64 *mSrc2 = (__m64*)Src2;
514  __m64 *mDest = (__m64*)Dest;
515  int i;
516  for (i = 0; i < SrcLength/8; i++) {
517  __m64 mm1 = _m_psubusb(*mSrc2, *mSrc1); /* Src2-Src1 (sub 8 bytes with saturation) */
518  __m64 mm2 = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */
519  *mDest = _m_por(mm1, mm2); /* combine both mm2 and mm1 results */
520  mSrc1++;
521  mSrc2++;
522  mDest++;
523  }
524  _m_empty(); /* clean MMX state */
525 #endif
526  return (0);
527 #else
528  return (-1);
529 #endif
530 }
531 
542 int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
543 {
544  unsigned int i, istart;
545  unsigned char *cursrc1, *cursrc2, *curdst;
546  int result;
547 
548  /* Validate input parameters */
549  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
550  return(-1);
551  if (length == 0)
552  return(0);
553 
554  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
555  /* MMX routine */
556  SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
557 
558  /* Check for unaligned bytes */
559  if ((length & 7) > 0) {
560  /* Setup to process unaligned bytes */
561  istart = length & 0xfffffff8;
562  cursrc1 = &Src1[istart];
563  cursrc2 = &Src2[istart];
564  curdst = &Dest[istart];
565  } else {
566  /* No unaligned bytes - we are done */
567  return (0);
568  }
569  } else {
570  /* Setup to process whole image */
571  istart = 0;
572  cursrc1 = Src1;
573  cursrc2 = Src2;
574  curdst = Dest;
575  }
576 
577  /* C routine to process image */
578  for (i = istart; i < length; i++) {
579  result = abs((int) *cursrc1 - (int) *cursrc2);
580  *curdst = (unsigned char) result;
581  /* Advance pointers */
582  cursrc1++;
583  cursrc2++;
584  curdst++;
585  }
586 
587  return (0);
588 }
589 
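The absolute-difference trick used above deserves a plain-C restatement (editor's sketch, arbitrary values): with unsigned saturated subtraction, one of (a-b) and (b-a) clamps to 0, so OR-ing the two terms yields |a-b|.

unsigned char a = 30, b = 200;
unsigned char d1 = (a > b) ? (unsigned char)(a - b) : 0;   /* saturated a-b == 0   */
unsigned char d2 = (b > a) ? (unsigned char)(b - a) : 0;   /* saturated b-a == 170 */
unsigned char absdiff = d1 | d2;                           /* 170 == |30 - 200|    */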
600 static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
601 {
602 #ifdef USE_MMX
603 #if !defined(GCC__)
604  __asm
605  {
606  pusha
607  mov eax, Src1 /* load Src1 address into eax */
608  mov ebx, Src2 /* load Src2 address into ebx */
609  mov edi, Dest /* load Dest address into edi */
610  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
611  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
612  pxor mm0, mm0 /* zero mm0 register */
613  align 16 /* 16 byte alignment of the loop entry */
614 L1014:
615  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
616  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
617  movq mm2, mm1 /* copy mm1 into mm2 */
618  movq mm4, mm3 /* copy mm3 into mm4 */
619  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
620  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
621  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
622  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
623  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
624  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
625  /* Take abs value of the results (signed words) */
626  movq mm5, mm1 /* copy mm1 into mm5 */
627  movq mm6, mm2 /* copy mm2 into mm6 */
628  psraw mm5, 15 /* fill mm5 words with word sign bit */
629  psraw mm6, 15 /* fill mm6 words with word sign bit */
630  pxor mm1, mm5 /* take 1's complement of only neg. words */
631  pxor mm2, mm6 /* take 1's complement of only neg. words */
632  psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */
633  psubsw mm2, mm6 /* add 1 to only neg. words, W-(-1) or W-0 */
634  packuswb mm1, mm2 /* pack words back into bytes with saturation */
635  movq [edi], mm1 /* store result in Dest */
636  add eax, 8 /* increase Src1, Src2 and Dest */
637  add ebx, 8 /* register pointers by 8 */
638  add edi, 8
639  dec ecx /* decrease loop counter */
640  jnz L1014 /* check loop termination, proceed if required */
641  emms /* exit MMX state */
642  popa
643  }
644 #else
645  /* i386 ASM with constraints: */
646  /* asm volatile ( */
647  /* "shr $3, %%ecx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
648  /* "pxor %%mm0, %%mm0 \n\t" /\* zero mm0 register *\/ */
649  /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
650  /* "1: movq (%%eax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
651  /* "movq (%%ebx), %%mm3 \n\t" /\* load 8 bytes from Src2 into mm3 *\/ */
652  /* "movq %%mm1, %%mm2 \n\t" /\* copy mm1 into mm2 *\/ */
653  /* "movq %%mm3, %%mm4 \n\t" /\* copy mm3 into mm4 *\/ */
654  /* "punpcklbw %%mm0, %%mm1 \n\t" /\* unpack low bytes of Src1 into words *\/ */
655  /* "punpckhbw %%mm0, %%mm2 \n\t" /\* unpack high bytes of Src1 into words *\/ */
656  /* "punpcklbw %%mm0, %%mm3 \n\t" /\* unpack low bytes of Src2 into words *\/ */
657  /* "punpckhbw %%mm0, %%mm4 \n\t" /\* unpack high bytes of Src2 into words *\/ */
658  /* "pmullw %%mm3, %%mm1 \n\t" /\* mul low bytes of Src1 and Src2 *\/ */
659  /* "pmullw %%mm4, %%mm2 \n\t" /\* mul high bytes of Src1 and Src2 *\/ */
660  /* /\* Take abs value of the results (signed words) *\/ */
661  /* "movq %%mm1, %%mm5 \n\t" /\* copy mm1 into mm5 *\/ */
662  /* "movq %%mm2, %%mm6 \n\t" /\* copy mm2 into mm6 *\/ */
663  /* "psraw $15, %%mm5 \n\t" /\* fill mm5 words with word sign bit *\/ */
664  /* "psraw $15, %%mm6 \n\t" /\* fill mm6 words with word sign bit *\/ */
665  /* "pxor %%mm5, %%mm1 \n\t" /\* take 1's complement of only neg. words *\/ */
666  /* "pxor %%mm6, %%mm2 \n\t" /\* take 1's complement of only neg. words *\/ */
667  /* "psubsw %%mm5, %%mm1 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
668  /* "psubsw %%mm6, %%mm2 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
669  /* "packuswb %%mm2, %%mm1 \n\t" /\* pack words back into bytes with saturation *\/ */
670  /* "movq %%mm1, (%%edi) \n\t" /\* store result in Dest *\/ */
671  /* "add $8, %%eax \n\t" /\* increase Src1, Src2 and Dest *\/ */
672  /* "add $8, %%ebx \n\t" /\* register pointers by 8 *\/ */
673  /* "add $8, %%edi \n\t" */
674  /* "dec %%ecx \n\t" /\* decrease loop counter *\/ */
675  /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
676  /* "emms \n\t" /\* exit MMX state *\/ */
677  /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
678  /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
679  /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
680  /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
681  /* : */
682  /* : "memory", /\* *Dest is modified *\/ */
683  /* "mm0","mm1","mm2","mm3","mm4","mm5","mm6" /\* registers modified *\/ */
684  /* ); */
685 
686  /* i386 and x86_64 */
687  __m64 *mSrc1 = (__m64*)Src1;
688  __m64 *mSrc2 = (__m64*)Src2;
689  __m64 *mDest = (__m64*)Dest;
690  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
691  int i;
692  for (i = 0; i < SrcLength/8; i++) {
693  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
694  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
695  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
696  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
697  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
698  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
699  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
700  mm5 = _m_psrawi(mm1, 15); /* fill mm5 words with word sign bit */
701  mm6 = _m_psrawi(mm2, 15); /* fill mm6 words with word sign bit */
702  mm1 = _m_pxor(mm1, mm5); /* take 1's complement of only neg. words */
703  mm2 = _m_pxor(mm2, mm6); /* take 1's complement of only neg. words */
704  mm1 = _m_psubsw(mm1, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
705  mm2 = _m_psubsw(mm2, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
706  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
707  mSrc1++;
708  mSrc2++;
709  mDest++;
710  }
711  _m_empty(); /* clean MMX state */
712 #endif
713  return (0);
714 #else
715  return (-1);
716 #endif
717 }
718 
729 int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
730 {
731  unsigned int i, istart;
732  unsigned char *cursrc1, *cursrc2, *curdst;
733  int result;
734 
735  /* Validate input parameters */
736  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
737  return(-1);
738  if (length == 0)
739  return(0);
740 
741  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
742  /* MMX routine */
743  SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
744 
745  /* Check for unaligned bytes */
746  if ((length & 7) > 0) {
747  /* Setup to process unaligned bytes */
748  istart = length & 0xfffffff8;
749  cursrc1 = &Src1[istart];
750  cursrc2 = &Src2[istart];
751  curdst = &Dest[istart];
752  } else {
753  /* No unaligned bytes - we are done */
754  return (0);
755  }
756  } else {
757  /* Setup to process whole image */
758  istart = 0;
759  cursrc1 = Src1;
760  cursrc2 = Src2;
761  curdst = Dest;
762  }
763 
764  /* C routine to process image */
765  for (i = istart; i < length; i++) {
766 
767  /* NOTE: consistent with the MMX routine above - the 16-bit word product is packed with unsigned saturation, so it clamps at 255 */
768 
769  result = (int) *cursrc1 * (int) *cursrc2;
770  if (result > 255)
771  result = 255;
772  *curdst = (unsigned char) result;
773  /* Advance pointers */
774  cursrc1++;
775  cursrc2++;
776  curdst++;
777  }
778 
779  return (0);
780 }
781 
792 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
793 {
794 #ifdef USE_MMX
795 #if !defined(GCC__)
796  __asm
797  {
798  pusha
799  mov edx, Src1 /* load Src1 address into edx */
800  mov esi, Src2 /* load Src2 address into esi */
801  mov edi, Dest /* load Dest address into edi */
802  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
803  align 16 /* 16 byte alignment of the loop entry */
804 L10141:
805  mov al, [edx] /* load a byte from Src1 */
806  mul byte ptr [esi] /* mul with a byte from Src2 */
807  mov [edi], al /* move a byte result to Dest */
808  inc edx /* increment Src1, Src2, Dest */
809  inc esi /* pointer registers by one */
810  inc edi
811  dec ecx /* decrease loop counter */
812  jnz L10141 /* check loop termination, proceed if required */
813  popa
814  }
815 #else
816  /* Note: ~5% gain on i386, less efficient than C on x86_64 */
817  /* Also depends on whether this function is static (?!) */
818  asm volatile (
819  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
820 # if defined(i386)
821  "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */
822  "mulb (%%esi) \n\t" /* mul with a byte from Src2 */
823  "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
824  "inc %%edx \n\t" /* increment Src1, Src2, Dest */
825  "inc %%esi \n\t" /* pointer registers by one */
826  "inc %%edi \n\t"
827  "dec %%ecx \n\t" /* decrease loop counter */
828 # elif defined(__x86_64__)
829  "1:mov (%%rdx), %%al \n\t" /* load a byte from Src1 */
830  "mulb (%%rsi) \n\t" /* mul with a byte from Src2 */
831  "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
832  "inc %%rdx \n\t" /* increment Src1, Src2, Dest */
833  "inc %%rsi \n\t" /* pointer registers by one */
834  "inc %%rdi \n\t"
835  "dec %%rcx \n\t" /* decrease loop counter */
836 # endif
837  "jnz 1b \n\t" /* check loop termination, proceed if required */
838  : "+d" (Src1), /* load Src1 address into edx */
839  "+S" (Src2), /* load Src2 address into esi */
840  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
841  "+D" (Dest) /* load Dest address into edi */
842  :
843  : "memory", "rax"
844  );
845 #endif
846  return (0);
847 #else
848  return (-1);
849 #endif
850 }
851 
862 int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
863 {
864  unsigned int i, istart;
865  unsigned char *cursrc1, *cursrc2, *curdst;
866 
867  /* Validate input parameters */
868  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
869  return(-1);
870  if (length == 0)
871  return(0);
872 
873  if (SDL_imageFilterMMXdetect()) {
874  if (length > 0) {
875  /* ASM routine */
876  SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
877 
878  /* Check for unaligned bytes */
879  if ((length & 7) > 0) {
880  /* Setup to process unaligned bytes */
881  istart = length & 0xfffffff8;
882  cursrc1 = &Src1[istart];
883  cursrc2 = &Src2[istart];
884  curdst = &Dest[istart];
885  } else {
886  /* No unaligned bytes - we are done */
887  return (0);
888  }
889  } else {
890  /* No bytes - we are done */
891  return (0);
892  }
893  } else {
894  /* Setup to process whole image */
895  istart = 0;
896  cursrc1 = Src1;
897  cursrc2 = Src2;
898  curdst = Dest;
899  }
900 
901  /* C routine to process image */
902  for (i = istart; i < length; i++) {
903  *curdst = (int)*cursrc1 * (int)*cursrc2; // (int) for efficiency
904  /* Advance pointers */
905  cursrc1++;
906  cursrc2++;
907  curdst++;
908  }
909 
910  return (0);
911 }
912 
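To contrast the two multiply variants (editor's sketch with arbitrary values): SDL_imageFilterMult clamps each per-byte product at 255, while SDL_imageFilterMultNor above stores only the low 8 bits, so the product wraps.

unsigned char x = 20, y = 20;                                               /* product is 400 */
unsigned char clamped = (unsigned char)((x * y > 255) ? 255 : x * y);      /* Mult:    255            */
unsigned char wrapped = (unsigned char)(x * y);                            /* MultNor: 400 & 0xFF == 144 */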
923 static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
924 {
925 #ifdef USE_MMX
926 #if !defined(GCC__)
927  __asm
928  {
929  pusha
930  mov eax, Src1 /* load Src1 address into eax */
931  mov ebx, Src2 /* load Src2 address into ebx */
932  mov edi, Dest /* load Dest address into edi */
933  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
934  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
935  pxor mm0, mm0 /* zero mm0 register */
936  align 16 /* 16 byte alignment of the loop entry */
937 L1015:
938  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
939  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
940  movq mm2, mm1 /* copy mm1 into mm2 */
941  movq mm4, mm3 /* copy mm3 into mm4 */
942  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
943  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
944  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
945  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
946  psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
947  psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
948  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
949  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
950  packuswb mm1, mm2 /* pack words back into bytes with saturation */
951  movq [edi], mm1 /* store result in Dest */
952  add eax, 8 /* increase Src1, Src2 and Dest */
953  add ebx, 8 /* register pointers by 8 */
954  add edi, 8
955  dec ecx /* decrease loop counter */
956  jnz L1015 /* check loop termination, proceed if required */
957  emms /* exit MMX state */
958  popa
959  }
960 #else
961  /* i386 and x86_64 */
962  __m64 *mSrc1 = (__m64*)Src1;
963  __m64 *mSrc2 = (__m64*)Src2;
964  __m64 *mDest = (__m64*)Dest;
965  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
966  int i;
967  for (i = 0; i < SrcLength/8; i++) {
968  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
969  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
970  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
971  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
972  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
973  mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
974  mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
975  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
976  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
977  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
978  mSrc1++;
979  mSrc2++;
980  mDest++;
981  }
982  _m_empty(); /* clean MMX state */
983 #endif
984  return (0);
985 #else
986  return (-1);
987 #endif
988 }
989 
1000 int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1001 {
1002  unsigned int i, istart;
1003  unsigned char *cursrc1, *cursrc2, *curdst;
1004  int result;
1005 
1006  /* Validate input parameters */
1007  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1008  return(-1);
1009  if (length == 0)
1010  return(0);
1011 
1012  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1013  /* MMX routine */
1014  SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
1015 
1016  /* Check for unaligned bytes */
1017  if ((length & 7) > 0) {
1018  /* Setup to process unaligned bytes */
1019  istart = length & 0xfffffff8;
1020  cursrc1 = &Src1[istart];
1021  cursrc2 = &Src2[istart];
1022  curdst = &Dest[istart];
1023  } else {
1024  /* No unaligned bytes - we are done */
1025  return (0);
1026  }
1027  } else {
1028  /* Setup to process whole image */
1029  istart = 0;
1030  cursrc1 = Src1;
1031  cursrc2 = Src2;
1032  curdst = Dest;
1033  }
1034 
1035  /* C routine to process image */
1036  for (i = istart; i < length; i++) {
1037  result = ((int) *cursrc1 / 2) * (int) *cursrc2;
1038  if (result > 255)
1039  result = 255;
1040  *curdst = (unsigned char) result;
1041  /* Advance pointers */
1042  cursrc1++;
1043  cursrc2++;
1044  curdst++;
1045  }
1046 
1047  return (0);
1048 }
1049 
1060 static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1061 {
1062 #ifdef USE_MMX
1063 #if !defined(GCC__)
1064  __asm
1065  {
1066  pusha
1067  mov eax, Src1 /* load Src1 address into eax */
1068  mov ebx, Src2 /* load Src2 address into ebx */
1069  mov edi, Dest /* load Dest address into edi */
1070  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1071  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1072  pxor mm0, mm0 /* zero mm0 register */
1073  align 16 /* 16 byte alignment of the loop entry */
1074 L1016:
1075  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1076  movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
1077  movq mm2, mm1 /* copy mm1 into mm2 */
1078  movq mm4, mm3 /* copy mm3 into mm4 */
1079  punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
1080  punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
1081  punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
1082  punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
1083  psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
1084  psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
1085  psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */
1086  psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */
1087  pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
1088  pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
1089  packuswb mm1, mm2 /* pack words back into bytes with saturation */
1090  movq [edi], mm1 /* store result in Dest */
1091  add eax, 8 /* increase Src1, Src2 and Dest */
1092  add ebx, 8 /* register pointers by 8 */
1093  add edi, 8
1094  dec ecx /* decrease loop counter */
1095  jnz L1016 /* check loop termination, proceed if required */
1096  emms /* exit MMX state */
1097  popa
1098  }
1099 #else
1100  /* i386 and x86_64 */
1101  __m64 *mSrc1 = (__m64*)Src1;
1102  __m64 *mSrc2 = (__m64*)Src2;
1103  __m64 *mDest = (__m64*)Dest;
1104  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
1105  int i;
1106  for (i = 0; i < SrcLength/8; i++) {
1107  __m64 mm1, mm2, mm3, mm4, mm5, mm6;
1108  mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
1109  mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
1110  mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
1111  mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
1112  mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
1113  mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
1114  mm3 = _m_psrlwi(mm3, 1); /* divide mm3 words by 2, Src2 low bytes */
1115  mm4 = _m_psrlwi(mm4, 1); /* divide mm4 words by 2, Src2 high bytes */
1116  mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
1117  mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
1118  *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
1119  mSrc1++;
1120  mSrc2++;
1121  mDest++;
1122  }
1123  _m_empty(); /* clean MMX state */
1124 #endif
1125  return (0);
1126 #else
1127  return (-1);
1128 #endif
1129 }
1130 
1141 int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1142 {
1143  unsigned int i, istart;
1144  unsigned char *cursrc1, *cursrc2, *curdst;
1145  int result;
1146 
1147  /* Validate input parameters */
1148  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1149  return(-1);
1150  if (length == 0)
1151  return(0);
1152 
1153  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1154  /* MMX routine */
1155  SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
1156 
1157  /* Check for unaligned bytes */
1158  if ((length & 7) > 0) {
1159  /* Setup to process unaligned bytes */
1160  istart = length & 0xfffffff8;
1161  cursrc1 = &Src1[istart];
1162  cursrc2 = &Src2[istart];
1163  curdst = &Dest[istart];
1164  } else {
1165  /* No unaligned bytes - we are done */
1166  return (0);
1167  }
1168  } else {
1169  /* Setup to process whole image */
1170  istart = 0;
1171  cursrc1 = Src1;
1172  cursrc2 = Src2;
1173  curdst = Dest;
1174  }
1175 
1176  /* C routine to process image */
1177  for (i = istart; i < length; i++) {
1178  result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
1179  if (result > 255)
1180  result = 255;
1181  *curdst = (unsigned char) result;
1182  /* Advance pointers */
1183  cursrc1++;
1184  cursrc2++;
1185  curdst++;
1186  }
1187 
1188  return (0);
1189 }
1190 
1201 static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1202 {
1203 #ifdef USE_MMX
1204 #if !defined(GCC__)
1205  __asm
1206  {
1207  pusha
1208  mov eax, Src1 /* load Src1 address into eax */
1209  mov ebx, Src2 /* load Src2 address into ebx */
1210  mov edi, Dest /* load Dest address into edi */
1211  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1212  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1213  align 16 /* 16 byte alignment of the loop entry */
1214 L1017:
1215  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1216  pand mm1, [ebx] /* mm1=Src1&Src2 */
1217  movq [edi], mm1 /* store result in Dest */
1218  add eax, 8 /* increase Src1, Src2 and Dest */
1219  add ebx, 8 /* register pointers by 8 */
1220  add edi, 8
1221  dec ecx /* decrease loop counter */
1222  jnz L1017 /* check loop termination, proceed if required */
1223  emms /* exit MMX state */
1224  popa
1225  }
1226 #else
1227  /* x86_64 ASM with constraints: */
1228  /* asm volatile ( */
1229  /* "shr $3, %%rcx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
1230  /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
1231  /* "1: movq (%%rax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
1232  /* "pand (%%rbx), %%mm1 \n\t" /\* mm1=Src1&Src2 *\/ */
1233  /* "movq %%mm1, (%%rdi) \n\t" /\* store result in Dest *\/ */
1234  /* "add $8, %%rax \n\t" /\* increase Src1, Src2 and Dest *\/ */
1235  /* "add $8, %%rbx \n\t" /\* register pointers by 8 *\/ */
1236  /* "add $8, %%rdi \n\t" */
1237  /* "dec %%rcx \n\t" /\* decrease loop counter *\/ */
1238  /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
1239  /* "emms \n\t" /\* exit MMX state *\/ */
1240  /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
1241  /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
1242  /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
1243  /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
1244  /* : */
1245  /* : "memory", /\* *Dest is modified *\/ */
1246  /* "mm1" /\* register mm1 modified *\/ */
1247  /* ); */
1248 
1249  /* i386 and x86_64 */
1250  __m64 *mSrc1 = (__m64*)Src1;
1251  __m64 *mSrc2 = (__m64*)Src2;
1252  __m64 *mDest = (__m64*)Dest;
1253  int i;
1254  for (i = 0; i < SrcLength/8; i++) {
1255  *mDest = _m_pand(*mSrc1, *mSrc2); /* Src1&Src2 */
1256  mSrc1++;
1257  mSrc2++;
1258  mDest++;
1259  }
1260  _m_empty(); /* clean MMX state */
1261 #endif
1262  return (0);
1263 #else
1264  return (-1);
1265 #endif
1266 }
1267 
1278 int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1279 {
1280  unsigned int i, istart;
1281  unsigned char *cursrc1, *cursrc2, *curdst;
1282 
1283  /* Validate input parameters */
1284  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1285  return(-1);
1286  if (length == 0)
1287  return(0);
1288 
1289  if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
1290  /* if (length > 7) { */
1291  /* Call MMX routine */
1292 
1293  SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
1294 
1295  /* Check for unaligned bytes */
1296  if ((length & 7) > 0) {
1297 
1298  /* Setup to process unaligned bytes */
1299  istart = length & 0xfffffff8;
1300  cursrc1 = &Src1[istart];
1301  cursrc2 = &Src2[istart];
1302  curdst = &Dest[istart];
1303  } else {
1304  /* No unaligned bytes - we are done */
1305  return (0);
1306  }
1307  } else {
1308  /* Setup to process whole image */
1309  istart = 0;
1310  cursrc1 = Src1;
1311  cursrc2 = Src2;
1312  curdst = Dest;
1313  }
1314 
1315  /* C routine to process image */
1316  for (i = istart; i < length; i++) {
1317  *curdst = (*cursrc1) & (*cursrc2);
1318  /* Advance pointers */
1319  cursrc1++;
1320  cursrc2++;
1321  curdst++;
1322  }
1323 
1324  return (0);
1325 }
1326 
1337 static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1338 {
1339 #ifdef USE_MMX
1340 #if !defined(GCC__)
1341  __asm
1342  {
1343  pusha
1344  mov eax, Src1 /* load Src1 address into eax */
1345  mov ebx, Src2 /* load Src2 address into ebx */
1346  mov edi, Dest /* load Dest address into edi */
1347  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1348  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1349  align 16 /* 16 byte alignment of the loop entry */
1350 L91017:
1351  movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1352  por mm1, [ebx] /* mm1=Src1|Src2 */
1353  movq [edi], mm1 /* store result in Dest */
1354  add eax, 8 /* increase Src1, Src2 and Dest */
1355  add ebx, 8 /* register pointers by 8 */
1356  add edi, 8
1357  dec ecx /* decrease loop counter */
1358  jnz L91017 /* check loop termination, proceed if required */
1359  emms /* exit MMX state */
1360  popa
1361  }
1362 #else
1363  /* i386 and x86_64 */
1364  __m64 *mSrc1 = (__m64*)Src1;
1365  __m64 *mSrc2 = (__m64*)Src2;
1366  __m64 *mDest = (__m64*)Dest;
1367  int i;
1368  for (i = 0; i < SrcLength/8; i++) {
1369  *mDest = _m_por(*mSrc1, *mSrc2); /* Src1|Src2 */
1370  mSrc1++;
1371  mSrc2++;
1372  mDest++;
1373  }
1374  _m_empty(); /* clean MMX state */
1375 #endif
1376  return (0);
1377 #else
1378  return (-1);
1379 #endif
1380 }
1381 
1392 int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1393 {
1394  unsigned int i, istart;
1395  unsigned char *cursrc1, *cursrc2, *curdst;
1396 
1397  /* Validate input parameters */
1398  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1399  return(-1);
1400  if (length == 0)
1401  return(0);
1402 
1403  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1404 
1405  /* MMX routine */
1406  SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
1407 
1408  /* Check for unaligned bytes */
1409  if ((length & 7) > 0) {
1410  /* Setup to process unaligned bytes */
1411  istart = length & 0xfffffff8;
1412  cursrc1 = &Src1[istart];
1413  cursrc2 = &Src2[istart];
1414  curdst = &Dest[istart];
1415  } else {
1416  /* No unaligned bytes - we are done */
1417  return (0);
1418  }
1419  } else {
1420  /* Setup to process whole image */
1421  istart = 0;
1422  cursrc1 = Src1;
1423  cursrc2 = Src2;
1424  curdst = Dest;
1425  }
1426 
1427  /* C routine to process image */
1428  for (i = istart; i < length; i++) {
1429  *curdst = *cursrc1 | *cursrc2;
1430  /* Advance pointers */
1431  cursrc1++;
1432  cursrc2++;
1433  curdst++;
1434  }
1435  return (0);
1436 }
1437 
1448 static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1449 {
1450 #ifdef USE_MMX
1451 #if !defined(GCC__)
1452  __asm
1453  {
1454  pusha
1455  mov edx, Src1 /* load Src1 address into edx */
1456  mov esi, Src2 /* load Src2 address into esi */
1457  mov edi, Dest /* load Dest address into edi */
1458  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1459  align 16 /* 16 byte alignment of the loop entry */
1460 L10191:
1461  mov bl, [esi] /* load a byte from Src2 */
1462  cmp bl, 0 /* check if it zero */
1463  jnz L10192
1464  mov byte ptr [edi], 255 /* division by zero = 255 !!! */
1465  jmp L10193
1466 L10192:
1467  xor ah, ah /* prepare AX, zero AH register */
1468  mov al, [edx] /* load a byte from Src1 into AL */
1469  div bl /* divide AL by BL */
1470  mov [edi], al /* move a byte result to Dest */
1471 L10193:
1472  inc edx /* increment Src1, Src2, Dest */
1473  inc esi /* pointer registers by one */
1474  inc edi
1475  dec ecx /* decrease loop counter */
1476  jnz L10191 /* check loop termination, proceed if required */
1477  popa
1478  }
1479 #else
1480  /* Note: ~15% gain on i386, less efficient than C on x86_64 */
1481  /* Also depends on whether the function is static (?!) */
1482  /* Also depends on whether we work on malloc() or static char[] */
1483  asm volatile (
1484 # if defined(i386)
1485  "pushl %%ebx \n\t" /* %ebx may be the PIC register. */
1486  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1487  "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
1488  "cmp $0, %%bl \n\t" /* check if it zero */
1489  "jnz 2f \n\t"
1490  "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
1491  "jmp 3f \n\t"
1492  "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1493  "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */
1494  "div %%bl \n\t" /* divide AL by BL */
1495  "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
1496  "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */
1497  "inc %%esi \n\t" /* pointer registers by one */
1498  "inc %%edi \n\t"
1499  "dec %%ecx \n\t" /* decrease loop counter */
1500  "jnz 1b \n\t" /* check loop termination, proceed if required */
1501  "popl %%ebx \n\t" /* restore %ebx */
1502  : "+d" (Src1), /* load Src1 address into edx */
1503  "+S" (Src2), /* load Src2 address into esi */
1504  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1505  "+D" (Dest) /* load Dest address into edi */
1506  :
1507  : "memory", "rax"
1508 # elif defined(__x86_64__)
1509  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1510  "1: mov (%%rsi), %%bl \n\t" /* load a byte from Src2 */
1511  "cmp $0, %%bl \n\t" /* check if it zero */
1512  "jnz 2f \n\t"
1513  "movb $255, (%%rdi) \n\t" /* division by zero = 255 !!! */
1514  "jmp 3f \n\t"
1515  "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1516  "mov (%%rdx), %%al \n\t" /* load a byte from Src1 into AL */
1517  "div %%bl \n\t" /* divide AL by BL */
1518  "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
1519  "3: inc %%rdx \n\t" /* increment Src1, Src2, Dest */
1520  "inc %%rsi \n\t" /* pointer registers by one */
1521  "inc %%rdi \n\t"
1522  "dec %%rcx \n\t" /* decrease loop counter */
1523  "jnz 1b \n\t" /* check loop termination, proceed if required */
1524  : "+d" (Src1), /* load Src1 address into edx */
1525  "+S" (Src2), /* load Src2 address into esi */
1526  "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1527  "+D" (Dest) /* load Dest address into edi */
1528  :
1529  : "memory", "rax", "rbx"
1530 # endif
1531  );
1532 #endif
1533  return (0);
1534 #else
1535  return (-1);
1536 #endif
1537 }
1538 
1549 int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1550 {
1551  unsigned int i, istart;
1552  unsigned char *cursrc1, *cursrc2, *curdst;
1553 
1554  /* Validate input parameters */
1555  if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1556  return(-1);
1557  if (length == 0)
1558  return(0);
1559 
1560  if (SDL_imageFilterMMXdetect()) {
1561  if (length > 0) {
1562  /* Call ASM routine */
1563  SDL_imageFilterDivASM(Src1, Src2, Dest, length);
1564 
1565  /* Never unaligned bytes - we are done */
1566  return (0);
1567  } else {
1568  return (-1);
1569  }
1570  }
1571 
1572  /* Setup to process whole image */
1573  istart = 0;
1574  cursrc1 = Src1;
1575  cursrc2 = Src2;
1576  curdst = Dest;
1577 
1578  /* C routine to process image */
1579  /* for (i = istart; i < length; i++) { */
1580  /* if (*cursrc2 == 0) { */
1581  /* *curdst = 255; */
1582  /* } else { */
1583  /* result = (int) *cursrc1 / (int) *cursrc2; */
1584  /* *curdst = (unsigned char) result; */
1585  /* } */
1586  /* /\* Advance pointers *\/ */
1587  /* cursrc1++; */
1588  /* cursrc2++; */
1589  /* curdst++; */
1590  /* } */
1591  for (i = istart; i < length; i++) {
1592  if (*cursrc2 == 0) {
1593  *curdst = 255;
1594  } else {
1595  *curdst = (int)*cursrc1 / (int)*cursrc2; // (int) for efficiency
1596  }
1597  /* Advance pointers */
1598  cursrc1++;
1599  cursrc2++;
1600  curdst++;
1601  }
1602 
1603  return (0);
1604 }
1605 
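A brief, illustrative call of SDL_imageFilterDiv (editor's example, arbitrary values): integer division per byte, with the convention shown above that a zero divisor produces 255.

void example_div(void)
{
	unsigned char num[4]  = { 100, 100, 7, 255 };
	unsigned char den[4]  = {  10,   0, 2,   4 };
	unsigned char dest[4];

	SDL_imageFilterDiv(num, den, dest, 4);
	/* dest == { 10, 255, 3, 63 } */
}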
1606 /* ------------------------------------------------------------------------------------ */
1607 
1617 static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
1618 {
1619 #ifdef USE_MMX
1620 #if !defined(GCC__)
1621  __asm
1622  {
1623  pusha
1624  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
1625  mov eax, Src1 /* load Src1 address into eax */
1626  mov edi, Dest /* load Dest address into edi */
1627  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1628  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1629  align 16 /* 16 byte alignment of the loop entry */
1630 L91117:
1631  movq mm0, [eax] /* load 8 bytes from Src1 into mm0 */
1632  pxor mm0, mm1 /* negate mm0 by xoring with mm1 */
1633  movq [edi], mm0 /* store result in Dest */
1634  add eax, 8 /* increase Src1 and Dest */
1635  add edi, 8
1636  dec ecx /* decrease loop counter */
1637  jnz L91117 /* check loop termination, proceed if required */
1638  emms /* exit MMX state */
1639  popa
1640  }
1641 #else
1642  /* i386 and x86_64 */
1643  __m64 *mSrc1 = (__m64*)Src1;
1644  __m64 *mDest = (__m64*)Dest;
1645  __m64 mm1 = _m_from_int(0);
1646  mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 (every byte of 0 equals itself) */
1647  int i;
1648  for (i = 0; i < SrcLength/8; i++) {
1649  *mDest = _m_pxor(*mSrc1, mm1); /* negate Src1 bytes by xoring with all-1's mm1 */
1650  mSrc1++;
1651  mDest++;
1652  }
1653  _m_empty(); /* clean MMX state */
1654 
1655 #endif
1656  return (0);
1657 #else
1658  return (-1);
1659 #endif
1660 }
1661 
1671 int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
1672 {
1673  unsigned int i, istart;
1674  unsigned char *cursrc1, *curdst;
1675 
1676  /* Validate input parameters */
1677  if ((Src1 == NULL) || (Dest == NULL))
1678  return(-1);
1679  if (length == 0)
1680  return(0);
1681 
1682  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1683  /* MMX routine */
1684  SDL_imageFilterBitNegationMMX(Src1, Dest, length);
1685 
1686  /* Check for unaligned bytes */
1687  if ((length & 7) > 0) {
1688  /* Setup to process unaligned bytes */
1689  istart = length & 0xfffffff8;
1690  cursrc1 = &Src1[istart];
1691  curdst = &Dest[istart];
1692  } else {
1693  /* No unaligned bytes - we are done */
1694  return (0);
1695  }
1696  } else {
1697  /* Setup to process whole image */
1698  istart = 0;
1699  cursrc1 = Src1;
1700  curdst = Dest;
1701  }
1702 
1703  /* C routine to process image */
1704  for (i = istart; i < length; i++) {
1705  *curdst = ~(*cursrc1);
1706  /* Advance pointers */
1707  cursrc1++;
1708  curdst++;
1709  }
1710 
1711  return (0);
1712 }
1713 
1724 static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
1725 {
1726 #ifdef USE_MMX
1727 #if !defined(GCC__)
1728  __asm
1729  {
1730  pusha
1731  /* ** Duplicate C in 8 bytes of MM1 ** */
1732  mov al, C /* load C into AL */
1733  mov ah, al /* copy AL into AH */
1734  mov bx, ax /* copy AX into BX */
1735  shl eax, 16 /* shift 2 bytes of EAX left */
1736  mov ax, bx /* copy BX into AX */
1737  movd mm1, eax /* copy EAX into MM1 */
1738  movd mm2, eax /* copy EAX into MM2 */
1739  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1740  mov eax, Src1 /* load Src1 address into eax */
1741  mov edi, Dest /* load Dest address into edi */
1742  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1743  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1744  align 16 /* 16 byte alignment of the loop entry */
1745 L1021:
1746  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
1747  paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1748  movq [edi], mm0 /* store result in Dest */
1749  add eax, 8 /* increase Src1 register pointer by 8 */
1750  add edi, 8 /* increase Dest register pointer by 8 */
1751  dec ecx /* decrease loop counter */
1752  jnz L1021 /* check loop termination, proceed if required */
1753  emms /* exit MMX state */
1754  popa
1755  }
1756 #else
1757  /* i386 and x86_64 */
1758  __m64 *mSrc1 = (__m64*)Src1;
1759  __m64 *mDest = (__m64*)Dest;
1760  /* Duplicate C in 8 bytes of MM1 */
1761  int i;
1762  memset(&i, C, 4);
1763  __m64 mm1 = _m_from_int(i);
1764  __m64 mm2 = _m_from_int(i);
1765  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1766  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1767  for (i = 0; i < SrcLength/8; i++) {
1768  *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1769  mSrc1++;
1770  mDest++;
1771  }
1772  _m_empty(); /* clean MMX state */
1773 #endif
1774  return (0);
1775 #else
1776  return (-1);
1777 #endif
1778 }
1779 
1791 int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
1792 {
1793  unsigned int i, istart;
1794  int iC;
1795  unsigned char *cursrc1, *curdest;
1796  int result;
1797 
1798  /* Validate input parameters */
1799  if ((Src1 == NULL) || (Dest == NULL))
1800  return(-1);
1801  if (length == 0)
1802  return(0);
1803 
1804  /* Special case: C==0 */
1805  if (C == 0) {
1806  memcpy(Dest, Src1, length);
1807  return (0);
1808  }
1809 
1810  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1811 
1812  /* MMX routine */
1813  SDL_imageFilterAddByteMMX(Src1, Dest, length, C);
1814 
1815  /* Check for unaligned bytes */
1816  if ((length & 7) > 0) {
1817  /* Setup to process unaligned bytes */
1818  istart = length & 0xfffffff8;
1819  cursrc1 = &Src1[istart];
1820  curdest = &Dest[istart];
1821  } else {
1822  /* No unaligned bytes - we are done */
1823  return (0);
1824  }
1825  } else {
1826  /* Setup to process whole image */
1827  istart = 0;
1828  cursrc1 = Src1;
1829  curdest = Dest;
1830  }
1831 
1832  /* C routine to process image */
1833  iC = (int) C;
1834  for (i = istart; i < length; i++) {
1835  result = (int) *cursrc1 + iC;
1836  if (result > 255)
1837  result = 255;
1838  *curdest = (unsigned char) result;
1839  /* Advance pointers */
1840  cursrc1++;
1841  curdest++;
1842  }
1843  return (0);
1844 }
1845 
1857 static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
1858 {
1859 #ifdef USE_MMX
1860 #if !defined(GCC__)
1861  __asm
1862  {
1863  pusha
1864  /* ** Duplicate (int)C in 8 bytes of MM1 ** */
1865  mov eax, C /* load C into EAX */
1866  movd mm1, eax /* copy EAX into MM1 */
1867  mov eax, D /* load D into EAX */
1868  movd mm2, eax /* copy EAX into MM2 */
1869  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1870  mov eax, Src1 /* load Src1 address into eax */
1871  mov edi, Dest /* load Dest address into edi */
1872  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1873  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1874  align 16 /* 16 byte alignment of the loop entry */
1875 L11023:
1876  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
1877  paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1878  movq [edi], mm0 /* store result in SrcDest */
1879  add eax, 8 /* increase Src1 register pointer by 8 */
1880  add edi, 8 /* increase Dest register pointer by 8 */
1881  dec ecx /* decrease loop counter */
1882  jnz L11023 /* check loop termination, proceed if required */
1883  emms /* exit MMX state */
1884  popa
1885  }
1886 #else
1887  /* i386 and x86_64 */
1888  __m64 *mSrc1 = (__m64*)Src1;
1889  __m64 *mDest = (__m64*)Dest;
1890  /* Duplicate (int)C in 8 bytes of MM1 */
1891  __m64 mm1 = _m_from_int(C);
1892  __m64 mm2 = _m_from_int(C);
1893  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1894  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1895  int i;
1896  for (i = 0; i < SrcLength/8; i++) {
1897  *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1898  mSrc1++;
1899  mDest++;
1900  }
1901  _m_empty(); /* clean MMX state */
1902 #endif
1903  return (0);
1904 #else
1905  return (-1);
1906 #endif
1907 }
1908 
1919 int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
1920 {
1921  unsigned int i, j, istart, D;
1922  int iC[4];
1923  unsigned char *cursrc1;
1924  unsigned char *curdest;
1925  int result;
1926 
1927  /* Validate input parameters */
1928  if ((Src1 == NULL) || (Dest == NULL))
1929  return(-1);
1930  if (length == 0)
1931  return(0);
1932 
1933  /* Special case: C==0 */
1934  if (C == 0) {
1935  memcpy(Dest, Src1, length);
1936  return (0);
1937  }
1938 
1939  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1940 
1941  /* MMX routine */
1942  D=SWAP_32(C);
1943  SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);
1944 
1945  /* Check for unaligned bytes */
1946  if ((length & 7) > 0) {
1947  /* Setup to process unaligned bytes */
1948  istart = length & 0xfffffff8;
1949  cursrc1 = &Src1[istart];
1950  curdest = &Dest[istart];
1951  } else {
1952  /* No unaligned bytes - we are done */
1953  return (0);
1954  }
1955  } else {
1956  /* Setup to process whole image */
1957  istart = 0;
1958  cursrc1 = Src1;
1959  curdest = Dest;
1960  }
1961 
1962  /* C routine to process bytes */
1963  iC[3] = (int) ((C >> 24) & 0xff);
1964  iC[2] = (int) ((C >> 16) & 0xff);
1965  iC[1] = (int) ((C >> 8) & 0xff);
1966  iC[0] = (int) ((C >> 0) & 0xff);
1967  for (i = istart; i < length; i += 4) {
1968  for (j = 0; j < 4; j++) {
1969  if ((i+j)<length) {
1970  result = (int) *cursrc1 + iC[j];
1971  if (result > 255) result = 255;
1972  *curdest = (unsigned char) result;
1973  /* Advance pointers */
1974  cursrc1++;
1975  curdest++;
1976  }
1977  }
1978  }
1979  return (0);
1980 }
1981 
1993 static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
1994  unsigned char *Mask)
1995 {
1996 #ifdef USE_MMX
1997 #if !defined(GCC__)
1998  __asm
1999  {
2000  pusha
2001  /* ** Duplicate C in 8 bytes of MM1 ** */
2002  mov al, C /* load C into AL */
2003  mov ah, al /* copy AL into AH */
2004  mov bx, ax /* copy AX into BX */
2005  shl eax, 16 /* shift 2 bytes of EAX left */
2006  mov ax, bx /* copy BX into AX */
2007  movd mm1, eax /* copy EAX into MM1 */
2008  movd mm2, eax /* copy EAX into MM2 */
2009  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2010  mov edx, Mask /* load Mask address into edx */
2011  movq mm0, [edx] /* load Mask into mm0 */
2012  mov eax, Src1 /* load Src1 address into eax */
2013  mov edi, Dest /* load Dest address into edi */
2014  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2015  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2016  align 16 /* 16 byte alignment of the loop entry */
2017 L1022:
2018  movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */
2019  psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */
2020  pand mm2, mm0 /* apply Mask to 8 BYTES of MM2 */
2021  paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */
2022  movq [edi], mm2 /* store result in Dest */
2023  add eax, 8 /* increase Src1 register pointer by 8 */
2024  add edi, 8 /* increase Dest register pointer by 8 */
2025  dec ecx /* decrease loop counter */
2026  jnz L1022 /* check loop termination, proceed if required */
2027  emms /* exit MMX state */
2028  popa
2029  }
2030 #else
2031  /* i386 and x86_64 */
2032  __m64 *mSrc1 = (__m64*)Src1;
2033  __m64 *mDest = (__m64*)Dest;
2034  __m64 *mMask = (__m64*)Mask;
2035  /* Duplicate C in 8 bytes of MM1 */
2036  int i;
2037  memset(&i, C, 4);
2038  __m64 mm1 = _m_from_int(i);
2039  __m64 mm2 = _m_from_int(i);
2040  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2041  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2042  for (i = 0; i < SrcLength/8; i++) {
2043  __m64 mm2 = _m_psrlwi(*mSrc1, 1); /* shift 4 WORDS of MM2 1 bit to the right */
2044  mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of MM2 */
2045  /* byte 0x0f, 0xdb, 0xd0 */
2046  *mDest = _m_paddusb(mm1, mm2); /* Src1+C (add 8 bytes with saturation) */
2047  mSrc1++;
2048  mDest++;
2049  }
2050  _m_empty(); /* clean MMX state */
2051 #endif
2052  return (0);
2053 #else
2054  return (-1);
2055 #endif
2056 }
2057 
2068 int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2069 {
2070  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2071  unsigned int i, istart;
2072  int iC;
2073  unsigned char *cursrc1;
2074  unsigned char *curdest;
2075  int result;
2076 
2077  /* Validate input parameters */
2078  if ((Src1 == NULL) || (Dest == NULL))
2079  return(-1);
2080  if (length == 0)
2081  return(0);
2082 
2083  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2084 
2085  /* MMX routine */
2086  SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
2087 
2088  /* Check for unaligned bytes */
2089  if ((length & 7) > 0) {
2090  /* Setup to process unaligned bytes */
2091  istart = length & 0xfffffff8;
2092  cursrc1 = &Src1[istart];
2093  curdest = &Dest[istart];
2094  } else {
2095  /* No unaligned bytes - we are done */
2096  return (0);
2097  }
2098  } else {
2099  /* Setup to process whole image */
2100  istart = 0;
2101  cursrc1 = Src1;
2102  curdest = Dest;
2103  }
2104 
2105  /* C routine to process image */
2106  iC = (int) C;
2107  for (i = istart; i < length; i++) {
2108  result = (int) (*cursrc1 / 2) + iC;
2109  if (result > 255)
2110  result = 255;
2111  *curdest = (unsigned char) result;
2112  /* Advance pointers */
2113  cursrc1++;
2114  curdest++;
2115  }
2116 
2117  return (0);
2118 }
2119 
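/* Worked example for the halve-then-add semantics above (illustrative only,
 * not part of the original source): with C = 60, a source byte of 200 yields
 * 200/2 + 60 = 160 and a source byte of 250 yields 125 + 60 = 185; any result
 * above 255 saturates to 255. A hypothetical call:
 *
 *     SDL_imageFilterAddByteToHalf(src, dest, length, 60);
 */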
2130 int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2131 {
2132 #ifdef USE_MMX
2133 #if !defined(GCC__)
2134  __asm
2135  {
2136  pusha
2137  /* ** Duplicate C in 8 bytes of MM1 ** */
2138  mov al, C /* load C into AL */
2139  mov ah, al /* copy AL into AH */
2140  mov bx, ax /* copy AX into BX */
2141  shl eax, 16 /* shift 2 bytes of EAX left */
2142  mov ax, bx /* copy BX into AX */
2143  movd mm1, eax /* copy EAX into MM1 */
2144  movd mm2, eax /* copy EAX into MM2 */
2145  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2146  mov eax, Src1 /* load Src1 address into eax */
2147  mov edi, Dest /* load Dest address into edi */
2148  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2149  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2150  align 16 /* 16 byte alignment of the loop entry */
2151 L1023:
2152  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2153  psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2154  movq [edi], mm0 /* store result in SrcDest */
2155  add eax, 8 /* increase Src1 register pointer by 8 */
2156  add edi, 8 /* increase Dest register pointer by 8 */
2157  dec ecx /* decrease loop counter */
2158  jnz L1023 /* check loop termination, proceed if required */
2159  emms /* exit MMX state */
2160  popa
2161  }
2162 #else
2163  /* i386 and x86_64 */
2164  __m64 *mSrc1 = (__m64*)Src1;
2165  __m64 *mDest = (__m64*)Dest;
2166  /* Duplicate C in 8 bytes of MM1 */
2167  int i;
2168  memset(&i, C, 4);
2169  __m64 mm1 = _m_from_int(i);
2170  __m64 mm2 = _m_from_int(i);
2171  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2172  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2173  for (i = 0; i < SrcLength/8; i++) {
2174  *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2175  mSrc1++;
2176  mDest++;
2177  }
2178  _m_empty(); /* clean MMX state */
2179 #endif
2180  return (0);
2181 #else
2182  return (-1);
2183 #endif
2184 }
2185 
2196 int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2197 {
2198  unsigned int i, istart;
2199  int iC;
2200  unsigned char *cursrc1;
2201  unsigned char *curdest;
2202  int result;
2203 
2204  /* Validate input parameters */
2205  if ((Src1 == NULL) || (Dest == NULL))
2206  return(-1);
2207  if (length == 0)
2208  return(0);
2209 
2210  /* Special case: C==0 */
2211  if (C == 0) {
2212  memcpy(Dest, Src1, length);
2213  return (0);
2214  }
2215 
2216  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2217 
2218  /* MMX routine */
2219  SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
2220 
2221  /* Check for unaligned bytes */
2222  if ((length & 7) > 0) {
2223  /* Setup to process unaligned bytes */
2224  istart = length & 0xfffffff8;
2225  cursrc1 = &Src1[istart];
2226  curdest = &Dest[istart];
2227  } else {
2228  /* No unaligned bytes - we are done */
2229  return (0);
2230  }
2231  } else {
2232  /* Setup to process whole image */
2233  istart = 0;
2234  cursrc1 = Src1;
2235  curdest = Dest;
2236  }
2237 
2238  /* C routine to process image */
2239  iC = (int) C;
2240  for (i = istart; i < length; i++) {
2241  result = (int) *cursrc1 - iC;
2242  if (result < 0)
2243  result = 0;
2244  *curdest = (unsigned char) result;
2245  /* Advance pointers */
2246  cursrc1++;
2247  curdest++;
2248  }
2249  return (0);
2250 }
2251 
2263 static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
2264 {
2265 #ifdef USE_MMX
2266 #if !defined(GCC__)
2267  __asm
2268  {
2269  pusha
2270  /* ** Duplicate (int)C in 8 bytes of MM1 ** */
2271  mov eax, C /* load C into EAX */
2272  movd mm1, eax /* copy EAX into MM1 */
2273  mov eax, D /* load D into EAX */
2274  movd mm2, eax /* copy EAX into MM2 */
2275  punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2276  mov eax, Src1 /* load Src1 address into eax */
2277  mov edi, Dest /* load Dest address into edi */
2278  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2279  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2280  align 16 /* 16 byte alignment of the loop entry */
2281 L11024:
2282  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2283  psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2284  movq [edi], mm0 /* store result in SrcDest */
2285  add eax, 8 /* increase Src1 register pointer by 8 */
2286  add edi, 8 /* increase Dest register pointer by 8 */
2287  dec ecx /* decrease loop counter */
2288  jnz L11024 /* check loop termination, proceed if required */
2289  emms /* exit MMX state */
2290  popa
2291  }
2292 #else
2293  /* i386 and x86_64 */
2294  __m64 *mSrc1 = (__m64*)Src1;
2295  __m64 *mDest = (__m64*)Dest;
2296  /* Duplicate (int)C in 8 bytes of MM1 */
2297  __m64 mm1 = _m_from_int(C);
2298  __m64 mm2 = _m_from_int(C);
2299  mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2300  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2301  int i;
2302  for (i = 0; i < SrcLength/8; i++) {
2303  *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2304  mSrc1++;
2305  mDest++;
2306  }
2307  _m_empty(); /* clean MMX state */
2308 #endif
2309  return (0);
2310 #else
2311  return (-1);
2312 #endif
2313 }
2314 
2325 int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
2326 {
2327  unsigned int i, j, istart, D;
2328  int iC[4];
2329  unsigned char *cursrc1;
2330  unsigned char *curdest;
2331  int result;
2332 
2333  /* Validate input parameters */
2334  if ((Src1 == NULL) || (Dest == NULL))
2335  return(-1);
2336  if (length == 0)
2337  return(0);
2338 
2339  /* Special case: C==0 */
2340  if (C == 0) {
2341  memcpy(Dest, Src1, length);
2342  return (0);
2343  }
2344 
2345  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2346 
2347  /* MMX routine */
2348  D=SWAP_32(C);
2349  SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
2350 
2351  /* Check for unaligned bytes */
2352  if ((length & 7) > 0) {
2353  /* Setup to process unaligned bytes */
2354  istart = length & 0xfffffff8;
2355  cursrc1 = &Src1[istart];
2356  curdest = &Dest[istart];
2357  } else {
2358  /* No unaligned bytes - we are done */
2359  return (0);
2360  }
2361  } else {
2362  /* Setup to process whole image */
2363  istart = 0;
2364  cursrc1 = Src1;
2365  curdest = Dest;
2366  }
2367 
2368  /* C routine to process image */
2369  iC[3] = (int) ((C >> 24) & 0xff);
2370  iC[2] = (int) ((C >> 16) & 0xff);
2371  iC[1] = (int) ((C >> 8) & 0xff);
2372  iC[0] = (int) ((C >> 0) & 0xff);
2373  for (i = istart; i < length; i += 4) {
2374  for (j = 0; j < 4; j++) {
2375  if ((i+j)<length) {
2376  result = (int) *cursrc1 - iC[j];
2377  if (result < 0) result = 0;
2378  *curdest = (unsigned char) result;
2379  /* Advance pointers */
2380  cursrc1++;
2381  curdest++;
2382  }
2383  }
2384  }
2385  return (0);
2386 }
2387 
2399 static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2400  unsigned char *Mask)
2401 {
2402 #ifdef USE_MMX
2403 #if !defined(GCC__)
2404  __asm
2405  {
2406  pusha
2407  mov edx, Mask /* load Mask address into edx */
2408  movq mm0, [edx] /* load Mask into mm0 */
2409  xor ecx, ecx /* zero ECX */
2410  mov cl, N /* load loop counter (N) into CL */
2411  movd mm3, ecx /* copy (N) into MM3 */
2412  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
2413 L10240: /* ** Prepare proper bit-Mask in MM1 ** */
2414  psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */
2415  pand mm1, mm0 /* apply Mask to 8 BYTES of MM1 */
2416  /* byte 0x0f, 0xdb, 0xc8 */
2417  dec cl /* decrease loop counter */
2418  jnz L10240 /* check loop termination, proceed if required */
2419  /* ** Shift all bytes of the image ** */
2420  mov eax, Src1 /* load Src1 address into eax */
2421  mov edi, Dest /* load Dest address into edi */
2422  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2423  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2424  align 16 /* 16 byte alignment of the loop entry */
2425 L10241:
2426  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2427  psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */
2428  pand mm0, mm1 /* apply proper bit-Mask to 8 BYTES of MM0 */
2429  /* byte 0x0f, 0xdb, 0xc1 */
2430  movq [edi], mm0 /* store result in SrcDest */
2431  add eax, 8 /* increase Src1 register pointer by 8 */
2432  add edi, 8 /* increase Dest register pointer by 8 */
2433  dec ecx /* decrease loop counter */
2434  jnz L10241 /* check loop termination, proceed if required */
2435  emms /* exit MMX state */
2436  popa
2437  }
2438 #else
2439  /* i386 and x86_64 */
2440  __m64 *mSrc1 = (__m64*)Src1;
2441  __m64 *mDest = (__m64*)Dest;
2442  __m64 *mMask = (__m64*)Mask;
2443  __m64 mm1;
2444  int i;
2445  mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
2446  /* Prepare proper bit-Mask in MM1 */
2447  for (i = 0; i < N; i++) {
2448  mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the right */
2449  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
2450  }
2451  /* Shift all bytes of the image */
2452  for (i = 0; i < SrcLength/8; i++) {
2453  __m64 mm0 = _m_psrlwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the right */
2454  *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
2455  mSrc1++;
2456  mDest++;
2457  }
2458  _m_empty(); /* clean MMX state */
2459 #endif
2460  return (0);
2461 #else
2462  return (-1);
2463 #endif
2464 }
2465 
2476 int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2477 {
2478  static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2479  unsigned int i, istart;
2480  unsigned char *cursrc1;
2481  unsigned char *curdest;
2482 
2483  /* Validate input parameters */
2484  if ((Src1 == NULL) || (Dest == NULL))
2485  return(-1);
2486  if (length == 0)
2487  return(0);
2488 
2489  /* Check shift */
2490  if (N > 8) {
2491  return (-1);
2492  }
2493 
2494  /* Special case: N==0 */
2495  if (N == 0) {
2496  memcpy(Dest, Src1, length);
2497  return (0);
2498  }
2499 
2500  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2501 
2502  /* MMX routine */
2503  SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
2504 
2505  /* Check for unaligned bytes */
2506  if ((length & 7) > 0) {
2507  /* Setup to process unaligned bytes */
2508  istart = length & 0xfffffff8;
2509  cursrc1 = &Src1[istart];
2510  curdest = &Dest[istart];
2511  } else {
2512  /* No unaligned bytes - we are done */
2513  return (0);
2514  }
2515  } else {
2516  /* Setup to process whole image */
2517  istart = 0;
2518  cursrc1 = Src1;
2519  curdest = Dest;
2520  }
2521 
2522  /* C routine to process image */
2523  for (i = istart; i < length; i++) {
2524  *curdest = (unsigned char) (*cursrc1 >> N);
2525  /* Advance pointers */
2526  cursrc1++;
2527  curdest++;
2528  }
2529 
2530  return (0);
2531 }
2532 
2543 static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
2544 {
2545 #ifdef USE_MMX
2546 #if !defined(GCC__)
2547  __asm
2548  {
2549  pusha
2550  mov eax, Src1 /* load Src1 address into eax */
2551  mov edi, Dest /* load Dest address into edi */
2552  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2553  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2554  align 16 /* 16 byte alignment of the loop entry */
2555 L13023:
2556  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2557  psrld mm0, N /* shift 2 DWORDS of MM0 (N) bits to the right */
2558  movq [edi], mm0 /* store result in SrcDest */
2559  add eax, 8 /* increase Src1 register pointer by 8 */
2560  add edi, 8 /* increase Dest register pointer by 8 */
2561  dec ecx /* decrease loop counter */
2562  jnz L13023 /* check loop termination, proceed if required */
2563  emms /* exit MMX state */
2564  popa
2565  }
2566 #else
2567  /* i386 and x86_64 */
2568  __m64 *mSrc1 = (__m64*)Src1;
2569  __m64 *mDest = (__m64*)Dest;
2570  int i;
2571  for (i = 0; i < SrcLength/8; i++) {
2572  *mDest = _m_psrldi(*mSrc1, N); /* shift 2 DWORDS of Src1 (N) bits to the right */
2573  mSrc1++;
2574  mDest++;
2575  }
2576  _m_empty(); /* clean MMX state */
2577 #endif
2578  return (0);
2579 #else
2580  return (-1);
2581 #endif
2582 }
2583 
2594 int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2595 {
2596  unsigned int i, istart;
2597  unsigned char *cursrc1, *curdest;
2598  unsigned int *icursrc1, *icurdest;
2599  unsigned int result;
2600 
2601  /* Validate input parameters */
2602  if ((Src1 == NULL) || (Dest == NULL))
2603  return(-1);
2604  if (length == 0)
2605  return(0);
2606 
2607  if (N > 32) {
2608  return (-1);
2609  }
2610 
2611  /* Special case: N==0 */
2612  if (N == 0) {
2613  memcpy(Dest, Src1, length);
2614  return (0);
2615  }
2616 
2617  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2618 
2619  SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
2620 
2621  /* Check for unaligned bytes */
2622  if ((length & 7) > 0) {
2623  /* Setup to process unaligned bytes */
2624  istart = length & 0xfffffff8;
2625  cursrc1 = &Src1[istart];
2626  curdest = &Dest[istart];
2627  } else {
2628  /* No unaligned bytes - we are done */
2629  return (0);
2630  }
2631  } else {
2632  /* Setup to process whole image */
2633  istart = 0;
2634  cursrc1 = Src1;
2635  curdest = Dest;
2636  }
2637 
2638  /* C routine to process image */
2639  icursrc1=(unsigned int *)cursrc1;
2640  icurdest=(unsigned int *)curdest;
2641  for (i = istart; i < length; i += 4) {
2642  if ((i+4)<length) {
2643  result = ((unsigned int)*icursrc1 >> N);
2644  *icurdest = result;
2645  }
2646  /* Advance pointers */
2647  icursrc1++;
2648  icurdest++;
2649  }
2650 
2651  return (0);
2652 }
2653 
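/* Note on semantics (illustrative sketch, not part of the original source):
 * unlike SDL_imageFilterShiftRight above, which shifts each byte separately,
 * this routine shifts whole little-endian 32-bit words, so bits cross byte
 * boundaries. Buffer contents below are hypothetical:
 *
 *     unsigned char buf[8] = { 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 };
 *     unsigned char out[8];
 *     SDL_imageFilterShiftRightUint(buf, out, 8, 4);
 *     // first word 0x00000100 >> 4 == 0x00000010,
 *     // so out begins { 0x10, 0x00, 0x00, 0x00, ... }
 */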
2664 static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2665 {
2666 #ifdef USE_MMX
2667 #if !defined(GCC__)
2668  __asm
2669  {
2670  pusha
2671  /* ** Duplicate C in 4 words of MM1 ** */
2672  mov al, C /* load C into AL */
2673  xor ah, ah /* zero AH */
2674  mov bx, ax /* copy AX into BX */
2675  shl eax, 16 /* shift 2 bytes of EAX left */
2676  mov ax, bx /* copy BX into AX */
2677  movd mm1, eax /* copy EAX into MM1 */
2678  movd mm2, eax /* copy EAX into MM2 */
2679  punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2680  pxor mm0, mm0 /* zero MM0 register */
2681  mov eax, Src1 /* load Src1 address into eax */
2682  mov edi, Dest /* load Dest address into edi */
2683  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2684  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2685  cmp al, 128 /* if (C <= 128) execute more efficient code */
2686  jg L10251
2687  align 16 /* 16 byte alignment of the loop entry */
2688 L10250:
2689  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2690  movq mm4, mm3 /* copy MM3 into MM4 */
2691  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2692  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2693  pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2694  pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2695  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2696  movq [edi], mm3 /* store result in Dest */
2697  add eax, 8 /* increase Src1 register pointer by 8 */
2698  add edi, 8 /* increase Dest register pointer by 8 */
2699  dec ecx /* decrease loop counter */
2700  jnz L10250 /* check loop termination, proceed if required */
2701  jmp L10252
2702  align 16 /* 16 byte alignment of the loop entry */
2703 L10251:
2704  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2705  movq mm4, mm3 /* copy MM3 into MM4 */
2706  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2707  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2708  pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2709  pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2710  /* ** Take abs value of the results (signed words) ** */
2711  movq mm5, mm3 /* copy mm3 into mm5 */
2712  movq mm6, mm4 /* copy mm4 into mm6 */
2713  psraw mm5, 15 /* fill mm5 words with word sign bit */
2714  psraw mm6, 15 /* fill mm6 words with word sign bit */
2715  pxor mm3, mm5 /* take 1's complement of only neg words */
2716  pxor mm4, mm6 /* take 1's complement of only neg words */
2717  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
2718  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
2719  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2720  movq [edi], mm3 /* store result in Dest */
2721  add eax, 8 /* increase Src1 register pointer by 8 */
2722  add edi, 8 /* increase Dest register pointer by 8 */
2723  dec ecx /* decrease loop counter */
2724  jnz L10251 /* check loop termination, proceed if required */
2725 L10252:
2726  emms /* exit MMX state */
2727  popa
2728  }
2729 #else
2730  /* i386 and x86_64 */
2731  __m64 *mSrc1 = (__m64*)Src1;
2732  __m64 *mDest = (__m64*)Dest;
2733  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2734  /* Duplicate C in 4 words of MM1 */
2735  int i;
2736  i = C | C<<16;
2737  __m64 mm1 = _m_from_int(i);
2738  __m64 mm2 = _m_from_int(i);
2739  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2740  // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
2741  //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2742  if (C <= 128) { /* if (C <= 128) execute more efficient code */
2743  for (i = 0; i < SrcLength/8; i++) {
2744  __m64 mm3, mm4;
2745  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2746  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2747  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2748  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2749  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2750  mSrc1++;
2751  mDest++;
2752  }
2753  } else {
2754  for (i = 0; i < SrcLength/8; i++) {
2755  __m64 mm3, mm4, mm5, mm6;
2756  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2757  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2758  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2759  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2760  /* Take abs value of the results (signed words) */
2761  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
2762  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
2763  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
2764  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
2765  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
2766  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
2767  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2768  mSrc1++;
2769  mDest++;
2770  }
2771  }
2772  _m_empty(); /* clean MMX state */
2773 #endif
2774  return (0);
2775 #else
2776  return (-1);
2777 #endif
2778 }
2779 
2790 int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2791 {
2792  unsigned int i, istart;
2793  int iC;
2794  unsigned char *cursrc1;
2795  unsigned char *curdest;
2796  int result;
2797 
2798  /* Validate input parameters */
2799  if ((Src1 == NULL) || (Dest == NULL))
2800  return(-1);
2801  if (length == 0)
2802  return(0);
2803 
2804  /* Special case: C==1 */
2805  if (C == 1) {
2806  memcpy(Dest, Src1, length);
2807  return (0);
2808  }
2809 
2810  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2811 
2812  SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
2813 
2814  /* Check for unaligned bytes */
2815  if ((length & 7) > 0) {
2816  /* Setup to process unaligned bytes */
2817  istart = length & 0xfffffff8;
2818  cursrc1 = &Src1[istart];
2819  curdest = &Dest[istart];
2820  } else {
2821  /* No unaligned bytes - we are done */
2822  return (0);
2823  }
2824  } else {
2825  /* Setup to process whole image */
2826  istart = 0;
2827  cursrc1 = Src1;
2828  curdest = Dest;
2829  }
2830 
2831  /* C routine to process image */
2832  iC = (int) C;
2833  for (i = istart; i < length; i++) {
2834  result = (int) *cursrc1 * iC;
2835  if (result > 255)
2836  result = 255;
2837  *curdest = (unsigned char) result;
2838  /* Advance pointers */
2839  cursrc1++;
2840  curdest++;
2841  }
2842 
2843  return (0);
2844 }
2845 
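/* Worked example for the saturated multiply above (illustrative only, not part
 * of the original source): with C = 2, a source byte of 100 becomes 200 and a
 * source byte of 150 becomes 300, which saturates to 255. A hypothetical call
 * doubling the brightness of a grayscale buffer:
 *
 *     SDL_imageFilterMultByByte(gray, out, length, 2);
 */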
2857 static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2858  unsigned char C)
2859 {
2860 #ifdef USE_MMX
2861 #if !defined(GCC__)
2862  __asm
2863  {
2864  pusha
2865  /* ** Duplicate C in 4 words of MM1 ** */
2866  mov al, C /* load C into AL */
2867  xor ah, ah /* zero AH */
2868  mov bx, ax /* copy AX into BX */
2869  shl eax, 16 /* shift 2 bytes of EAX left */
2870  mov ax, bx /* copy BX into AX */
2871  movd mm1, eax /* copy EAX into MM1 */
2872  movd mm2, eax /* copy EAX into MM2 */
2873  punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2874  xor ecx, ecx /* zero ECX */
2875  mov cl, N /* load N into CL */
2876  movd mm7, ecx /* copy N into MM7 */
2877  pxor mm0, mm0 /* zero MM0 register */
2878  mov eax, Src1 /* load Src1 address into eax */
2879  mov edi, Dest /* load Dest address into edi */
2880  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2881  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2882  align 16 /* 16 byte alignment of the loop entry */
2883 L1026:
2884  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2885  movq mm4, mm3 /* copy MM3 into MM4 */
2886  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2887  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2888  psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
2889  psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
2890  pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */
2891  pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */
2892  packuswb mm3, mm4 /* pack words back into bytes with saturation */
2893  movq [edi], mm3 /* store result in Dest */
2894  add eax, 8 /* increase Src1 register pointer by 8 */
2895  add edi, 8 /* increase Dest register pointer by 8 */
2896  dec ecx /* decrease loop counter */
2897  jnz L1026 /* check loop termination, proceed if required */
2898  emms /* exit MMX state */
2899  popa
2900  }
2901 #else
2902  /* i386 and x86_64 */
2903  __m64 *mSrc1 = (__m64*)Src1;
2904  __m64 *mDest = (__m64*)Dest;
2905  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2906  /* Duplicate C in 4 words of MM1 */
2907  int i;
2908  i = (C<<16)|C;
2909  __m64 mm1 = _m_from_int(i);
2910  __m64 mm2 = _m_from_int(i);
2911  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2912  for (i = 0; i < SrcLength/8; i++) {
2913  __m64 mm3, mm4, mm5, mm6;
2914  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2915  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2916  mm3 = _m_psrlwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the right */
2917  mm4 = _m_psrlwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the right */
2918  mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2919  mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2920  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2921  mSrc1++;
2922  mDest++;
2923  }
2924  _m_empty(); /* clean MMX state */
2925 #endif
2926  return (0);
2927 #else
2928  return (-1);
2929 #endif
2930 }
2931 
2943 int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
2944  unsigned char C)
2945 {
2946  unsigned int i, istart;
2947  int iC;
2948  unsigned char *cursrc1;
2949  unsigned char *curdest;
2950  int result;
2951 
2952  /* Validate input parameters */
2953  if ((Src1 == NULL) || (Dest == NULL))
2954  return(-1);
2955  if (length == 0)
2956  return(0);
2957 
2958  /* Check shift */
2959  if (N > 8) {
2960  return (-1);
2961  }
2962 
2963  /* Special case: N==0 && C==1 */
2964  if ((N == 0) && (C == 1)) {
2965  memcpy(Dest, Src1, length);
2966  return (0);
2967  }
2968 
2969  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2970 
2971  SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
2972 
2973  /* Check for unaligned bytes */
2974  if ((length & 7) > 0) {
2975  /* Setup to process unaligned bytes */
2976  istart = length & 0xfffffff8;
2977  cursrc1 = &Src1[istart];
2978  curdest = &Dest[istart];
2979  } else {
2980  /* No unaligned bytes - we are done */
2981  return (0);
2982  }
2983  } else {
2984  /* Setup to process whole image */
2985  istart = 0;
2986  cursrc1 = Src1;
2987  curdest = Dest;
2988  }
2989 
2990  /* C routine to process image */
2991  iC = (int) C;
2992  for (i = istart; i < length; i++) {
2993  result = (int) (*cursrc1 >> N) * iC;
2994  if (result > 255)
2995  result = 255;
2996  *curdest = (unsigned char) result;
2997  /* Advance pointers */
2998  cursrc1++;
2999  curdest++;
3000  }
3001 
3002  return (0);
3003 }
3004 
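/* Worked example for the shift-then-multiply semantics above (illustrative
 * only, not part of the original source): with N = 1 and C = 3, a source byte
 * of 100 becomes (100 >> 1) * 3 = 150 and a source byte of 200 becomes
 * 100 * 3 = 300, which saturates to 255. A hypothetical call:
 *
 *     SDL_imageFilterShiftRightAndMultByByte(src, dest, length, 1, 3);
 */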
3016 static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
3017  unsigned char *Mask)
3018 {
3019 #ifdef USE_MMX
3020 #if !defined(GCC__)
3021  __asm
3022  {
3023  pusha
3024  mov edx, Mask /* load Mask address into edx */
3025  movq mm0, [edx] /* load Mask into mm0 */
3026  xor ecx, ecx /* zero ECX */
3027  mov cl, N /* load loop counter (N) into CL */
3028  movd mm3, ecx /* copy (N) into MM3 */
3029  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3030 L10270: /* ** Prepare proper bit-Mask in MM1 ** */
3031  psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */
3032  pand mm1, mm0 /* apply Mask to 8 BYTES of MM1 */
3033  /* byte 0x0f, 0xdb, 0xc8 */
3034  dec cl /* decrease loop counter */
3035  jnz L10270 /* check loop termination, proceed if required */
3036  /* ** Shift all bytes of the image ** */
3037  mov eax, Src1 /* load Src1 address into eax */
3038  mov edi, Dest /* load SrcDest address into edi */
3039  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3040  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3041  align 16 /* 16 byte alignment of the loop entry */
3042 L10271:
3043  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3044  psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */
3045  pand mm0, mm1 /* apply proper bit-Mask to 8 BYTES of MM0 */
3046  /* byte 0x0f, 0xdb, 0xc1 */
3047  movq [edi], mm0 /* store result in Dest */
3048  add eax, 8 /* increase Src1 register pointer by 8 */
3049  add edi, 8 /* increase Dest register pointer by 8 */
3050  dec ecx /* decrease loop counter */
3051  jnz L10271 /* check loop termination, proceed if required */
3052  emms /* exit MMX state */
3053  popa
3054  }
3055 #else
3056  /* i386 and x86_64 */
3057  __m64 *mSrc1 = (__m64*)Src1;
3058  __m64 *mDest = (__m64*)Dest;
3059  __m64 *mMask = (__m64*)Mask;
3060  __m64 mm1;
3061  int i;
3062  mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3063  /* Prepare proper bit-Mask in MM1 */
3064  for (i = 0; i < N; i++) {
3065  mm1 = _m_psllwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the left */
3066  mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
3067  }
3068  /* ** Shift all bytes of the image ** */
3069  for (i = 0; i < SrcLength/8; i++) {
3070  __m64 mm0 = _m_psllwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the left */
3071  *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
3072  mSrc1++;
3073  mDest++;
3074  }
3075  _m_empty(); /* clean MMX state */
3076 #endif
3077  return (0);
3078 #else
3079  return (-1);
3080 #endif
3081 }
3082 
3093 int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3094 {
3095  static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
3096  unsigned int i, istart;
3097  unsigned char *cursrc1, *curdest;
3098  int result;
3099 
3100  /* Validate input parameters */
3101  if ((Src1 == NULL) || (Dest == NULL))
3102  return(-1);
3103  if (length == 0)
3104  return(0);
3105 
3106  if (N > 8) {
3107  return (-1);
3108  }
3109 
3110  /* Special case: N==0 */
3111  if (N == 0) {
3112  memcpy(Dest, Src1, length);
3113  return (0);
3114  }
3115 
3116  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3117 
3118  SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
3119 
3120  /* Check for unaligned bytes */
3121  if ((length & 7) > 0) {
3122  /* Setup to process unaligned bytes */
3123  istart = length & 0xfffffff8;
3124  cursrc1 = &Src1[istart];
3125  curdest = &Dest[istart];
3126  } else {
3127  /* No unaligned bytes - we are done */
3128  return (0);
3129  }
3130  } else {
3131  /* Setup to process whole image */
3132  istart = 0;
3133  cursrc1 = Src1;
3134  curdest = Dest;
3135  }
3136 
3137  /* C routine to process image */
3138  for (i = istart; i < length; i++) {
3139  result = ((int) *cursrc1 << N) & 0xff;
3140  *curdest = (unsigned char) result;
3141  /* Advance pointers */
3142  cursrc1++;
3143  curdest++;
3144  }
3145 
3146  return (0);
3147 }
3148 
3159 static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3160 {
3161 #ifdef USE_MMX
3162 #if !defined(GCC__)
3163  __asm
3164  {
3165  pusha
3166  mov eax, Src1 /* load Src1 address into eax */
3167  mov edi, Dest /* load Dest address into edi */
3168  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3169  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3170  align 16 /* 16 byte alignment of the loop entry */
3171 L12023:
3172  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3173  pslld mm0, N /* shift 2 DWORDS of MM0 (N) bits to the left */
3174  movq [edi], mm0 /* store result in SrcDest */
3175  add eax, 8 /* increase Src1 register pointer by 8 */
3176  add edi, 8 /* increase Dest register pointer by 8 */
3177  dec ecx /* decrease loop counter */
3178  jnz L12023 /* check loop termination, proceed if required */
3179  emms /* exit MMX state */
3180  popa
3181  }
3182 #else
3183  /* i386 and x86_64 */
3184  __m64 *mSrc1 = (__m64*)Src1;
3185  __m64 *mDest = (__m64*)Dest;
3186  int i;
3187  for (i = 0; i < SrcLength/8; i++) {
3188  *mDest = _m_pslldi(*mSrc1, N); /* shift 2 DWORDS of Src1 (N) bits to the left */
3189  mSrc1++;
3190  mDest++;
3191  }
3192  _m_empty(); /* clean MMX state */
3193 #endif
3194  return (0);
3195 #else
3196  return (-1);
3197 #endif
3198 }
3199 
3210 int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3211 {
3212  unsigned int i, istart;
3213  unsigned char *cursrc1, *curdest;
3214  unsigned int *icursrc1, *icurdest;
3215  unsigned int result;
3216 
3217  /* Validate input parameters */
3218  if ((Src1 == NULL) || (Dest == NULL))
3219  return(-1);
3220  if (length == 0)
3221  return(0);
3222 
3223  if (N > 32) {
3224  return (-1);
3225  }
3226 
3227  /* Special case: N==0 */
3228  if (N == 0) {
3229  memcpy(Dest, Src1, length);
3230  return (0);
3231  }
3232 
3233  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3234 
3235  SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
3236 
3237  /* Check for unaligned bytes */
3238  if ((length & 7) > 0) {
3239  /* Setup to process unaligned bytes */
3240  istart = length & 0xfffffff8;
3241  cursrc1 = &Src1[istart];
3242  curdest = &Dest[istart];
3243  } else {
3244  /* No unaligned bytes - we are done */
3245  return (0);
3246  }
3247  } else {
3248  /* Setup to process whole image */
3249  istart = 0;
3250  cursrc1 = Src1;
3251  curdest = Dest;
3252  }
3253 
3254  /* C routine to process image */
3255  icursrc1=(unsigned int *)cursrc1;
3256  icurdest=(unsigned int *)curdest;
3257  for (i = istart; i < length; i += 4) {
3258  if ((i+4)<length) {
3259  result = ((unsigned int)*icursrc1 << N);
3260  *icurdest = result;
3261  }
3262  /* Advance pointers */
3263  icursrc1++;
3264  icurdest++;
3265  }
3266 
3267  return (0);
3268 }
3269 
3280 static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3281 {
3282 #ifdef USE_MMX
3283 #if !defined(GCC__)
3284  __asm
3285  {
3286  pusha
3287  xor eax, eax /* zero EAX */
3288  mov al, N /* load N into AL */
3289  movd mm7, eax /* copy N into MM7 */
3290  pxor mm0, mm0 /* zero MM0 register */
3291  mov eax, Src1 /* load Src1 address into eax */
3292  mov edi, Dest /* load Dest address into edi */
3293  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3294  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3295  cmp al, 7 /* if (N <= 7) execute more efficient code */
3296  jg L10281
3297  align 16 /* 16 byte alignment of the loop entry */
3298 L10280:
3299  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3300  movq mm4, mm3 /* copy MM3 into MM4 */
3301  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3302  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3303  psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3304  psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3305  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3306  movq [edi], mm3 /* store result in Dest */
3307  add eax, 8 /* increase Src1 register pointer by 8 */
3308  add edi, 8 /* increase Dest register pointer by 8 */
3309  dec ecx /* decrease loop counter */
3310  jnz L10280 /* check loop termination, proceed if required */
3311  jmp L10282
3312  align 16 /* 16 byte alignment of the loop entry */
3313 L10281:
3314  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3315  movq mm4, mm3 /* copy MM3 into MM4 */
3316  punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3317  punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3318  psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3319  psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3320  /* ** Take abs value of the signed words ** */
3321  movq mm5, mm3 /* copy mm3 into mm5 */
3322  movq mm6, mm4 /* copy mm4 into mm6 */
3323  psraw mm5, 15 /* fill mm5 words with word sign bit */
3324  psraw mm6, 15 /* fill mm6 words with word sign bit */
3325  pxor mm3, mm5 /* take 1's complement of only neg words */
3326  pxor mm4, mm6 /* take 1's complement of only neg words */
3327  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3328  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3329  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3330  movq [edi], mm3 /* store result in Dest */
3331  add eax, 8 /* increase Src1 register pointer by 8 */
3332  add edi, 8 /* increase Dest register pointer by 8 */
3333  dec ecx /* decrease loop counter */
3334  jnz L10281 /* check loop termination, proceed if required */
3335 L10282:
3336  emms /* exit MMX state */
3337  popa
3338  }
3339 #else
3340  /* i386 and x86_64 */
3341  __m64 *mSrc1 = (__m64*)Src1;
3342  __m64 *mDest = (__m64*)Dest;
3343  __m64 mm0 = _m_from_int(0); /* zero mm0 register */
3344  int i;
3345  if (N <= 7) { /* if (N <= 7) execute more efficient code */
3346  for (i = 0; i < SrcLength/8; i++) {
3347  __m64 mm3, mm4;
3348  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3349  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3350  mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3351  mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3352  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3353  mSrc1++;
3354  mDest++;
3355  }
3356  } else {
3357  for (i = 0; i < SrcLength/8; i++) {
3358  __m64 mm3, mm4, mm5, mm6;
3359  mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3360  mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3361  mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3362  mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3363  /* Take abs value of the signed words */
3364  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3365  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3366  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
3367  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
3368  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3369  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3370  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3371  mSrc1++;
3372  mDest++;
3373  }
3374  }
3375  _m_empty(); /* clean MMX state */
3376 #endif
3377  return (0);
3378 #else
3379  return (-1);
3380 #endif
3381 }
3382 
3393 int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3394 {
3395  unsigned int i, istart;
3396  unsigned char *cursrc1, *curdest;
3397  int result;
3398 
3399  /* Validate input parameters */
3400  if ((Src1 == NULL) || (Dest == NULL))
3401  return(-1);
3402  if (length == 0)
3403  return(0);
3404 
3405  if (N > 8) {
3406  return (-1);
3407  }
3408 
3409  /* Special case: N==0 */
3410  if (N == 0) {
3411  memcpy(Dest, Src1, length);
3412  return (0);
3413  }
3414 
3415  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3416 
3417  SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
3418 
3419  /* Check for unaligned bytes */
3420  if ((length & 7) > 0) {
3421  /* Setup to process unaligned bytes */
3422  istart = length & 0xfffffff8;
3423  cursrc1 = &Src1[istart];
3424  curdest = &Dest[istart];
3425  } else {
3426  /* No unaligned bytes - we are done */
3427  return (0);
3428  }
3429  } else {
3430  /* Setup to process whole image */
3431  istart = 0;
3432  cursrc1 = Src1;
3433  curdest = Dest;
3434  }
3435 
3436  /* C routine to process image */
3437  for (i = istart; i < length; i++) {
3438  result = (int) *cursrc1 << N;
3439  if (result > 255)
3440  result = 255;
3441  *curdest = (unsigned char) result;
3442  /* Advance pointers */
3443  cursrc1++;
3444  curdest++;
3445  }
3446 
3447  return (0);
3448 }
3449 
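/* Note on semantics (illustrative comparison, not part of the original source):
 * SDL_imageFilterShiftLeftByte above masks each result to 8 bits, so bits
 * shifted out are simply lost, while SDL_imageFilterShiftLeft saturates at 255.
 * For a hypothetical source byte 0x90 (144) and N = 1:
 *
 *     // SDL_imageFilterShiftLeftByte: (0x90 << 1) & 0xff = 0x20 (32)
 *     // SDL_imageFilterShiftLeft:     min(144 * 2, 255)  = 255
 */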
3460 static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
3461 {
3462 #ifdef USE_MMX
3463 #if !defined(GCC__)
3464  __asm
3465  {
3466  pusha
3467  /* ** Duplicate T in 8 bytes of MM3 ** */
3468  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3469  pcmpeqb mm2, mm2 /* generate all 1's in mm2 */
3470  mov al, T /* load T into AL */
3471  mov ah, al /* copy AL into AH */
3472  mov bx, ax /* copy AX into BX */
3473  shl eax, 16 /* shift 2 bytes of EAX left */
3474  mov ax, bx /* copy BX into AX */
3475  movd mm3, eax /* copy EAX into MM3 */
3476  movd mm4, eax /* copy EAX into MM4 */
3477  punpckldq mm3, mm4 /* fill higher bytes of MM3 with T */
3478  psubusb mm2, mm3 /* store 0xFF - T in MM2 */
3479  mov eax, Src1 /* load Src1 address into eax */
3480  mov edi, Dest /* load Dest address into edi */
3481  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3482  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3483  align 16 /* 16 byte alignment of the loop entry */
3484 L1029:
3485  movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3486  paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
3487  pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */
3488  movq [edi], mm0 /* store result in SrcDest */
3489  add eax, 8 /* increase Src1 register pointer by 8 */
3490  add edi, 8 /* increase Dest register pointer by 8 */
3491  dec ecx /* decrease loop counter */
3492  jnz L1029 /* check loop termination, proceed if required */
3493  emms /* exit MMX state */
3494  popa
3495  }
3496 #else
3497  /* i386 and x86_64 */
3498  __m64 *mSrc1 = (__m64*)Src1;
3499  __m64 *mDest = (__m64*)Dest;
3500  /* Duplicate T in 8 bytes of MM3 */
3501  __m64 mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3502  __m64 mm2 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm2 */
3503  int i;
3504  memset(&i, T, 4);
3505  __m64 mm3 = _m_from_int(i);
3506  __m64 mm4 = _m_from_int(i);
3507  mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with T */
3508  mm2 = _m_psubusb(mm2, mm3); /* store 0xFF - T in MM2 */
3509  //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3510  for (i = 0; i < SrcLength/8; i++) {
3511  __m64 mm0 = _m_paddusb(*mSrc1, mm2); /* Src1+(0xFF-T) (add 8 bytes with saturation) */
3512  *mDest = _m_pcmpeqb(mm0, mm1); /* binarize 255:0, comparing to 255 */
3513  mSrc1++;
3514  mDest++;
3515  }
3516  _m_empty(); /* clean MMX state */
3517 #endif
3518  return (0);
3519 #else
3520  return (-1);
3521 #endif
3522 }
3523 
3534 int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
3535 {
3536  unsigned int i, istart;
3537  unsigned char *cursrc1;
3538  unsigned char *curdest;
3539 
3540  /* Validate input parameters */
3541  if ((Src1 == NULL) || (Dest == NULL))
3542  return(-1);
3543  if (length == 0)
3544  return(0);
3545 
3546  /* Special case: T==0 */
3547  if (T == 0) {
3548  memset(Dest, 255, length);
3549  return (0);
3550  }
3551 
3552  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3553 
3554  SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
3555 
3556  /* Check for unaligned bytes */
3557  if ((length & 7) > 0) {
3558  /* Setup to process unaligned bytes */
3559  istart = length & 0xfffffff8;
3560  cursrc1 = &Src1[istart];
3561  curdest = &Dest[istart];
3562  } else {
3563  /* No unaligned bytes - we are done */
3564  return (0);
3565  }
3566  } else {
3567  /* Setup to process whole image */
3568  istart = 0;
3569  cursrc1 = Src1;
3570  curdest = Dest;
3571  }
3572 
3573  /* C routine to process image */
3574  for (i = istart; i < length; i++) {
3575  *curdest = (unsigned char)(((unsigned char)*cursrc1 >= T) ? 255 : 0);
3576  /* Advance pointers */
3577  cursrc1++;
3578  curdest++;
3579  }
3580 
3581  return (0);
3582 }
3583 
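/* Usage sketch (illustrative only, not part of the original source; buffer
 * contents are hypothetical): build a 255/0 mask with threshold 128, where
 * values >= 128 map to 255 and values below map to 0:
 *
 *     unsigned char gray[8] = { 5, 127, 128, 200, 0, 255, 64, 130 };
 *     unsigned char mask[8];
 *     SDL_imageFilterBinarizeUsingThreshold(gray, mask, 8, 128);
 *     // mask = { 0, 0, 255, 255, 0, 255, 0, 255 }
 */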
3595 static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
3596  unsigned char Tmax)
3597 {
3598 #ifdef USE_MMX
3599 #if !defined(GCC__)
3600  __asm
3601  {
3602  pusha
3603  pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3604  /* ** Duplicate Tmax in 8 bytes of MM3 ** */
3605  mov al, Tmax /* load Tmax into AL */
3606  mov ah, al /* copy AL into AH */
3607  mov bx, ax /* copy AX into BX */
3608  shl eax, 16 /* shift 2 bytes of EAX left */
3609  mov ax, bx /* copy BX into AX */
3610  movd mm3, eax /* copy EAX into MM3 */
3611  movd mm4, eax /* copy EAX into MM4 */
3612  punpckldq mm3, mm4 /* fill higher bytes of MM3 with Tmax */
3613  psubusb mm1, mm3 /* store 0xFF - Tmax in MM1 */
3614  /* ** Duplicate Tmin in 8 bytes of MM5 ** */
3615  mov al, Tmin /* load Tmin into AL */
3616  mov ah, al /* copy AL into AH */
3617  mov bx, ax /* copy AX into BX */
3618  shl eax, 16 /* shift 2 bytes of EAX left */
3619  mov ax, bx /* copy BX into AX */
3620  movd mm5, eax /* copy EAX into MM5 */
3621  movd mm4, eax /* copy EAX into MM4 */
3622  punpckldq mm5, mm4 /* fill higher bytes of MM5 with Tmin */
3623  movq mm7, mm5 /* copy MM5 into MM7 */
3624  paddusb mm7, mm1 /* store 0xFF - Tmax + Tmin in MM7 */
3625  mov eax, Src1 /* load Src1 address into eax */
3626  mov edi, Dest /* load Dest address into edi */
3627  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3628  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3629  align 16 /* 16 byte alignment of the loop entry */
3630 L1030:
3631  movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3632  paddusb mm0, mm1 /* MM0=SrcDest+(0xFF-Tmax) */
3633  psubusb mm0, mm7 /* MM0=MM0-(0xFF-Tmax+Tmin) */
3634  paddusb mm0, mm5 /* MM0=MM0+Tmin */
3635  movq [edi], mm0 /* store result in Dest */
3636  add eax, 8 /* increase Src1 register pointer by 8 */
3637  add edi, 8 /* increase Dest register pointer by 8 */
3638  dec ecx /* decrease loop counter */
3639  jnz L1030 /* check loop termination, proceed if required */
3640  emms /* exit MMX state */
3641  popa
3642  }
3643 #else
3644  /* i386 and x86_64 */
3645  __m64 *mSrc1 = (__m64*)Src1;
3646  __m64 *mDest = (__m64*)Dest;
3647  __m64 mm1 = _m_pcmpeqb(_m_from_int(0), _m_from_int(0)); /* generate all 1's in mm1 */
3648  int i;
3649  /* Duplicate Tmax in 8 bytes of MM3 */
3650  __m64 mm3, mm4;
3651  memset(&i, Tmax, 4);
3652  mm3 = _m_from_int(i);
3653  mm4 = _m_from_int(i);
3654  mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with Tmax */
3655  mm1 = _m_psubusb(mm1, mm3); /* store 0xFF - Tmax in MM1 */
3656  //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3657  /* Duplicate Tmin in 8 bytes of MM5 */
3658  __m64 mm5, mm7;
3659  memset(&i, Tmin, 4);
3660  mm5 = _m_from_int(i);
3661  mm4 = _m_from_int(i);
3662  mm5 = _m_punpckldq(mm5, mm4); /* fill higher bytes of MM5 with Tmin */
3663  mm7 = _m_paddusb(mm5, mm1); /* store 0xFF - Tmax + Tmin in MM7 */
3664  for (i = 0; i < SrcLength/8; i++) {
3665  __m64 mm0;
3666  mm0 = _m_paddusb(*mSrc1, mm1); /* MM0=Src1+(0xFF-Tmax) */
3667  mm0 = _m_psubusb(mm0, mm7); /* MM0=MM0-(0xFF-Tmax+Tmin) */
3668  *mDest = _m_paddusb(mm0, mm5); /* MM0+Tmin */
3669  mSrc1++;
3670  mDest++;
3671  }
3672  _m_empty(); /* clean MMX state */
3673 #endif
3674  return (0);
3675 #else
3676  return (-1);
3677 #endif
3678 }
3679 
3691 int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
3692  unsigned char Tmax)
3693 {
3694  unsigned int i, istart;
3695  unsigned char *cursrc1;
3696  unsigned char *curdest;
3697 
3698  /* Validate input parameters */
3699  if ((Src1 == NULL) || (Dest == NULL))
3700  return(-1);
3701  if (length == 0)
3702  return(0);
3703 
3704  /* Special case: Tmin==0 && Tmax==255 */
3705  if ((Tmin == 0) && (Tmax == 255)) {
3706  memcpy(Dest, Src1, length);
3707  return (0);
3708  }
3709 
3710  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3711 
3712  SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
3713 
3714  /* Check for unaligned bytes */
3715  if ((length & 7) > 0) {
3716  /* Setup to process unaligned bytes */
3717  istart = length & 0xfffffff8;
3718  cursrc1 = &Src1[istart];
3719  curdest = &Dest[istart];
3720  } else {
3721  /* No unaligned bytes - we are done */
3722  return (0);
3723  }
3724  } else {
3725  /* Setup to process whole image */
3726  istart = 0;
3727  cursrc1 = Src1;
3728  curdest = Dest;
3729  }
3730 
3731  /* C routine to process image */
3732  for (i = istart; i < length; i++) {
3733  if (*cursrc1 < Tmin) {
3734  *curdest = Tmin;
3735  } else if (*cursrc1 > Tmax) {
3736  *curdest = Tmax;
3737  } else {
3738  *curdest = *cursrc1;
3739  }
3740  /* Advance pointers */
3741  cursrc1++;
3742  curdest++;
3743  }
3744 
3745  return (0);
3746 }
3747 
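/* Usage sketch (illustrative only, not part of the original source): clamp
 * every byte into the range [16, 240], e.g. for studio-swing video levels;
 * the buffer names are hypothetical:
 *
 *     SDL_imageFilterClipToRange(src, dest, length, 16, 240);
 *     // 5 -> 16, 100 -> 100, 250 -> 240
 */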
3761 static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
3762  int Nmin, int Nmax)
3763 {
3764 #ifdef USE_MMX
3765 #if !defined(GCC__)
3766  __asm
3767  {
3768  pusha
3769  mov ax, WORD PTR Nmax /* load Nmax in AX */
3770  mov bx, WORD PTR Cmax /* load Cmax in BX */
3771  sub ax, WORD PTR Nmin /* AX = Nmax - Nmin */
3772  sub bx, WORD PTR Cmin /* BX = Cmax - Cmin */
3773  jz L10311 /* check division by zero */
3774  xor dx, dx /* prepare for division, zero DX */
3775  div bx /* AX = AX/BX */
3776  jmp L10312
3777 L10311:
3778  mov ax, 255 /* if div by zero, assume result max byte value */
3779 L10312: /* ** Duplicate AX in 4 words of MM0 ** */
3780  mov bx, ax /* copy AX into BX */
3781  shl eax, 16 /* shift 2 bytes of EAX left */
3782  mov ax, bx /* copy BX into AX */
3783  movd mm0, eax /* copy EAX into MM0 */
3784  movd mm1, eax /* copy EAX into MM1 */
3785  punpckldq mm0, mm1 /* fill higher words of MM0 with AX */
3786  /* ** Duplicate Cmin in 4 words of MM1 ** */
3787  mov ax, WORD PTR Cmin /* load Cmin into AX */
3788  mov bx, ax /* copy AX into BX */
3789  shl eax, 16 /* shift 2 bytes of EAX left */
3790  mov ax, bx /* copy BX into AX */
3791  movd mm1, eax /* copy EAX into MM1 */
3792  movd mm2, eax /* copy EAX into MM2 */
3793  punpckldq mm1, mm2 /* fill higher words of MM1 with Cmin */
3794  /* ** Duplicate Nmin in 4 words of MM2 ** */
3795  mov ax, WORD PTR Nmin /* load Nmin into AX */
3796  mov bx, ax /* copy AX into BX */
3797  shl eax, 16 /* shift 2 bytes of EAX left */
3798  mov ax, bx /* copy BX into AX */
3799  movd mm2, eax /* copy EAX into MM2 */
3800  movd mm3, eax /* copy EAX into MM3 */
3801  punpckldq mm2, mm3 /* fill higher words of MM2 with Nmin */
3802  pxor mm7, mm7 /* zero MM7 register */
3803  mov eax, Src1 /* load Src1 address into eax */
3804  mov edi, Dest /* load Dest address into edi */
3805  mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3806  shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3807  align 16 /* 16 byte alignment of the loop entry */
3808 L1031:
3809  movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3810  movq mm4, mm3 /* copy MM3 into MM4 */
3811  punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */
3812  punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */
3813  psubusb mm3, mm1 /* S-Cmin, low bytes */
3814  psubusb mm4, mm1 /* S-Cmin, high bytes */
3815  pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */
3816  pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */
3817  paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */
3818  paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */
3819  /* ** Take abs value of the signed words ** */
3820  movq mm5, mm3 /* copy mm3 into mm5 */
3821  movq mm6, mm4 /* copy mm4 into mm6 */
3822  psraw mm5, 15 /* fill mm5 words with word sign bit */
3823  psraw mm6, 15 /* fill mm6 words with word sign bit */
3824  pxor mm3, mm5 /* take 1's complement of only neg words */
3825  pxor mm4, mm6 /* take 1's complement of only neg words */
3826  psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3827  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3828  packuswb mm3, mm4 /* pack words back into bytes with saturation */
3829  movq [edi], mm3 /* store result in Dest */
3830  add eax, 8 /* increase Src1 register pointer by 8 */
3831  add edi, 8 /* increase Dest register pointer by 8 */
3832  dec ecx /* decrease loop counter */
3833  jnz L1031 /* check loop termination, proceed if required */
3834  emms /* exit MMX state */
3835  popa
3836  }
3837 #else
3838  /* i386 and x86_64 */
3839  __m64 *mSrc1 = (__m64*)Src1;
3840  __m64 *mDest = (__m64*)Dest;
3841  __m64 mm0, mm1, mm2, mm3;
3842 
3843  int i;
3844  /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
3845  unsigned short a = Nmax - Nmin;
3846  unsigned short b = Cmax - Cmin;
3847  if (b == 0) {
3848  a = 255;
3849  } else {
3850  a /= b;
3851  }
3852  i = (a<<16)|a;
3853  mm0 = _m_from_int(i);
3854  mm1 = _m_from_int(i);
3855  mm0 = _m_punpckldq(mm0, mm1); /* fill higher words of MM0 with AX */
3856  /* Duplicate Cmin in 4 words of MM1 */
3857  i = (Cmin<<16)|(short)Cmin;
3858  mm1 = _m_from_int(i);
3859  mm2 = _m_from_int(i);
3860  mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with Cmin */
3861  /* Duplicate Nmin in 4 words of MM2 */
3862  i = (Nmin<<16)|(short)Nmin;
3863  mm2 = _m_from_int(i);
3864  mm3 = _m_from_int(i);
3865  mm2 = _m_punpckldq(mm2, mm3); /* fill higher words of MM2 with Nmin */
3866  __m64 mm7 = _m_from_int(0); /* zero MM7 register */
3867  for (i = 0; i < SrcLength/8; i++) {
3868  __m64 mm3, mm4, mm5, mm6;
3869  mm3 = _m_punpcklbw(*mSrc1, mm7); /* unpack low bytes of Src1 into words */
3870  mm4 = _m_punpckhbw(*mSrc1, mm7); /* unpack high bytes of Src1 into words */
3871  mm3 = _m_psubusb(mm3, mm1); /* S-Cmin, low bytes */
3872  mm4 = _m_psubusb(mm4, mm1); /* S-Cmin, high bytes */
3873  mm3 = _m_pmullw(mm3, mm0); /* MM0*(S-Cmin), low bytes */
3874  mm4 = _m_pmullw(mm4, mm0); /* MM0*(S-Cmin), high bytes */
3875  mm3 = _m_paddusb(mm3, mm2); /* MM0*(S-Cmin)+Nmin, low bytes */
3876  mm4 = _m_paddusb(mm4, mm2); /* MM0*(S-Cmin)+Nmin, high bytes */
3877  /* Take abs value of the signed words */
3878  mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3879  mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3880  mm3 = _m_pxor(mm3, mm5); /* take 1's complement of only neg. words */
3881  mm4 = _m_pxor(mm4, mm6); /* take 1's complement of only neg. words */
3882  mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3883  mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3884  *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3885  mSrc1++;
3886  mDest++;
3887  }
3888  _m_empty(); /* clean MMX state */
3889 #endif
3890  return (0);
3891 #else
3892  return (-1);
3893 #endif
3894 }
3895 
3909 int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
3910  int Nmax)
3911 {
3912  unsigned int i, istart;
3913  unsigned char *cursrc;
3914  unsigned char *curdest;
3915  int dN, dC, factor;
3916  int result;
3917 
3918  /* Validate input parameters */
3919  if ((Src == NULL) || (Dest == NULL))
3920  return(-1);
3921  if (length == 0)
3922  return(0);
3923 
3924  if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3925 
3926  SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);
3927 
3928  /* Check for unaligned bytes */
3929  if ((length & 7) > 0) {
3930  /* Setup to process unaligned bytes */
3931  istart = length & 0xfffffff8;
3932  cursrc = &Src[istart];
3933  curdest = &Dest[istart];
3934  } else {
3935  /* No unaligned bytes - we are done */
3936  return (0);
3937  }
3938  } else {
3939  /* Setup to process whole image */
3940  istart = 0;
3941  cursrc = Src;
3942  curdest = Dest;
3943  }
3944 
3945  /* C routine to process image */
3946  dC = Cmax - Cmin;
3947  if (dC == 0)
3948  return (0);
3949  dN = Nmax - Nmin;
3950  factor = dN / dC;
3951  for (i = istart; i < length; i++) {
3952  result = factor * ((int) (*cursrc) - Cmin) + Nmin;
3953  if (result > 255)
3954  result = 255;
3955  *curdest = (unsigned char) result;
3956  /* Advance pointers */
3957  cursrc++;
3958  curdest++;
3959  }
3960 
3961  return (0);
3962 }
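
A hedged usage sketch for the wrapper above: stretching a grayscale buffer whose measured range is 32..200 toward 0..255 (the buffer names and the measured range are illustrative, not from the library):

    /* Illustrative only: stretch an 8-bit buffer toward the full 0..255 range.
       Returns 0 on success, -1 if Src/Dest is NULL (per the checks above).
       Note that the scale factor is the integer quotient (Nmax-Nmin)/(Cmax-Cmin),
       so it truncates; here 255/168 == 1. */
    static int stretch_contrast(unsigned char *pixels, unsigned char *out,
                                unsigned int n)
    {
        return SDL_imageFilterNormalizeLinear(pixels, out, n, 32, 200, 0, 255);
    }
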
3963 
3964 /* ------------------------------------------------------------------------------------ */
3965 
3980 int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
3981  signed short *Kernel, unsigned char Divisor)
3982 {
3983  /* Validate input parameters */
3984  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
3985  return(-1);
3986 
3987  if ((columns < 3) || (rows < 3) || (Divisor == 0))
3988  return (-1);
3989 
3990  if ((SDL_imageFilterMMXdetect())) {
3991 //#ifdef USE_MMX
3992 #if defined(USE_MMX) && defined(i386)
3993 #if !defined(GCC__)
3994  __asm
3995  {
3996  pusha
3997  pxor mm0, mm0 /* zero MM0 */
3998  xor ebx, ebx /* zero EBX */
3999  mov bl, Divisor /* load Divisor into BL */
4000  mov edx, Kernel /* load Kernel address into EDX */
4001  movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
4002  add edx, 8 /* second row |K0 K1 K2 0| */
4003  movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4004  add edx, 8 /* third row |K6 K7 K8 0| */
4005  movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
4006  /* ---, */
4007  mov eax, columns /* load columns into EAX */
4008  mov esi, Src /* ESI = Src row 0 address */
4009  mov edi, Dest /* load Dest address to EDI */
4010  add edi, eax /* EDI = EDI + columns */
4011  inc edi /* 1 byte offset from the left edge */
4012  mov edx, rows /* initialize ROWS counter */
4013  sub edx, 2 /* do not use first and last row */
4014  /* ---, */
4015 L10320:
4016  mov ecx, eax /* initialize COLUMNS counter */
4017  sub ecx, 2 /* do not use first and last column */
4018  align 16 /* 16 byte alignment of the loop entry */
4019 L10322:
4020  /* ---, */
4021  movq mm1, [esi] /* load 8 bytes of the image first row */
4022  add esi, eax /* move one row below */
4023  movq mm2, [esi] /* load 8 bytes of the image second row */
4024  add esi, eax /* move one row below */
4025  movq mm3, [esi] /* load 8 bytes of the image third row */
4026  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4027  punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
4028  punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
4029  pmullw mm1, mm5 /* multiply words first row image*Kernel */
4030  pmullw mm2, mm6 /* multiply words second row image*Kernel */
4031  pmullw mm3, mm7 /* multiply words third row image*Kernel */
4032  paddsw mm1, mm2 /* add 4 words of the first and second rows */
4033  paddsw mm1, mm3 /* add 4 words of the third row and result */
4034  movq mm2, mm1 /* copy MM1 into MM2 */
4035  psrlq mm1, 32 /* shift 2 left words to the right */
4036  paddsw mm1, mm2 /* add 2 left and 2 right result words */
4037  movq mm3, mm1 /* copy MM1 into MM3 */
4038  psrlq mm1, 16 /* shift 1 left word to the right */
4039  paddsw mm1, mm3 /* add 1 left and 1 right result words */
4040  /* --, */
4041  movd mm2, eax /* save EAX in MM2 */
4042  movd mm3, edx /* save EDX in MM3 */
4043  movd eax, mm1 /* copy MM1 into EAX */
4044  psraw mm1, 15 /* spread sign bit of the result */
4045  movd edx, mm1 /* fill EDX with a sign bit */
4046  idiv bx /* IDIV - VERY EXPENSIVE */
4047  movd mm1, eax /* move result of division into MM1 */
4048  packuswb mm1, mm0 /* pack division result with saturation */
4049  movd eax, mm1 /* copy saturated result into EAX */
4050  mov [edi], al /* copy a byte result into Dest */
4051  movd edx, mm3 /* restore saved EDX */
4052  movd eax, mm2 /* restore saved EAX */
4053  /* --, */
4054  sub esi, eax /* move two rows up */
4055  sub esi, eax /* */
4056  inc esi /* move Src pointer to the next pixel */
4057  inc edi /* move Dest pointer to the next pixel */
4058  /* ---, */
4059  dec ecx /* decrease loop counter COLUMNS */
4060  jnz L10322 /* check loop termination, proceed if required */
4061  add esi, 2 /* move to the next row in Src */
4062  add edi, 2 /* move to the next row in Dest */
4063  dec edx /* decrease loop counter ROWS */
4064  jnz L10320 /* check loop termination, proceed if required */
4065  /* ---, */
4066  emms /* exit MMX state */
4067  popa
4068  }
4069 #else
4070  asm volatile
4071  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4072  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4073  "mov %5, %%bl \n\t" /* load Divisor into BL */
4074  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4075  "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
4076  "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
4077  "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4078  "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
4079  "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
4080  /* --- */
4081  "mov %3, %%eax \n\t" /* load columns into EAX */
4082  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
4083  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4084  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
4085  "inc %%edi \n\t" /* 1 byte offset from the left edge */
4086  "mov %2, %%edx \n\t" /* initialize ROWS counter */
4087  "sub $2, %%edx \n\t" /* do not use first and last row */
4088  /* --- */
4089  ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
4090  "sub $2, %%ecx \n\t" /* do not use first and last column */
4091  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4092  ".L10322: \n\t"
4093  /* --- */
4094  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
4095  "add %%eax, %%esi \n\t" /* move one row below */
4096  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
4097  "add %%eax, %%esi \n\t" /* move one row below */
4098  "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
4099  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4100  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
4101  "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
4102  "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
4103  "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
4104  "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
4105  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
4106  "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
4107  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4108  "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
4109  "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
4110  "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
4111  "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
4112  "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
4113  /* -- */
4114  "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */
4115  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4116  "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */
4117  "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */
4118  "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */
4119  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4120  "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */
4121  "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */
4122  "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */
4123  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4124  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4125  "movd %%mm2, %%eax \n\t" /* restore saved EAX */
4126  /* -- */
4127  "sub %%eax, %%esi \n\t" /* move two rows up */
4128  "sub %%eax, %%esi \n\t" /* */
4129  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4130  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4131  /* --- */
4132  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4133  "jnz .L10322 \n\t" /* check loop termination, proceed if required */
4134  "add $2, %%esi \n\t" /* move to the next row in Src */
4135  "add $2, %%edi \n\t" /* move to the next row in Dest */
4136  "dec %%edx \n\t" /* decrease loop counter ROWS */
4137  "jnz .L10320 \n\t" /* check loop termination, proceed if required */
4138  /* --- */
4139  "emms \n\t" /* exit MMX state */
4140  "popa \n\t":"=m" (Dest) /* %0 */
4141  :"m"(Src), /* %1 */
4142  "m"(rows), /* %2 */
4143  "m"(columns), /* %3 */
4144  "m"(Kernel), /* %4 */
4145  "m"(Divisor) /* %5 */
4146  );
4147 #endif
4148 #endif
4149  return (0);
4150  } else {
4151  /* No non-MMX implementation yet */
4152  return (-1);
4153  }
4154 }
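
The register comments above (MM5 = {0,K2,K1,K0}, each kernel row advanced by 8 bytes) indicate that a 3x3 kernel row occupies four int16 values, the fourth being zero padding. A hedged usage sketch under that reading, applying a 3x3 box blur with Divisor = 9 (buffer and size names are illustrative):

    /* Hedged sketch: 3x3 box blur via the MMX convolution above.
       Kernel layout inferred from the assembly: 3 rows x 4 int16, the last
       entry of each row is zero padding.  Border pixels are left untouched. */
    static int box_blur_3x3(unsigned char *img, unsigned char *out,
                            int rows, int columns)
    {
        signed short kernel[3 * 4] = {
            1, 1, 1, 0,
            1, 1, 1, 0,
            1, 1, 1, 0
        };
        return SDL_imageFilterConvolveKernel3x3Divide(img, out, rows, columns,
                                                      kernel, 9);
    }
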
4155 
4170 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4171  signed short *Kernel, unsigned char Divisor)
4172 {
4173  /* Validate input parameters */
4174  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4175  return(-1);
4176 
4177  if ((columns < 5) || (rows < 5) || (Divisor == 0))
4178  return (-1);
4179 
4180  if ((SDL_imageFilterMMXdetect())) {
4181 //#ifdef USE_MMX
4182 #if defined(USE_MMX) && defined(i386)
4183 #if !defined(GCC__)
4184  __asm
4185  {
4186  pusha
4187  pxor mm0, mm0 /* zero MM0 */
4188  xor ebx, ebx /* zero EBX */
4189  mov bl, Divisor /* load Divisor into BL */
4190  movd mm5, ebx /* copy Divisor into MM5 */
4191  mov edx, Kernel /* load Kernel address into EDX */
4192  mov esi, Src /* load Src address to ESI */
4193  mov edi, Dest /* load Dest address to EDI */
4194  add edi, 2 /* 2 column offset from the left edge */
4195  mov eax, columns /* load columns into EAX */
4196  shl eax, 1 /* EAX = columns * 2 */
4197  add edi, eax /* 2 row offset from the top edge */
4198  shr eax, 1 /* EAX = columns */
4199  mov ebx, rows /* initialize ROWS counter */
4200  sub ebx, 4 /* do not use first 2 and last 2 rows */
4201  /* ---, */
4202 L10330:
4203  mov ecx, eax /* initialize COLUMNS counter */
4204  sub ecx, 4 /* do not use first 2 and last 2 columns */
4205  align 16 /* 16 byte alignment of the loop entry */
4206 L10332:
4207  pxor mm7, mm7 /* zero MM7 (accumulator) */
4208  movd mm6, esi /* save ESI in MM6 */
4209  /* --- 1 */
4210  movq mm1, [esi] /* load 8 bytes of the Src */
4211  movq mm2, mm1 /* copy MM1 into MM2 */
4212  add esi, eax /* move Src pointer 1 row below */
4213  movq mm3, [edx] /* load 4 words of Kernel */
4214  add edx, 8 /* move pointer to other 4 words */
4215  movq mm4, [edx] /* load 4 words of Kernel */
4216  add edx, 8 /* move pointer to other 4 words */
4217  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4218  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4219  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4220  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4221  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4222  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4223  /* --- 2 */
4224  movq mm1, [esi] /* load 8 bytes of the Src */
4225  movq mm2, mm1 /* copy MM1 into MM2 */
4226  add esi, eax /* move Src pointer 1 row below */
4227  movq mm3, [edx] /* load 4 words of Kernel */
4228  add edx, 8 /* move pointer to other 4 words */
4229  movq mm4, [edx] /* load 4 words of Kernel */
4230  add edx, 8 /* move pointer to other 4 words */
4231  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4232  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4233  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4234  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4235  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4236  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4237  /* --- 3 */
4238  movq mm1, [esi] /* load 8 bytes of the Src */
4239  movq mm2, mm1 /* copy MM1 into MM2 */
4240  add esi, eax /* move Src pointer 1 row below */
4241  movq mm3, [edx] /* load 4 words of Kernel */
4242  add edx, 8 /* move pointer to other 4 words */
4243  movq mm4, [edx] /* load 4 words of Kernel */
4244  add edx, 8 /* move pointer to other 4 words */
4245  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4246  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4247  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4248  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4249  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4250  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4251  /* --- 4 */
4252  movq mm1, [esi] /* load 8 bytes of the Src */
4253  movq mm2, mm1 /* copy MM1 into MM2 */
4254  add esi, eax /* move Src pointer 1 row below */
4255  movq mm3, [edx] /* load 4 words of Kernel */
4256  add edx, 8 /* move pointer to other 4 words */
4257  movq mm4, [edx] /* load 4 words of Kernel */
4258  add edx, 8 /* move pointer to other 4 words */
4259  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4260  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4261  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4262  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4263  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4264  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4265  /* --- 5 */
4266  movq mm1, [esi] /* load 8 bytes of the Src */
4267  movq mm2, mm1 /* copy MM1 into MM2 */
4268  movq mm3, [edx] /* load 4 words of Kernel */
4269  add edx, 8 /* move pointer to other 4 words */
4270  movq mm4, [edx] /* load 4 words of Kernel */
4271  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4272  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4273  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4274  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4275  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4276  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4277  /* ---, */
4278  movq mm3, mm7 /* copy MM7 into MM3 */
4279  psrlq mm7, 32 /* shift 2 left words to the right */
4280  paddsw mm7, mm3 /* add 2 left and 2 right result words */
4281  movq mm2, mm7 /* copy MM7 into MM2 */
4282  psrlq mm7, 16 /* shift 1 left word to the right */
4283  paddsw mm7, mm2 /* add 1 left and 1 right result words */
4284  /* ---, */
4285  movd mm1, eax /* save EAX in MM1 */
4286  movd mm2, ebx /* save EBX in MM2 */
4287  movd mm3, edx /* save EDX in MM3 */
4288  movd eax, mm7 /* load summation result into EAX */
4289  psraw mm7, 15 /* spread sign bit of the result */
4290  movd ebx, mm5 /* load Divisor into EBX */
4291  movd edx, mm7 /* fill EDX with a sign bit */
4292  idiv bx /* IDIV - VERY EXPENSIVE */
4293  movd mm7, eax /* move result of division into MM7 */
4294  packuswb mm7, mm0 /* pack division result with saturation */
4295  movd eax, mm7 /* copy saturated result into EAX */
4296  mov [edi], al /* copy a byte result into Dest */
4297  movd edx, mm3 /* restore saved EDX */
4298  movd ebx, mm2 /* restore saved EBX */
4299  movd eax, mm1 /* restore saved EAX */
4300  /* --, */
4301  movd esi, mm6 /* move Src pointer to the top pixel */
4302  sub edx, 72 /* EDX = Kernel address */
4303  inc esi /* move Src pointer to the next pixel */
4304  inc edi /* move Dest pointer to the next pixel */
4305  /* ---, */
4306  dec ecx /* decrease loop counter COLUMNS */
4307  jnz L10332 /* check loop termination, proceed if required */
4308  add esi, 4 /* move to the next row in Src */
4309  add edi, 4 /* move to the next row in Dest */
4310  dec ebx /* decrease loop counter ROWS */
4311  jnz L10330 /* check loop termination, proceed if required */
4312  /* ---, */
4313  emms /* exit MMX state */
4314  popa
4315  }
4316 #else
4317  asm volatile
4318  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4319  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4320  "mov %5, %%bl \n\t" /* load Divisor into BL */
4321  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4322  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4323  "mov %1, %%esi \n\t" /* load Src address to ESI */
4324  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4325  "add $2, %%edi \n\t" /* 2 column offset from the left edge */
4326  "mov %3, %%eax \n\t" /* load columns into EAX */
4327  "shl $1, %%eax \n\t" /* EAX = columns * 2 */
4328  "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
4329  "shr $1, %%eax \n\t" /* EAX = columns */
4330  "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4331  "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
4332  /* --- */
4333  ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4334  "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
4335  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4336  ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4337  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4338  /* --- 1 */
4339  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4340  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4341  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4342  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4343  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4344  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4345  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4346  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4347  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4348  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4349  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4350  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4351  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4352  /* --- 2 */
4353  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4354  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4355  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4356  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4357  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4358  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4359  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4360  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4361  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4362  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4363  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4364  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4365  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4366  /* --- 3 */
4367  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4368  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4369  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4370  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4371  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4372  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4373  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4374  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4375  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4376  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4377  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4378  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4379  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4380  /* --- 4 */
4381  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4382  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4383  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4384  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4385  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4386  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4387  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4388  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4389  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4390  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4391  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4392  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4393  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4394  /* --- 5 */
4395  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4396  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4397  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4398  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4399  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4400  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4401  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4402  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4403  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4404  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4405  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4406  /* --- */
4407  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4408  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4409  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4410  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4411  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4412  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4413  /* --- */
4414  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4415  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4416  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4417  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4418  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4419  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4420  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4421  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4422  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4423  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4424  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4425  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4426  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4427  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4428  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4429  /* -- */
4430  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4431  "sub $72, %%edx \n\t" /* EDX = Kernel address */
4432  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4433  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4434  /* --- */
4435  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4436  "jnz .L10332 \n\t" /* check loop termination, proceed if required */
4437  "add $4, %%esi \n\t" /* move to the next row in Src */
4438  "add $4, %%edi \n\t" /* move to the next row in Dest */
4439  "dec %%ebx \n\t" /* decrease loop counter ROWS */
4440  "jnz .L10330 \n\t" /* check loop termination, proceed if required */
4441  /* --- */
4442  "emms \n\t" /* exit MMX state */
4443  "popa \n\t":"=m" (Dest) /* %0 */
4444  :"m"(Src), /* %1 */
4445  "m"(rows), /* %2 */
4446  "m"(columns), /* %3 */
4447  "m"(Kernel), /* %4 */
4448  "m"(Divisor) /* %5 */
4449  );
4450 #endif
4451 #endif
4452  return (0);
4453  } else {
4454  /* No non-MMX implementation yet */
4455  return (-1);
4456  }
4457 }
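
Each kernel row above is consumed as two movq loads (8 int16), and the 72-byte rewind (five 16-byte rows minus the final un-advanced load) implies a 5x5 kernel is stored as 5 rows of 8 int16 with the last three entries of each row as zero padding. A hedged layout sketch under that assumption:

    /* Hedged sketch: 5x5 averaging kernel for the routine above.
       Layout inferred from the assembly: 5 rows x 8 int16; entries 5..7 of
       each row must be zero so the padded taps do not contribute. */
    static signed short box5x5[5 * 8] = {
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0
    };
    /* e.g.: SDL_imageFilterConvolveKernel5x5Divide(src, dst, rows, columns, box5x5, 25); */
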
4458 
4473 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4474  signed short *Kernel, unsigned char Divisor)
4475 {
4476  /* Validate input parameters */
4477  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4478  return(-1);
4479 
4480  if ((columns < 7) || (rows < 7) || (Divisor == 0))
4481  return (-1);
4482 
4483  if ((SDL_imageFilterMMXdetect())) {
4484 //#ifdef USE_MMX
4485 #if defined(USE_MMX) && defined(i386)
4486 #if !defined(GCC__)
4487  __asm
4488  {
4489  pusha
4490  pxor mm0, mm0 /* zero MM0 */
4491  xor ebx, ebx /* zero EBX */
4492  mov bl, Divisor /* load Divisor into BL */
4493  movd mm5, ebx /* copy Divisor into MM5 */
4494  mov edx, Kernel /* load Kernel address into EDX */
4495  mov esi, Src /* load Src address to ESI */
4496  mov edi, Dest /* load Dest address to EDI */
4497  add edi, 3 /* 3 column offset from the left edge */
4498  mov eax, columns /* load columns into EAX */
4499  add edi, eax /* 3 row offset from the top edge */
4500  add edi, eax
4501  add edi, eax
4502  mov ebx, rows /* initialize ROWS counter */
4503  sub ebx, 6 /* do not use first 3 and last 3 rows */
4504  /* ---, */
4505 L10340:
4506  mov ecx, eax /* initialize COLUMNS counter */
4507  sub ecx, 6 /* do not use first 3 and last 3 columns */
4508  align 16 /* 16 byte alignment of the loop entry */
4509 L10342:
4510  pxor mm7, mm7 /* zero MM7 (accumulator) */
4511  movd mm6, esi /* save ESI in MM6 */
4512  /* --- 1 */
4513  movq mm1, [esi] /* load 8 bytes of the Src */
4514  movq mm2, mm1 /* copy MM1 into MM2 */
4515  add esi, eax /* move Src pointer 1 row below */
4516  movq mm3, [edx] /* load 4 words of Kernel */
4517  add edx, 8 /* move pointer to other 4 words */
4518  movq mm4, [edx] /* load 4 words of Kernel */
4519  add edx, 8 /* move pointer to other 4 words */
4520  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4521  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4522  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4523  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4524  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4525  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4526  /* --- 2 */
4527  movq mm1, [esi] /* load 8 bytes of the Src */
4528  movq mm2, mm1 /* copy MM1 into MM2 */
4529  add esi, eax /* move Src pointer 1 row below */
4530  movq mm3, [edx] /* load 4 words of Kernel */
4531  add edx, 8 /* move pointer to other 4 words */
4532  movq mm4, [edx] /* load 4 words of Kernel */
4533  add edx, 8 /* move pointer to other 4 words */
4534  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4535  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4536  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4537  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4538  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4539  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4540  /* --- 3 */
4541  movq mm1, [esi] /* load 8 bytes of the Src */
4542  movq mm2, mm1 /* copy MM1 into MM2 */
4543  add esi, eax /* move Src pointer 1 row below */
4544  movq mm3, [edx] /* load 4 words of Kernel */
4545  add edx, 8 /* move pointer to other 4 words */
4546  movq mm4, [edx] /* load 4 words of Kernel */
4547  add edx, 8 /* move pointer to other 4 words */
4548  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4549  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4550  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4551  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4552  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4553  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4554  /* --- 4 */
4555  movq mm1, [esi] /* load 8 bytes of the Src */
4556  movq mm2, mm1 /* copy MM1 into MM2 */
4557  add esi, eax /* move Src pointer 1 row below */
4558  movq mm3, [edx] /* load 4 words of Kernel */
4559  add edx, 8 /* move pointer to other 4 words */
4560  movq mm4, [edx] /* load 4 words of Kernel */
4561  add edx, 8 /* move pointer to other 4 words */
4562  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4563  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4564  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4565  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4566  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4567  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4568  /* --- 5 */
4569  movq mm1, [esi] /* load 8 bytes of the Src */
4570  movq mm2, mm1 /* copy MM1 into MM2 */
4571  add esi, eax /* move Src pointer 1 row below */
4572  movq mm3, [edx] /* load 4 words of Kernel */
4573  add edx, 8 /* move pointer to other 4 words */
4574  movq mm4, [edx] /* load 4 words of Kernel */
4575  add edx, 8 /* move pointer to other 4 words */
4576  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4577  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4578  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4579  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4580  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4581  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4582  /* --- 6 */
4583  movq mm1, [esi] /* load 8 bytes of the Src */
4584  movq mm2, mm1 /* copy MM1 into MM2 */
4585  add esi, eax /* move Src pointer 1 row below */
4586  movq mm3, [edx] /* load 4 words of Kernel */
4587  add edx, 8 /* move pointer to other 4 words */
4588  movq mm4, [edx] /* load 4 words of Kernel */
4589  add edx, 8 /* move pointer to other 4 words */
4590  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4591  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4592  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4593  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4594  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4595  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4596  /* --- 7 */
4597  movq mm1, [esi] /* load 8 bytes of the Src */
4598  movq mm2, mm1 /* copy MM1 into MM2 */
4599  movq mm3, [edx] /* load 4 words of Kernel */
4600  add edx, 8 /* move pointer to other 4 words */
4601  movq mm4, [edx] /* load 4 words of Kernel */
4602  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4603  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4604  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4605  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4606  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4607  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4608  /* ---, */
4609  movq mm3, mm7 /* copy MM7 into MM3 */
4610  psrlq mm7, 32 /* shift 2 left words to the right */
4611  paddsw mm7, mm3 /* add 2 left and 2 right result words */
4612  movq mm2, mm7 /* copy MM7 into MM2 */
4613  psrlq mm7, 16 /* shift 1 left word to the right */
4614  paddsw mm7, mm2 /* add 1 left and 1 right result words */
4615  /* ---, */
4616  movd mm1, eax /* save EAX in MM1 */
4617  movd mm2, ebx /* save EBX in MM2 */
4618  movd mm3, edx /* save EDX in MM3 */
4619  movd eax, mm7 /* load summation result into EAX */
4620  psraw mm7, 15 /* spread sign bit of the result */
4621  movd ebx, mm5 /* load Divisor into EBX */
4622  movd edx, mm7 /* fill EDX with a sign bit */
4623  idiv bx /* IDIV - VERY EXPENSIVE */
4624  movd mm7, eax /* move result of division into MM7 */
4625  packuswb mm7, mm0 /* pack division result with saturation */
4626  movd eax, mm7 /* copy saturated result into EAX */
4627  mov [edi], al /* copy a byte result into Dest */
4628  movd edx, mm3 /* restore saved EDX */
4629  movd ebx, mm2 /* restore saved EBX */
4630  movd eax, mm1 /* restore saved EAX */
4631  /* --, */
4632  movd esi, mm6 /* move Src pointer to the top pixel */
4633  sub edx, 104 /* EDX = Kernel address */
4634  inc esi /* move Src pointer to the next pixel */
4635  inc edi /* move Dest pointer to the next pixel */
4636  /* ---, */
4637  dec ecx /* decrease loop counter COLUMNS */
4638  jnz L10342 /* check loop termination, proceed if required */
4639  add esi, 6 /* move to the next row in Src */
4640  add edi, 6 /* move to the next row in Dest */
4641  dec ebx /* decrease loop counter ROWS */
4642  jnz L10340 /* check loop termination, proceed if required */
4643  /* ---, */
4644  emms /* exit MMX state */
4645  popa
4646  }
4647 #else
4648  asm volatile
4649  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4650  "xor %%ebx, %%ebx \n\t" /* zero EBX */
4651  "mov %5, %%bl \n\t" /* load Divisor into BL */
4652  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4653  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4654  "mov %1, %%esi \n\t" /* load Src address to ESI */
4655  "mov %0, %%edi \n\t" /* load Dest address to EDI */
4656  "add $3, %%edi \n\t" /* 3 column offset from the left edge */
4657  "mov %3, %%eax \n\t" /* load columns into EAX */
4658  "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
4659  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4660  "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
4661  /* --- */
4662  ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4663  "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
4664  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4665  ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4666  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4667  /* --- 1 */
4668  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4669  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4670  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4671  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4672  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4673  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4674  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4675  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4676  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4677  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4678  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4679  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4680  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4681  /* --- 2 */
4682  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4683  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4684  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4685  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4686  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4687  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4688  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4689  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4690  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4691  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4692  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4693  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4694  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4695  /* --- 3 */
4696  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4697  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4698  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4699  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4700  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4701  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4702  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4703  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4704  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4705  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4706  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4707  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4708  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4709  /* --- 4 */
4710  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4711  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4712  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4713  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4714  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4715  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4716  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4717  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4718  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4719  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4720  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4721  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4722  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4723  /* --- 5 */
4724  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4725  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4726  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4727  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4728  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4729  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4730  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4731  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4732  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4733  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4734  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4735  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4736  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4737  /* --- 6 */
4738  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4739  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4740  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4741  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4742  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4743  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4744  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4745  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4746  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4747  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4748  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4749  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4750  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4751  /* --- 7 */
4752  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4753  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4754  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4755  "add $8, %%edx \n\t" /* move pointer to other 4 words */
4756  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4757  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4758  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4759  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4760  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4761  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4762  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4763  /* --- */
4764  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4765  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4766  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4767  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4768  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4769  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4770  /* --- */
4771  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4772  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4773  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4774  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4775  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4776  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4777  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4778  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4779  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4780  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4781  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4782  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4783  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4784  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4785  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4786  /* -- */
4787  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4788  "sub $104, %%edx \n\t" /* EDX = Kernel address */
4789  "inc %%esi \n\t" /* move Src pointer to the next pixel */
4790  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4791  /* --- */
4792  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4793  "jnz .L10342 \n\t" /* check loop termination, proceed if required */
4794  "add $6, %%esi \n\t" /* move to the next row in Src */
4795  "add $6, %%edi \n\t" /* move to the next row in Dest */
4796  "dec %%ebx \n\t" /* decrease loop counter ROWS */
4797  "jnz .L10340 \n\t" /* check loop termination, proceed if required */
4798  /* --- */
4799  "emms \n\t" /* exit MMX state */
4800  "popa \n\t":"=m" (Dest) /* %0 */
4801  :"m"(Src), /* %1 */
4802  "m"(rows), /* %2 */
4803  "m"(columns), /* %3 */
4804  "m"(Kernel), /* %4 */
4805  "m"(Divisor) /* %5 */
4806  );
4807 #endif
4808 #endif
4809  return (0);
4810  } else {
4811  /* No non-MMX implementation yet */
4812  return (-1);
4813  }
4814 }
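
As with the 5x5 case, the 104-byte rewind (seven 16-byte rows minus the final un-advanced load) suggests a 7x7 kernel occupies 7 rows of 8 int16 with one zero-padding entry per row, and the Dest setup (3-column plus 3-row offset) leaves a 3-pixel border untouched. A small hypothetical helper for filling such a padded kernel:

    /* Hypothetical helper: set tap (r, c) of a 7x7 kernel whose rows appear
       to be padded to 8 int16 entries; the 8th entry of each row stays zero. */
    static void set_7x7_tap(signed short *kernel, int r, int c, signed short v)
    {
        kernel[r * 8 + c] = v;   /* 0 <= r < 7, 0 <= c < 7 */
    }
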
4815 
4830 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4831  signed short *Kernel, unsigned char Divisor)
4832 {
4833  /* Validate input parameters */
4834  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4835  return(-1);
4836 
4837  if ((columns < 9) || (rows < 9) || (Divisor == 0))
4838  return (-1);
4839 
4840  if ((SDL_imageFilterMMXdetect())) {
4841 //#ifdef USE_MMX
4842 #if defined(USE_MMX) && defined(i386)
4843 #if !defined(GCC__)
4844  __asm
4845  {
4846  pusha
4847  pxor mm0, mm0 /* zero MM0 */
4848  xor ebx, ebx /* zero EBX */
4849  mov bl, Divisor /* load Divisor into BL */
4850  movd mm5, ebx /* copy Divisor into MM5 */
4851  mov edx, Kernel /* load Kernel address into EDX */
4852  mov esi, Src /* load Src address to ESI */
4853  mov edi, Dest /* load Dest address to EDI */
4854  add edi, 4 /* 4 column offset from the left edge */
4855  mov eax, columns /* load columns into EAX */
4856  add edi, eax /* 4 row offset from the top edge */
4857  add edi, eax
4858  add edi, eax
4859  add edi, eax
4860  mov ebx, rows /* initialize ROWS counter */
4861  sub ebx, 8 /* do not use first 4 and last 4 rows */
4862  /* ---, */
4863 L10350:
4864  mov ecx, eax /* initialize COLUMNS counter */
4865  sub ecx, 8 /* do not use first 4 and last 4 columns */
4866  align 16 /* 16 byte alignment of the loop entry */
4867 L10352:
4868  pxor mm7, mm7 /* zero MM7 (accumulator) */
4869  movd mm6, esi /* save ESI in MM6 */
4870  /* --- 1 */
4871  movq mm1, [esi] /* load 8 bytes of the Src */
4872  movq mm2, mm1 /* copy MM1 into MM2 */
4873  inc esi /* move pointer to the next 8 bytes of Src */
4874  movq mm3, [edx] /* load 4 words of Kernel */
4875  add edx, 8 /* move pointer to other 4 words */
4876  movq mm4, [edx] /* load 4 words of Kernel */
4877  add edx, 8 /* move pointer to other 4 words */
4878  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4879  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4880  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4881  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4882  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4883  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4884  movq mm1, [esi] /* load 8 bytes of the Src */
4885  dec esi
4886  add esi, eax /* move Src pointer 1 row below */
4887  movq mm3, [edx] /* load 4 words of Kernel */
4888  add edx, 8 /* move pointer to other 4 words */
4889  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4890  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4891  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4892  /* --- 2 */
4893  movq mm1, [esi] /* load 8 bytes of the Src */
4894  movq mm2, mm1 /* copy MM1 into MM2 */
4895  inc esi /* move pointer to the next 8 bytes of Src */
4896  movq mm3, [edx] /* load 4 words of Kernel */
4897  add edx, 8 /* move pointer to other 4 words */
4898  movq mm4, [edx] /* load 4 words of Kernel */
4899  add edx, 8 /* move pointer to other 4 words */
4900  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4901  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4902  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4903  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4904  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4905  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4906  movq mm1, [esi] /* load 8 bytes of the Src */
4907  dec esi
4908  add esi, eax /* move Src pointer 1 row below */
4909  movq mm3, [edx] /* load 4 words of Kernel */
4910  add edx, 8 /* move pointer to other 4 words */
4911  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4912  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4913  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4914  /* --- 3 */
4915  movq mm1, [esi] /* load 8 bytes of the Src */
4916  movq mm2, mm1 /* copy MM1 into MM2 */
4917  inc esi /* move pointer to the next 8 bytes of Src */
4918  movq mm3, [edx] /* load 4 words of Kernel */
4919  add edx, 8 /* move pointer to other 4 words */
4920  movq mm4, [edx] /* load 4 words of Kernel */
4921  add edx, 8 /* move pointer to other 4 words */
4922  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4923  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4924  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4925  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4926  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4927  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4928  movq mm1, [esi] /* load 8 bytes of the Src */
4929  dec esi
4930  add esi, eax /* move Src pointer 1 row below */
4931  movq mm3, [edx] /* load 4 words of Kernel */
4932  add edx, 8 /* move pointer to other 4 words */
4933  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4934  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4935  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4936  /* --- 4 */
4937  movq mm1, [esi] /* load 8 bytes of the Src */
4938  movq mm2, mm1 /* copy MM1 into MM2 */
4939  inc esi /* move pointer to the next 8 bytes of Src */
4940  movq mm3, [edx] /* load 4 words of Kernel */
4941  add edx, 8 /* move pointer to other 4 words */
4942  movq mm4, [edx] /* load 4 words of Kernel */
4943  add edx, 8 /* move pointer to other 4 words */
4944  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4945  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4946  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4947  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4948  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4949  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4950  movq mm1, [esi] /* load 8 bytes of the Src */
4951  dec esi
4952  add esi, eax /* move Src pointer 1 row below */
4953  movq mm3, [edx] /* load 4 words of Kernel */
4954  add edx, 8 /* move pointer to other 4 words */
4955  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4956  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4957  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4958  /* --- 5 */
4959  movq mm1, [esi] /* load 8 bytes of the Src */
4960  movq mm2, mm1 /* copy MM1 into MM2 */
4961  inc esi /* move pointer to the next 8 bytes of Src */
4962  movq mm3, [edx] /* load 4 words of Kernel */
4963  add edx, 8 /* move pointer to other 4 words */
4964  movq mm4, [edx] /* load 4 words of Kernel */
4965  add edx, 8 /* move pointer to other 4 words */
4966  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4967  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4968  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4969  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4970  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4971  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4972  movq mm1, [esi] /* load 8 bytes of the Src */
4973  dec esi
4974  add esi, eax /* move Src pointer 1 row below */
4975  movq mm3, [edx] /* load 4 words of Kernel */
4976  add edx, 8 /* move pointer to other 4 words */
4977  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4978  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4979  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4980  /* --- 6 */
4981  movq mm1, [esi] /* load 8 bytes of the Src */
4982  movq mm2, mm1 /* copy MM1 into MM2 */
4983  inc esi /* move pointer to the next 8 bytes of Src */
4984  movq mm3, [edx] /* load 4 words of Kernel */
4985  add edx, 8 /* move pointer to other 4 words */
4986  movq mm4, [edx] /* load 4 words of Kernel */
4987  add edx, 8 /* move pointer to other 4 words */
4988  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4989  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4990  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4991  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4992  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4993  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4994  movq mm1, [esi] /* load 8 bytes of the Src */
4995  dec esi
4996  add esi, eax /* move Src pointer 1 row below */
4997  movq mm3, [edx] /* load 4 words of Kernel */
4998  add edx, 8 /* move pointer to other 4 words */
4999  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5000  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5001  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5002  /* --- 7 */
5003  movq mm1, [esi] /* load 8 bytes of the Src */
5004  movq mm2, mm1 /* copy MM1 into MM2 */
5005  inc esi /* move pointer to the next 8 bytes of Src */
5006  movq mm3, [edx] /* load 4 words of Kernel */
5007  add edx, 8 /* move pointer to other 4 words */
5008  movq mm4, [edx] /* load 4 words of Kernel */
5009  add edx, 8 /* move pointer to other 4 words */
5010  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5011  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5012  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5013  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5014  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5015  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5016  movq mm1, [esi] /* load 8 bytes of the Src */
5017  dec esi
5018  add esi, eax /* move Src pointer 1 row below */
5019  movq mm3, [edx] /* load 4 words of Kernel */
5020  add edx, 8 /* move pointer to other 4 words */
5021  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5022  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5023  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5024  /* --- 8 */
5025  movq mm1, [esi] /* load 8 bytes of the Src */
5026  movq mm2, mm1 /* copy MM1 into MM2 */
5027  inc esi /* move pointer to the next 8 bytes of Src */
5028  movq mm3, [edx] /* load 4 words of Kernel */
5029  add edx, 8 /* move pointer to other 4 words */
5030  movq mm4, [edx] /* load 4 words of Kernel */
5031  add edx, 8 /* move pointer to other 4 words */
5032  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5033  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5034  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5035  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5036  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5037  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5038  movq mm1, [esi] /* load 8 bytes of the Src */
5039  dec esi
5040  add esi, eax /* move Src pointer 1 row below */
5041  movq mm3, [edx] /* load 4 words of Kernel */
5042  add edx, 8 /* move pointer to other 4 words */
5043  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5044  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5045  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5046  /* --- 9 */
5047  movq mm1, [esi] /* load 8 bytes of the Src */
5048  movq mm2, mm1 /* copy MM1 into MM2 */
5049  inc esi /* move pointer to the next 8 bytes of Src */
5050  movq mm3, [edx] /* load 4 words of Kernel */
5051  add edx, 8 /* move pointer to other 4 words */
5052  movq mm4, [edx] /* load 4 words of Kernel */
5053  add edx, 8 /* move pointer to other 4 words */
5054  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5055  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5056  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5057  pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5058  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5059  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5060  movq mm1, [esi] /* load 8 bytes of the Src */
5061  movq mm3, [edx] /* load 4 words of Kernel */
5062  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5063  pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5064  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5065  /* ---, */
5066  movq mm3, mm7 /* copy MM7 into MM3 */
5067  psrlq mm7, 32 /* shift 2 left words to the right */
5068  paddsw mm7, mm3 /* add 2 left and 2 right result words */
5069  movq mm2, mm7 /* copy MM7 into MM2 */
5070  psrlq mm7, 16 /* shift 1 left word to the right */
5071  paddsw mm7, mm2 /* add 1 left and 1 right result words */
5072  /* ---, */
5073  movd mm1, eax /* save EAX in MM1 */
5074  movd mm2, ebx /* save EBX in MM2 */
5075  movd mm3, edx /* save EDX in MM3 */
5076  movd eax, mm7 /* load summation result into EAX */
5077  psraw mm7, 15 /* spread sign bit of the result */
5078  movd ebx, mm5 /* load Divisor into EBX */
5079  movd edx, mm7 /* fill EDX with a sign bit */
5080  idiv bx /* IDIV - VERY EXPENSIVE */
5081  movd mm7, eax /* move result of division into MM7 */
5082  packuswb mm7, mm0 /* pack division result with saturation */
5083  movd eax, mm7 /* copy saturated result into EAX */
5084  mov [edi], al /* copy a byte result into Dest */
5085  movd edx, mm3 /* restore saved EDX */
5086  movd ebx, mm2 /* restore saved EBX */
5087  movd eax, mm1 /* restore saved EAX */
5088  /* --, */
5089  movd esi, mm6 /* move Src pointer to the top pixel */
5090  sub edx, 208 /* EDX = Kernel address */
5091  inc esi /* move Src pointer to the next pixel */
5092  inc edi /* move Dest pointer to the next pixel */
5093  /* ---, */
5094  dec ecx /* decrease loop counter COLUMNS */
5095  jnz L10352 /* check loop termination, proceed if required */
5096  add esi, 8 /* move to the next row in Src */
5097  add edi, 8 /* move to the next row in Dest */
5098  dec ebx /* decrease loop counter ROWS */
5099  jnz L10350 /* check loop termination, proceed if required */
5100  /* ---, */
5101  emms /* exit MMX state */
5102  popa
5103  }
5104 #else
5105  asm volatile
5106  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5107  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5108  "mov %5, %%bl \n\t" /* load Divisor into BL */
5109  "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
5110  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5111  "mov %1, %%esi \n\t" /* load Src address to ESI */
5112  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5113  "add $4, %%edi \n\t" /* 4 column offset from the left edge */
5114  "mov %3, %%eax \n\t" /* load columns into EAX */
5115  "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
5116  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5117  "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
5118  /* --- */
5119  ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5120  "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
5121  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5122  ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5123  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5124  /* --- 1 */
5125  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5126  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5127  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5128  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5129  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5130  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5131  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5132  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5133  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5134  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5135  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5136  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5137  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5138  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5139  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5140  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5141  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5142  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5143  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5144  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5145  /* --- 2 */
5146  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5147  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5148  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5149  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5150  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5151  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5152  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5153  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5154  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5155  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5156  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5157  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5158  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5159  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5160  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5161  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5162  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5163  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5164  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5165  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5166  /* --- 3 */
5167  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5168  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5169  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5170  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5171  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5172  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5173  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5174  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5175  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5176  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5177  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5178  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5179  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5180  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5181  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5182  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5183  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5184  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5185  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5186  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5187  /* --- 4 */
5188  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5189  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5190  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5191  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5192  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5193  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5194  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5195  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5196  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5197  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5198  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5199  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5200  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5201  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5202  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5203  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5204  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5205  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5206  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5207  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5208  /* --- 5 */
5209  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5210  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5211  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5212  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5213  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5214  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5215  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5216  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5217  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5218  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5219  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5220  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5221  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5222  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5223  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5224  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5225  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5226  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5227  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5228  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5229  /* --- 6 */
5230  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5231  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5232  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5233  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5234  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5235  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5236  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5237  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5238  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5239  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5240  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5241  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5242  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5243  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5244  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5245  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5246  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5247  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5248  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5249  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5250  /* --- 7 */
5251  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5252  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5253  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5254  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5255  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5256  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5257  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5258  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5259  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5260  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5261  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5262  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5263  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5264  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5265  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5266  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5267  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5268  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5269  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5270  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5271  /* --- 8 */
5272  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5273  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5274  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5275  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5276  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5277  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5278  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5279  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5280  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5281  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5282  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5283  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5284  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5285  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5286  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5287  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5288  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5289  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5290  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5291  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5292  /* --- 9 */
5293  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5294  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5295  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5296  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5297  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5298  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5299  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5300  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5301  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5302  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5303  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5304  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5305  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5306  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5307  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5308  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5309  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5310  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5311  /* --- */
5312  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5313  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5314  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5315  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5316  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5317  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5318  /* --- */
5319  "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
5320  "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
5321  "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
5322  "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
5323  "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
5324  "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
5325  "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
5326  "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
5327  "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
5328  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5329  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5330  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5331  "movd %%mm3, %%edx \n\t" /* restore saved EDX */
5332  "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
5333  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5334  /* -- */
5335  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5336  "sub $208, %%edx \n\t" /* EDX = Kernel address */
5337  "inc %%esi \n\t" /* move Src pointer to the next pixel */
5338  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5339  /* --- */
5340  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5341  "jnz .L10352 \n\t" /* check loop termination, proceed if required */
5342  "add $8, %%esi \n\t" /* move to the next row in Src */
5343  "add $8, %%edi \n\t" /* move to the next row in Dest */
5344  "dec %%ebx \n\t" /* decrease loop counter ROWS */
5345  "jnz .L10350 \n\t" /* check loop termination, proceed if required */
5346  /* --- */
5347  "emms \n\t" /* exit MMX state */
5348  "popa \n\t":"=m" (Dest) /* %0 */
5349  :"m"(Src), /* %1 */
5350  "m"(rows), /* %2 */
5351  "m"(columns), /* %3 */
5352  "m"(Kernel), /* %4 */
5353  "m"(Divisor) /* %5 */
5354  );
5355 #endif
5356 #endif
5357  return (0);
5358  } else {
5359  /* No non-MMX implementation yet */
5360  return (-1);
5361  }
5362 }
5363 
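The ShiftRight variants that follow trade the per-pixel IDIV used above (flagged "VERY EXPENSIVE" in the comments) for a cheap pre-shift: every unpacked source word is shifted right by NRightShift bits before it is multiplied by the kernel, so no division of the accumulated sum is needed. A minimal scalar sketch of the two normalisation schemes (an illustration only, not part of the library, and simplified: the MMX code additionally saturates the intermediate word sums with paddsw):

static unsigned char clamp_to_byte(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Divide variant: accumulate first, divide the sum once. */
static unsigned char convolve_divide(const unsigned char *src, const signed short *k,
                                     int n, int divisor)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += k[i] * src[i];
    return clamp_to_byte(sum / divisor);
}

/* ShiftRight variant: pre-shift each sample, no division at the end. */
static unsigned char convolve_shift(const unsigned char *src, const signed short *k,
                                    int n, unsigned char nrightshift)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += k[i] * (src[i] >> nrightshift);
    return clamp_to_byte(sum);
}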
5378 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5379  signed short *Kernel, unsigned char NRightShift)
5380 {
5381  /* Validate input parameters */
5382  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5383  return(-1);
5384 
5385  if ((columns < 3) || (rows < 3) || (NRightShift > 7))
5386  return (-1);
5387 
5388  if ((SDL_imageFilterMMXdetect())) {
5389 //#ifdef USE_MMX
5390 #if defined(USE_MMX) && defined(i386)
5391 #if !defined(GCC__)
5392  __asm
5393  {
5394  pusha
5395  pxor mm0, mm0 /* zero MM0 */
5396  xor ebx, ebx /* zero EBX */
5397  mov bl, NRightShift /* load NRightShift into BL */
5398  movd mm4, ebx /* copy NRightShift into MM4 */
5399  mov edx, Kernel /* load Kernel address into EDX */
5400  movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
5401  add edx, 8 /* second row |K0 K1 K2 0| */
5402  movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5403  add edx, 8 /* third row |K6 K7 K8 0| */
5404  movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
5405  /* ---, */
5406  mov eax, columns /* load columns into EAX */
5407  mov esi, Src /* ESI = Src row 0 address */
5408  mov edi, Dest /* load Dest address to EDI */
5409  add edi, eax /* EDI = EDI + columns */
5410  inc edi /* 1 byte offset from the left edge */
5411  mov edx, rows /* initialize ROWS counter */
5412  sub edx, 2 /* do not use first and last row */
5413  /* ---, */
5414 L10360:
5415  mov ecx, eax /* initialize COLUMNS counter */
5416  sub ecx, 2 /* do not use first and last column */
5417  align 16 /* 16 byte alignment of the loop entry */
5418 L10362:
5419  /* ---, */
5420  movq mm1, [esi] /* load 8 bytes of the image first row */
5421  add esi, eax /* move one row below */
5422  movq mm2, [esi] /* load 8 bytes of the image second row */
5423  add esi, eax /* move one row below */
5424  movq mm3, [esi] /* load 8 bytes of the image third row */
5425  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5426  punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
5427  punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
5428  psrlw mm1, mm4 /* shift right each pixel NshiftRight times */
5429  psrlw mm2, mm4 /* shift right each pixel NshiftRight times */
5430  psrlw mm3, mm4 /* shift right each pixel NshiftRight times */
5431  pmullw mm1, mm5 /* multiply words first row image*Kernel */
5432  pmullw mm2, mm6 /* multiply words second row image*Kernel */
5433  pmullw mm3, mm7 /* multiply words third row image*Kernel */
5434  paddsw mm1, mm2 /* add 4 words of the first and second rows */
5435  paddsw mm1, mm3 /* add 4 words of the third row and result */
5436  movq mm2, mm1 /* copy MM1 into MM2 */
5437  psrlq mm1, 32 /* shift 2 left words to the right */
5438  paddsw mm1, mm2 /* add 2 left and 2 right result words */
5439  movq mm3, mm1 /* copy MM1 into MM3 */
5440  psrlq mm1, 16 /* shift 1 left word to the right */
5441  paddsw mm1, mm3 /* add 1 left and 1 right result words */
5442  packuswb mm1, mm0 /* pack shift result with saturation */
5443  movd ebx, mm1 /* copy saturated result into EBX */
5444  mov [edi], bl /* copy a byte result into Dest */
5445  /* --, */
5446  sub esi, eax /* move two rows up */
5447  sub esi, eax
5448  inc esi /* move Src pointer to the next pixel */
5449  inc edi /* move Dest pointer to the next pixel */
5450  /* ---, */
5451  dec ecx /* decrease loop counter COLUMNS */
5452  jnz L10362 /* check loop termination, proceed if required */
5453  add esi, 2 /* move to the next row in Src */
5454  add edi, 2 /* move to the next row in Dest */
5455  dec edx /* decrease loop counter ROWS */
5456  jnz L10360 /* check loop termination, proceed if required */
5457  /* ---, */
5458  emms /* exit MMX state */
5459  popa
5460  }
5461 #else
5462  asm volatile
5463  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5464  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5465  "mov %5, %%bl \n\t" /* load NRightShift into BL */
5466  "movd %%ebx, %%mm4 \n\t" /* copy NRightShift into MM4 */
5467  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5468  "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
5469  "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
5470  "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5471  "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
5472  "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
5473  /* --- */
5474  "mov %3, %%eax \n\t" /* load columns into EAX */
5475  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
5476  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5477  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
5478  "inc %%edi \n\t" /* 1 byte offset from the left edge */
5479  "mov %2, %%edx \n\t" /* initialize ROWS counter */
5480  "sub $2, %%edx \n\t" /* do not use first and last row */
5481  /* --- */
5482  ".L10360: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
5483  "sub $2, %%ecx \n\t" /* do not use first and last column */
5484  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5485  ".L10362: \n\t"
5486  /* --- */
5487  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
5488  "add %%eax, %%esi \n\t" /* move one row below */
5489  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
5490  "add %%eax, %%esi \n\t" /* move one row below */
5491  "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
5492  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5493  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
5494  "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
5495  "psrlw %%mm4, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5496  "psrlw %%mm4, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5497  "psrlw %%mm4, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
5498  "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
5499  "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
5500  "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
5501  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
5502  "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
5503  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5504  "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
5505  "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
5506  "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
5507  "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
5508  "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
5509  "packuswb %%mm0, %%mm1 \n\t" /* pack shift result with saturation */
5510  "movd %%mm1, %%ebx \n\t" /* copy saturated result into EBX */
5511  "mov %%bl, (%%edi) \n\t" /* copy a byte result into Dest */
5512  /* -- */
5513  "sub %%eax, %%esi \n\t" /* move two rows up */
5514  "sub %%eax, %%esi \n\t" "inc %%esi \n\t" /* move Src pointer to the next pixel */
5515  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5516  /* --- */
5517  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5518  "jnz .L10362 \n\t" /* check loop termination, proceed if required */
5519  "add $2, %%esi \n\t" /* move to the next row in Src */
5520  "add $2, %%edi \n\t" /* move to the next row in Dest */
5521  "dec %%edx \n\t" /* decrease loop counter ROWS */
5522  "jnz .L10360 \n\t" /* check loop termination, proceed if required */
5523  /* --- */
5524  "emms \n\t" /* exit MMX state */
5525  "popa \n\t":"=m" (Dest) /* %0 */
5526  :"m"(Src), /* %1 */
5527  "m"(rows), /* %2 */
5528  "m"(columns), /* %3 */
5529  "m"(Kernel), /* %4 */
5530  "m"(NRightShift) /* %5 */
5531  );
5532 #endif
5533 #endif
5534  return (0);
5535  } else {
5536  /* No non-MMX implementation yet */
5537  return (-1);
5538  }
5539 }
5540 
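A hedged usage sketch for the 3x3 variant above. From the assembly, each kernel row occupies one quadword: three signed 16-bit coefficients followed by one unused padding word (memory order |K0 K1 K2 0|), i.e. 12 shorts in total. An all-ones kernel with NRightShift = 3 approximates a box blur: each sample is pre-divided by 8, so the sum of nine samples is about 9/8 of the true average, clamped to 255. The grayscale buffer layout is an assumption made for the example.

#include "SDL2_imageFilter.h"

/* Approximate 3x3 box blur on an 8-bit image of rows x columns bytes.
   Kernel layout derived from the MMX code: 3 rows of {K0, K1, K2, pad}. */
static int blur3x3(unsigned char *src, unsigned char *dst, int rows, int columns)
{
    signed short kernel[12] = {
        1, 1, 1, 0,
        1, 1, 1, 0,
        1, 1, 1, 0
    };
    /* NRightShift = 3: every sample is divided by 8 before summation. */
    return SDL_imageFilterConvolveKernel3x3ShiftRight(src, dst, rows, columns, kernel, 3);
}

The call returns 0 on success and -1 when MMX is unavailable or the parameters are invalid; note that the outermost one-pixel border of dst is never written by the routine.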
5555 int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5556  signed short *Kernel, unsigned char NRightShift)
5557 {
5558  /* Validate input parameters */
5559  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5560  return(-1);
5561 
5562  if ((columns < 5) || (rows < 5) || (NRightShift > 7))
5563  return (-1);
5564 
5565  if ((SDL_imageFilterMMXdetect())) {
5566 //#ifdef USE_MMX
5567 #if defined(USE_MMX) && defined(i386)
5568 #if !defined(GCC__)
5569  __asm
5570  {
5571  pusha
5572  pxor mm0, mm0 /* zero MM0 */
5573  xor ebx, ebx /* zero EBX */
5574  mov bl, NRightShift /* load NRightShift into BL */
5575  movd mm5, ebx /* copy NRightShift into MM5 */
5576  mov edx, Kernel /* load Kernel address into EDX */
5577  mov esi, Src /* load Src address to ESI */
5578  mov edi, Dest /* load Dest address to EDI */
5579  add edi, 2 /* 2 column offset from the left edge */
5580  mov eax, columns /* load columns into EAX */
5581  shl eax, 1 /* EAX = columns * 2 */
5582  add edi, eax /* 2 row offset from the top edge */
5583  shr eax, 1 /* EAX = columns */
5584  mov ebx, rows /* initialize ROWS counter */
5585  sub ebx, 4 /* do not use first 2 and last 2 rows */
5586  /* ---, */
5587 L10370:
5588  mov ecx, eax /* initialize COLUMNS counter */
5589  sub ecx, 4 /* do not use first 2 and last 2 columns */
5590  align 16 /* 16 byte alignment of the loop entry */
5591 L10372:
5592  pxor mm7, mm7 /* zero MM7 (accumulator) */
5593  movd mm6, esi /* save ESI in MM6 */
5594  /* --- 1 */
5595  movq mm1, [esi] /* load 8 bytes of the Src */
5596  movq mm2, mm1 /* copy MM1 into MM2 */
5597  add esi, eax /* move Src pointer 1 row below */
5598  movq mm3, [edx] /* load 4 words of Kernel */
5599  add edx, 8 /* move pointer to other 4 words */
5600  movq mm4, [edx] /* load 4 words of Kernel */
5601  add edx, 8 /* move pointer to other 4 words */
5602  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5603  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5604  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5605  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5606  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5607  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5608  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5609  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5610  /* --- 2 */
5611  movq mm1, [esi] /* load 8 bytes of the Src */
5612  movq mm2, mm1 /* copy MM1 into MM2 */
5613  add esi, eax /* move Src pointer 1 row below */
5614  movq mm3, [edx] /* load 4 words of Kernel */
5615  add edx, 8 /* move pointer to other 4 words */
5616  movq mm4, [edx] /* load 4 words of Kernel */
5617  add edx, 8 /* move pointer to other 4 words */
5618  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5619  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5620  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5621  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5622  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5623  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5624  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5625  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5626  /* --- 3 */
5627  movq mm1, [esi] /* load 8 bytes of the Src */
5628  movq mm2, mm1 /* copy MM1 into MM2 */
5629  add esi, eax /* move Src pointer 1 row below */
5630  movq mm3, [edx] /* load 4 words of Kernel */
5631  add edx, 8 /* move pointer to other 4 words */
5632  movq mm4, [edx] /* load 4 words of Kernel */
5633  add edx, 8 /* move pointer to other 4 words */
5634  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5635  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5636  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5637  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5638  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5639  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5640  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5641  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5642  /* --- 4 */
5643  movq mm1, [esi] /* load 8 bytes of the Src */
5644  movq mm2, mm1 /* copy MM1 into MM2 */
5645  add esi, eax /* move Src pointer 1 row below */
5646  movq mm3, [edx] /* load 4 words of Kernel */
5647  add edx, 8 /* move pointer to other 4 words */
5648  movq mm4, [edx] /* load 4 words of Kernel */
5649  add edx, 8 /* move pointer to other 4 words */
5650  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5651  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5652  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5653  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5654  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5655  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5656  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5657  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5658  /* --- 5 */
5659  movq mm1, [esi] /* load 8 bytes of the Src */
5660  movq mm2, mm1 /* copy MM1 into MM2 */
5661  movq mm3, [edx] /* load 4 words of Kernel */
5662  add edx, 8 /* move pointer to other 4 words */
5663  movq mm4, [edx] /* load 4 words of Kernel */
5664  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5665  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5666  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5667  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5668  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5669  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5670  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5671  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5672  /* ---, */
5673  movq mm3, mm7 /* copy MM7 into MM3 */
5674  psrlq mm7, 32 /* shift 2 left words to the right */
5675  paddsw mm7, mm3 /* add 2 left and 2 right result words */
5676  movq mm2, mm7 /* copy MM7 into MM2 */
5677  psrlq mm7, 16 /* shift 1 left word to the right */
5678  paddsw mm7, mm2 /* add 1 left and 1 right result words */
5679  movd mm1, eax /* save EAX in MM1 */
5680  packuswb mm7, mm0 /* pack division result with saturation */
5681  movd eax, mm7 /* copy saturated result into EAX */
5682  mov [edi], al /* copy a byte result into Dest */
5683  movd eax, mm1 /* restore saved EAX */
5684  /* --, */
5685  movd esi, mm6 /* move Src pointer to the top pixel */
5686  sub edx, 72 /* EDX = Kernel address */
5687  inc esi /* move Src pointer to the next pixel */
5688  inc edi /* move Dest pointer to the next pixel */
5689  /* ---, */
5690  dec ecx /* decrease loop counter COLUMNS */
5691  jnz L10372 /* check loop termination, proceed if required */
5692  add esi, 4 /* move to the next row in Src */
5693  add edi, 4 /* move to the next row in Dest */
5694  dec ebx /* decrease loop counter ROWS */
5695  jnz L10370 /* check loop termination, proceed if required */
5696  /* ---, */
5697  emms /* exit MMX state */
5698  popa
5699  }
5700 #else
5701  asm volatile
5702  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5703  "xor %%ebx, %%ebx \n\t" /* zero EBX */
5704  "mov %5, %%bl \n\t" /* load NRightShift into BL */
5705  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
5706  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5707  "mov %1, %%esi \n\t" /* load Src address to ESI */
5708  "mov %0, %%edi \n\t" /* load Dest address to EDI */
5709  "add $2, %%edi \n\t" /* 2 column offset from the left edge */
5710  "mov %3, %%eax \n\t" /* load columns into EAX */
5711  "shl $1, %%eax \n\t" /* EAX = columns * 2 */
5712  "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
5713  "shr $1, %%eax \n\t" /* EAX = columns */
5714  "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5715  "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
5716  /* --- */
5717  ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5718  "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
5719  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5720  ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5721  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5722  /* --- 1 */
5723  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5724  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5725  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5726  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5727  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5728  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5729  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5730  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5731  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5732  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5733  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5734  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5735  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5736  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5737  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5738  /* --- 2 */
5739  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5740  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5741  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5742  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5743  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5744  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5745  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5746  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5747  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5748  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5749  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5750  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5751  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5752  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5753  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5754  /* --- 3 */
5755  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5756  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5757  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5758  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5759  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5760  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5761  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5762  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5763  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5764  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5765  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5766  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5767  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5768  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5769  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5770  /* --- 4 */
5771  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5772  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5773  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5774  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5775  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5776  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5777  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5778  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5779  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5780  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5781  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5782  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5783  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5784  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5785  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5786  /* --- 5 */
5787  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5788  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5789  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5790  "add $8, %%edx \n\t" /* move pointer to other 4 words */
5791  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5792  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5793  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5794  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5795  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5796  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5797  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5798  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5799  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5800  /* --- */
5801  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5802  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5803  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5804  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5805  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5806  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5807  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
5808  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5809  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5810  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5811  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5812  /* -- */
5813  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5814  "sub $72, %%edx \n\t" /* EDX = Kernel address */
5815  "inc %%esi \n\t" /* move Src pointer to the next pixel */
5816  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5817  /* --- */
5818  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5819  "jnz .L10372 \n\t" /* check loop termination, proceed if required */
5820  "add $4, %%esi \n\t" /* move to the next row in Src */
5821  "add $4, %%edi \n\t" /* move to the next row in Dest */
5822  "dec %%ebx \n\t" /* decrease loop counter ROWS */
5823  "jnz .L10370 \n\t" /* check loop termination, proceed if required */
5824  /* --- */
5825  "emms \n\t" /* exit MMX state */
5826  "popa \n\t":"=m" (Dest) /* %0 */
5827  :"m"(Src), /* %1 */
5828  "m"(rows), /* %2 */
5829  "m"(columns), /* %3 */
5830  "m"(Kernel), /* %4 */
5831  "m"(NRightShift) /* %5 */
5832  );
5833 #endif
5834 #endif
5835  return (0);
5836  } else {
5837  /* No non-MMX implementation yet */
5838  return (-1);
5839  }
5840 }
5841 
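For the 5x5 variant the assembly reads two quadwords per kernel row and rewinds the kernel pointer by 72 bytes per output pixel, which implies a padded layout of 5 rows x 8 signed shorts: the five coefficients of a row sit in its first five words and the remaining three words are unused. A hedged helper for a caller holding the kernel in plain row-major 5x5 form (derived from the assembly above rather than from documented API guarantees):

/* Repack a plain row-major 5x5 kernel (25 shorts) into the padded
   5 x 8 layout the MMX routine appears to expect. */
static void pack_kernel_5x5(const signed short plain[25], signed short packed[40])
{
    int r, c;
    for (r = 0; r < 5; r++) {
        for (c = 0; c < 8; c++)
            packed[r * 8 + c] = (c < 5) ? plain[r * 5 + c] : 0;  /* zero the 3 pad words */
    }
}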
5856 int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5857  signed short *Kernel, unsigned char NRightShift)
5858 {
5859  /* Validate input parameters */
5860  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5861  return(-1);
5862 
5863  if ((columns < 7) || (rows < 7) || (NRightShift > 7))
5864  return (-1);
5865 
5866  if ((SDL_imageFilterMMXdetect())) {
5867 //#ifdef USE_MMX
5868 #if defined(USE_MMX) && defined(i386)
5869 #if !defined(GCC__)
5870  __asm
5871  {
5872  pusha
5873  pxor mm0, mm0 /* zero MM0 */
5874  xor ebx, ebx /* zero EBX */
5875  mov bl, NRightShift /* load NRightShift into BL */
5876  movd mm5, ebx /* copy NRightShift into MM5 */
5877  mov edx, Kernel /* load Kernel address into EDX */
5878  mov esi, Src /* load Src address to ESI */
5879  mov edi, Dest /* load Dest address to EDI */
5880  add edi, 3 /* 3 column offset from the left edge */
5881  mov eax, columns /* load columns into EAX */
5882  add edi, eax /* 3 row offset from the top edge */
5883  add edi, eax
5884  add edi, eax
5885  mov ebx, rows /* initialize ROWS counter */
5886  sub ebx, 6 /* do not use first 3 and last 3 rows */
5887  /* ---, */
5888 L10380:
5889  mov ecx, eax /* initialize COLUMNS counter */
5890  sub ecx, 6 /* do not use first 3 and last 3 columns */
5891  align 16 /* 16 byte alignment of the loop entry */
5892 L10382:
5893  pxor mm7, mm7 /* zero MM7 (accumulator) */
5894  movd mm6, esi /* save ESI in MM6 */
5895  /* --- 1 */
5896  movq mm1, [esi] /* load 8 bytes of the Src */
5897  movq mm2, mm1 /* copy MM1 into MM2 */
5898  add esi, eax /* move Src pointer 1 row below */
5899  movq mm3, [edx] /* load 4 words of Kernel */
5900  add edx, 8 /* move pointer to other 4 words */
5901  movq mm4, [edx] /* load 4 words of Kernel */
5902  add edx, 8 /* move pointer to other 4 words */
5903  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5904  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5905  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5906  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5907  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5908  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5909  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5910  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5911  /* --- 2 */
5912  movq mm1, [esi] /* load 8 bytes of the Src */
5913  movq mm2, mm1 /* copy MM1 into MM2 */
5914  add esi, eax /* move Src pointer 1 row below */
5915  movq mm3, [edx] /* load 4 words of Kernel */
5916  add edx, 8 /* move pointer to other 4 words */
5917  movq mm4, [edx] /* load 4 words of Kernel */
5918  add edx, 8 /* move pointer to other 4 words */
5919  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5920  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5921  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5922  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5923  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5924  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5925  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5926  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5927  /* --- 3 */
5928  movq mm1, [esi] /* load 8 bytes of the Src */
5929  movq mm2, mm1 /* copy MM1 into MM2 */
5930  add esi, eax /* move Src pointer 1 row below */
5931  movq mm3, [edx] /* load 4 words of Kernel */
5932  add edx, 8 /* move pointer to other 4 words */
5933  movq mm4, [edx] /* load 4 words of Kernel */
5934  add edx, 8 /* move pointer to other 4 words */
5935  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5936  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5937  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5938  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5939  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5940  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5941  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5942  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5943  /* --- 4 */
5944  movq mm1, [esi] /* load 8 bytes of the Src */
5945  movq mm2, mm1 /* copy MM1 into MM2 */
5946  add esi, eax /* move Src pointer 1 row below */
5947  movq mm3, [edx] /* load 4 words of Kernel */
5948  add edx, 8 /* move pointer to other 4 words */
5949  movq mm4, [edx] /* load 4 words of Kernel */
5950  add edx, 8 /* move pointer to other 4 words */
5951  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5952  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5953  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5954  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5955  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5956  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5957  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5958  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5959  /* --- 5 */
5960  movq mm1, [esi] /* load 8 bytes of the Src */
5961  movq mm2, mm1 /* copy MM1 into MM2 */
5962  add esi, eax /* move Src pointer 1 row below */
5963  movq mm3, [edx] /* load 4 words of Kernel */
5964  add edx, 8 /* move pointer to other 4 words */
5965  movq mm4, [edx] /* load 4 words of Kernel */
5966  add edx, 8 /* move pointer to other 4 words */
5967  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5968  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5969  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5970  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5971  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5972  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5973  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5974  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5975  /* --- 6 */
5976  movq mm1, [esi] /* load 8 bytes of the Src */
5977  movq mm2, mm1 /* copy MM1 into MM2 */
5978  add esi, eax /* move Src pointer 1 row below */
5979  movq mm3, [edx] /* load 4 words of Kernel */
5980  add edx, 8 /* move pointer to other 4 words */
5981  movq mm4, [edx] /* load 4 words of Kernel */
5982  add edx, 8 /* move pointer to other 4 words */
5983  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5984  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5985  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5986  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5987  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5988  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5989  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5990  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5991  /* --- 7 */
5992  movq mm1, [esi] /* load 8 bytes of the Src */
5993  movq mm2, mm1 /* copy MM1 into MM2 */
5994  movq mm3, [edx] /* load 4 words of Kernel */
5995  add edx, 8 /* move pointer to other 4 words */
5996  movq mm4, [edx] /* load 4 words of Kernel */
5997  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5998  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5999  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6000  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6001  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6002  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6003  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6004  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6005  /* ---, */
6006  movq mm3, mm7 /* copy MM7 into MM3 */
6007  psrlq mm7, 32 /* shift 2 left words to the right */
6008  paddsw mm7, mm3 /* add 2 left and 2 right result words */
6009  movq mm2, mm7 /* copy MM7 into MM2 */
6010  psrlq mm7, 16 /* shift 1 left word to the right */
6011  paddsw mm7, mm2 /* add 1 left and 1 right result words */
6012  movd mm1, eax /* save EAX in MM1 */
6013  packuswb mm7, mm0 /* pack division result with saturation */
6014  movd eax, mm7 /* copy saturated result into EAX */
6015  mov [edi], al /* copy a byte result into Dest */
6016  movd eax, mm1 /* restore saved EAX */
6017  /* --, */
6018  movd esi, mm6 /* move Src pointer to the top pixel */
6019  sub edx, 104 /* EDX = Kernel address */
6020  inc esi /* move Src pointer to the next pixel */
6021  inc edi /* move Dest pointer to the next pixel */
6022  /* ---, */
6023  dec ecx /* decrease loop counter COLUMNS */
6024  jnz L10382 /* check loop termination, proceed if required */
6025  add esi, 6 /* move to the next row in Src */
6026  add edi, 6 /* move to the next row in Dest */
6027  dec ebx /* decrease loop counter ROWS */
6028  jnz L10380 /* check loop termination, proceed if required */
6029  /* ---, */
6030  emms /* exit MMX state */
6031  popa
6032  }
6033 #else
6034  asm volatile
6035  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6036  "xor %%ebx, %%ebx \n\t" /* zero EBX */
6037  "mov %5, %%bl \n\t" /* load NRightShift into BL */
6038  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6039  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6040  "mov %1, %%esi \n\t" /* load Src address to ESI */
6041  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6042  "add $3, %%edi \n\t" /* 3 column offset from the left edge */
6043  "mov %3, %%eax \n\t" /* load columns into EAX */
6044  "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
6045  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6046  "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
6047  /* --- */
6048  ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6049  "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
6050  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6051  ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6052  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6053  /* --- 1 */
6054  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6055  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6056  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6057  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6058  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6059  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6060  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6061  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6062  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6063  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6064  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6065  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6066  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6067  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6068  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6069  /* --- 2 */
6070  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6071  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6072  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6073  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6074  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6075  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6076  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6077  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6078  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6079  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6080  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6081  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6082  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6083  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6084  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6085  /* --- 3 */
6086  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6087  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6088  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6089  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6090  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6091  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6092  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6093  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6094  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6095  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6096  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6097  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6098  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6099  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6100  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6101  /* --- 4 */
6102  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6103  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6104  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6105  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6106  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6107  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6108  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6109  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6110  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6111  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6112  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6113  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6114  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6115  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6116  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6117  /* --- 5 */
6118  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6119  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6120  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6121  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6122  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6123  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6124  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6125  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6126  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6127  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6128  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6129  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6130  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6131  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6132  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6133  /* --- 6 */
6134  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6135  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6136  "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6137  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6138  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6139  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6140  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6141  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6142  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6143  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6144  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6145  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6146  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6147  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6148  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6149  /* --- 7 */
6150  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6151  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6152  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6153  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6154  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6155  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6156  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6157  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6158  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6159  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6160  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6161  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6162  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6163  /* --- */
6164  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6165  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6166  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6167  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6168  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6169  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6170  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6171  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
6172  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6173  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6174  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6175  /* -- */
6176  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6177  "sub $104, %%edx \n\t" /* EDX = Kernel address */
6178  "inc %%esi \n\t" /* move Src pointer to the next pixel */
6179  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6180  /* --- */
6181  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6182  "jnz .L10382 \n\t" /* check loop termination, proceed if required */
6183  "add $6, %%esi \n\t" /* move to the next row in Src */
6184  "add $6, %%edi \n\t" /* move to the next row in Dest */
6185  "dec %%ebx \n\t" /* decrease loop counter ROWS */
6186  "jnz .L10380 \n\t" /* check loop termination, proceed if required */
6187  /* --- */
6188  "emms \n\t" /* exit MMX state */
6189  "popa \n\t":"=m" (Dest) /* %0 */
6190  :"m"(Src), /* %1 */
6191  "m"(rows), /* %2 */
6192  "m"(columns), /* %3 */
6193  "m"(Kernel), /* %4 */
6194  "m"(NRightShift) /* %5 */
6195  );
6196 #endif
6197 #endif
6198  return (0);
6199  } else {
6200  /* No non-MMX implementation yet */
6201  return (-1);
6202  }
6203 }
6204 
6219 int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
6220  signed short *Kernel, unsigned char NRightShift)
6221 {
6222  /* Validate input parameters */
6223  if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
6224  return(-1);
6225 
6226  if ((columns < 9) || (rows < 9) || (NRightShift > 7))
6227  return (-1);
6228 
6229  if ((SDL_imageFilterMMXdetect())) {
6230 //#ifdef USE_MMX
6231 #if defined(USE_MMX) && defined(i386)
6232 #if !defined(GCC__)
6233  __asm
6234  {
6235  pusha
6236  pxor mm0, mm0 /* zero MM0 */
6237  xor ebx, ebx /* zero EBX */
6238  mov bl, NRightShift /* load NRightShift into BL */
6239  movd mm5, ebx /* copy NRightShift into MM5 */
6240  mov edx, Kernel /* load Kernel address into EDX */
6241  mov esi, Src /* load Src address to ESI */
6242  mov edi, Dest /* load Dest address to EDI */
6243  add edi, 4 /* 4 column offset from the left edge */
6244  mov eax, columns /* load columns into EAX */
6245  add edi, eax /* 4 row offset from the top edge */
6246  add edi, eax
6247  add edi, eax
6248  add edi, eax
6249  mov ebx, rows /* initialize ROWS counter */
6250  sub ebx, 8 /* do not use first 4 and last 4 rows */
6251  /* ---, */
6252 L10390:
6253  mov ecx, eax /* initialize COLUMNS counter */
6254  sub ecx, 8 /* do not use first 4 and last 4 columns */
6255  align 16 /* 16 byte alignment of the loop entry */
6256 L10392:
6257  pxor mm7, mm7 /* zero MM7 (accumulator) */
6258  movd mm6, esi /* save ESI in MM6 */
6259  /* --- 1 */
6260  movq mm1, [esi] /* load 8 bytes of the Src */
6261  movq mm2, mm1 /* copy MM1 into MM2 */
6262  inc esi /* move pointer to the next 8 bytes of Src */
6263  movq mm3, [edx] /* load 4 words of Kernel */
6264  add edx, 8 /* move pointer to other 4 words */
6265  movq mm4, [edx] /* load 4 words of Kernel */
6266  add edx, 8 /* move pointer to other 4 words */
6267  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6268  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6269  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6270  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6271  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6272  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6273  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6274  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6275  movq mm1, [esi] /* load 8 bytes of the Src */
6276  dec esi
6277  add esi, eax /* move Src pointer 1 row below */
6278  movq mm3, [edx] /* load 4 words of Kernel */
6279  add edx, 8 /* move pointer to other 4 words */
6280  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6281  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6282  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6283  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6284  /* --- 2 */
6285  movq mm1, [esi] /* load 8 bytes of the Src */
6286  movq mm2, mm1 /* copy MM1 into MM2 */
6287  inc esi /* move pointer to the next 8 bytes of Src */
6288  movq mm3, [edx] /* load 4 words of Kernel */
6289  add edx, 8 /* move pointer to other 4 words */
6290  movq mm4, [edx] /* load 4 words of Kernel */
6291  add edx, 8 /* move pointer to other 4 words */
6292  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6293  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6294  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6295  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6296  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6297  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6298  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6299  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6300  movq mm1, [esi] /* load 8 bytes of the Src */
6301  dec esi
6302  add esi, eax /* move Src pointer 1 row below */
6303  movq mm3, [edx] /* load 4 words of Kernel */
6304  add edx, 8 /* move pointer to other 4 words */
6305  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6306  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6307  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6308  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6309  /* --- 3 */
6310  movq mm1, [esi] /* load 8 bytes of the Src */
6311  movq mm2, mm1 /* copy MM1 into MM2 */
6312  inc esi /* move pointer to the next 8 bytes of Src */
6313  movq mm3, [edx] /* load 4 words of Kernel */
6314  add edx, 8 /* move pointer to other 4 words */
6315  movq mm4, [edx] /* load 4 words of Kernel */
6316  add edx, 8 /* move pointer to other 4 words */
6317  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6318  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6319  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6320  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6321  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6322  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6323  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6324  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6325  movq mm1, [esi] /* load 8 bytes of the Src */
6326  dec esi
6327  add esi, eax /* move Src pointer 1 row below */
6328  movq mm3, [edx] /* load 4 words of Kernel */
6329  add edx, 8 /* move pointer to other 4 words */
6330  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6331  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6332  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6333  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6334  /* --- 4 */
6335  movq mm1, [esi] /* load 8 bytes of the Src */
6336  movq mm2, mm1 /* copy MM1 into MM2 */
6337  inc esi /* move pointer to the next 8 bytes of Src */
6338  movq mm3, [edx] /* load 4 words of Kernel */
6339  add edx, 8 /* move pointer to other 4 words */
6340  movq mm4, [edx] /* load 4 words of Kernel */
6341  add edx, 8 /* move pointer to other 4 words */
6342  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6343  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6344  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6345  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6346  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6347  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6348  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6349  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6350  movq mm1, [esi] /* load 8 bytes of the Src */
6351  dec esi
6352  add esi, eax /* move Src pointer 1 row below */
6353  movq mm3, [edx] /* load 4 words of Kernel */
6354  add edx, 8 /* move pointer to other 4 words */
6355  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6356  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6357  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6358  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6359  /* --- 5 */
6360  movq mm1, [esi] /* load 8 bytes of the Src */
6361  movq mm2, mm1 /* copy MM1 into MM2 */
6362  inc esi /* move pointer to the next 8 bytes of Src */
6363  movq mm3, [edx] /* load 4 words of Kernel */
6364  add edx, 8 /* move pointer to other 4 words */
6365  movq mm4, [edx] /* load 4 words of Kernel */
6366  add edx, 8 /* move pointer to other 4 words */
6367  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6368  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6369  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6370  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6371  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6372  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6373  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6374  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6375  movq mm1, [esi] /* load 8 bytes of the Src */
6376  dec esi
6377  add esi, eax /* move Src pointer 1 row below */
6378  movq mm3, [edx] /* load 4 words of Kernel */
6379  add edx, 8 /* move pointer to other 4 words */
6380  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6381  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6382  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6383  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6384  /* --- 6 */
6385  movq mm1, [esi] /* load 8 bytes of the Src */
6386  movq mm2, mm1 /* copy MM1 into MM2 */
6387  inc esi /* move pointer to the next 8 bytes of Src */
6388  movq mm3, [edx] /* load 4 words of Kernel */
6389  add edx, 8 /* move pointer to other 4 words */
6390  movq mm4, [edx] /* load 4 words of Kernel */
6391  add edx, 8 /* move pointer to other 4 words */
6392  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6393  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6394  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6395  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6396  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6397  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6398  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6399  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6400  movq mm1, [esi] /* load 8 bytes of the Src */
6401  dec esi
6402  add esi, eax /* move Src pointer 1 row below */
6403  movq mm3, [edx] /* load 4 words of Kernel */
6404  add edx, 8 /* move pointer to other 4 words */
6405  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6406  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6407  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6408  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6409  /* --- 7 */
6410  movq mm1, [esi] /* load 8 bytes of the Src */
6411  movq mm2, mm1 /* copy MM1 into MM2 */
6412  inc esi /* move pointer to the next 8 bytes of Src */
6413  movq mm3, [edx] /* load 4 words of Kernel */
6414  add edx, 8 /* move pointer to other 4 words */
6415  movq mm4, [edx] /* load 4 words of Kernel */
6416  add edx, 8 /* move pointer to other 4 words */
6417  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6418  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6419  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6420  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6421  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6422  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6423  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6424  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6425  movq mm1, [esi] /* load 8 bytes of the Src */
6426  dec esi
6427  add esi, eax /* move Src pointer 1 row below */
6428  movq mm3, [edx] /* load 4 words of Kernel */
6429  add edx, 8 /* move pointer to other 4 words */
6430  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6431  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6432  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6433  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6434  /* --- 8 */
6435  movq mm1, [esi] /* load 8 bytes of the Src */
6436  movq mm2, mm1 /* copy MM1 into MM2 */
6437  inc esi /* move pointer to the next 8 bytes of Src */
6438  movq mm3, [edx] /* load 4 words of Kernel */
6439  add edx, 8 /* move pointer to other 4 words */
6440  movq mm4, [edx] /* load 4 words of Kernel */
6441  add edx, 8 /* move pointer to other 4 words */
6442  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6443  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6444  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6445  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6446  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6447  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6448  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6449  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6450  movq mm1, [esi] /* load 8 bytes of the Src */
6451  dec esi
6452  add esi, eax /* move Src pointer 1 row below */
6453  movq mm3, [edx] /* load 4 words of Kernel */
6454  add edx, 8 /* move pointer to other 4 words */
6455  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6456  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6457  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6458  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6459  /* --- 9 */
6460  movq mm1, [esi] /* load 8 bytes of the Src */
6461  movq mm2, mm1 /* copy MM1 into MM2 */
6462  inc esi /* move pointer to the next 8 bytes of Src */
6463  movq mm3, [edx] /* load 4 words of Kernel */
6464  add edx, 8 /* move pointer to other 4 words */
6465  movq mm4, [edx] /* load 4 words of Kernel */
6466  add edx, 8 /* move pointer to other 4 words */
6467  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6468  punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6469  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6470  psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6471  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6472  pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6473  paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6474  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6475  movq mm1, [esi] /* load 8 bytes of the Src */
6476  movq mm3, [edx] /* load 4 words of Kernel */
6477  punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6478  psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6479  pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6480  paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6481  /* ---, */
6482  movq mm3, mm7 /* copy MM7 into MM3 */
6483  psrlq mm7, 32 /* shift 2 left words to the right */
6484  paddsw mm7, mm3 /* add 2 left and 2 right result words */
6485  movq mm2, mm7 /* copy MM7 into MM2 */
6486  psrlq mm7, 16 /* shift 1 left word to the right */
6487  paddsw mm7, mm2 /* add 1 left and 1 right result words */
6488  movd mm1, eax /* save EAX in MM1 */
6489  packuswb mm7, mm0 /* pack division result with saturation */
6490  movd eax, mm7 /* copy saturated result into EAX */
6491  mov [edi], al /* copy a byte result into Dest */
6492  movd eax, mm1 /* restore saved EAX */
6493  /* --, */
6494  movd esi, mm6 /* move Src pointer to the top pixel */
6495  sub edx, 208 /* EDX = Kernel address */
6496  inc esi /* move Src pointer to the next pixel */
6497  inc edi /* move Dest pointer to the next pixel */
6498  /* ---, */
6499  dec ecx /* decrease loop counter COLUMNS */
6500  jnz L10392 /* check loop termination, proceed if required */
6501  add esi, 8 /* move to the next row in Src */
6502  add edi, 8 /* move to the next row in Dest */
6503  dec ebx /* decrease loop counter ROWS */
6504  jnz L10390 /* check loop termination, proceed if required */
6505  /* ---, */
6506  emms /* exit MMX state */
6507  popa
6508  }
6509 #else
6510  asm volatile
6511  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6512  "xor %%ebx, %%ebx \n\t" /* zero EBX */
6513  "mov %5, %%bl \n\t" /* load NRightShift into BL */
6514  "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6515  "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6516  "mov %1, %%esi \n\t" /* load Src address to ESI */
6517  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6518  "add $4, %%edi \n\t" /* 4 column offset from the left edge */
6519  "mov %3, %%eax \n\t" /* load columns into EAX */
6520  "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
6521  "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6522  "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
6523  /* --- */
6524  ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6525  "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
6526  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6527  ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6528  "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6529  /* --- 1 */
6530  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6531  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6532  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6533  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6534  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6535  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6536  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6537  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6538  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6539  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6540  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6541  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6542  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6543  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6544  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6545  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6546  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6547  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6548  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6549  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6550  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6551  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6552  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6553  /* --- 2 */
6554  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6555  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6556  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6557  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6558  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6559  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6560  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6561  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6562  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6563  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6564  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6565  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6566  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6567  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6568  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6569  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6570  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6571  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6572  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6573  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6574  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6575  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6576  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6577  /* --- 3 */
6578  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6579  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6580  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6581  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6582  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6583  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6584  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6585  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6586  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6587  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6588  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6589  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6590  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6591  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6592  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6593  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6594  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6595  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6596  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6597  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6598  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6599  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6600  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6601  /* --- 4 */
6602  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6603  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6604  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6605  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6606  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6607  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6608  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6609  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6610  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6611  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6612  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6613  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6614  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6615  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6616  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6617  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6618  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6619  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6620  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6621  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6622  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6623  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6624  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6625  /* --- 5 */
6626  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6627  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6628  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6629  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6630  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6631  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6632  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6633  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6634  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6635  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6636  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6637  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6638  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6639  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6640  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6641  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6642  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6643  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6644  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6645  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6646  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6647  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6648  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6649  /* --- 6 */
6650  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6651  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6652  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6653  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6654  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6655  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6656  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6657  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6658  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6659  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6660  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6661  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6662  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6663  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6664  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6665  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6666  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6667  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6668  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6669  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6670  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6671  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6672  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6673  /* --- 7 */
6674  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6675  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6676  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6677  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6678  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6679  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6680  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6681  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6682  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6683  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6684  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6685  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6686  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6687  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6688  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6689  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6690  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6691  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6692  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6693  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6694  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6695  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6696  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6697  /* --- 8 */
6698  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6699  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6700  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6701  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6702  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6703  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6704  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6705  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6706  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6707  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6708  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6709  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6710  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6711  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6712  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6713  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6714  "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6715  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6716  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6717  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6718  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6719  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6720  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6721  /* --- 9 */
6722  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6723  "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6724  "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6725  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6726  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6727  "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6728  "add $8, %%edx \n\t" /* move pointer to other 4 words */
6729  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6730  "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6731  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6732  "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6733  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6734  "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6735  "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6736  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6737  "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6738  "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6739  "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6740  "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6741  "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6742  "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6743  /* --- */
6744  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6745  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6746  "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6747  "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6748  "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6749  "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6750  "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6751  "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
6752  "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6753  "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6754  "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6755  /* -- */
6756  "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6757  "sub $208, %%edx \n\t" /* EDX = Kernel address */
6758  "inc %%esi \n\t" /* move Src pointer to the next pixel */
6759  "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6760  /* --- */
6761  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6762  "jnz .L10392 \n\t" /* check loop termination, proceed if required */
6763  "add $8, %%esi \n\t" /* move to the next row in Src */
6764  "add $8, %%edi \n\t" /* move to the next row in Dest */
6765  "dec %%ebx \n\t" /* decrease loop counter ROWS */
6766  "jnz .L10390 \n\t" /* check loop termination, proceed if required */
6767  /* --- */
6768  "emms \n\t" /* exit MMX state */
6769  "popa \n\t":"=m" (Dest) /* %0 */
6770  :"m"(Src), /* %1 */
6771  "m"(rows), /* %2 */
6772  "m"(columns), /* %3 */
6773  "m"(Kernel), /* %4 */
6774  "m"(NRightShift) /* %5 */
6775  );
6776 #endif
6777 #endif
6778  return (0);
6779  } else {
6780  /* No non-MMX implementation yet */
6781  return (-1);
6782  }
6783 }
6784 
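/*
 * Reference sketch (plain C, illustrative only) of the per-pixel arithmetic the
 * MMX path above implements: every interior pixel (a 4-pixel border is skipped)
 * is shifted right by NRightShift, weighted by the 9x9 kernel, summed, and
 * stored with unsigned-byte saturation. The sketch assumes a dense 81-entry
 * kernel array and clamps once at the end, whereas the MMX code reads the
 * kernel in its own padded layout and accumulates with 16-bit saturating adds,
 * so results can differ when intermediate sums overflow.
 */
static void convolve9x9_shift_reference(const unsigned char *Src, unsigned char *Dest,
                                        int rows, int columns,
                                        const signed short *Kernel, unsigned char NRightShift)
{
    int x, y, kx, ky;

    for (y = 4; y < rows - 4; y++) {
        for (x = 4; x < columns - 4; x++) {
            int sum = 0;
            for (ky = 0; ky < 9; ky++) {
                for (kx = 0; kx < 9; kx++) {
                    int pixel = Src[(y + ky - 4) * columns + (x + kx - 4)] >> NRightShift;
                    sum += pixel * Kernel[ky * 9 + kx];        /* weight and accumulate */
                }
            }
            if (sum < 0) sum = 0;                              /* saturate to 0..255 */
            if (sum > 255) sum = 255;
            Dest[y * columns + x] = (unsigned char)sum;
        }
    }
}
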
6785 /* ------------------------------------------------------------------------------------ */
6786 
6799 int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
6800 {
6801  /* Validate input parameters */
6802  if ((Src == NULL) || (Dest == NULL))
6803  return(-1);
6804 
6805  if ((columns < 8) || (rows < 3))
6806  return (-1);
6807 
6808  if ((SDL_imageFilterMMXdetect())) {
6809 //#ifdef USE_MMX
6810 #if defined(USE_MMX) && defined(i386)
6811 #if !defined(GCC__)
6812  __asm
6813  {
6814  pusha
6815  pxor mm0, mm0 /* zero MM0 */
6816  mov eax, columns /* load columns into EAX */
6817  /* ---, */
6818  mov esi, Src /* ESI = Src row 0 address */
6819  mov edi, Dest /* load Dest address to EDI */
6820  add edi, eax /* EDI = EDI + columns */
6821  inc edi /* 1 byte offset from the left edge */
6822  mov edx, rows /* initialize ROWS counter */
6823  sub edx, 2 /* do not use first and last rows */
6824  /* ---, */
6825 L10400:
6826  mov ecx, eax /* initialize COLUMNS counter */
6827  shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
6828  mov ebx, esi /* save ESI in EBX */
6829  movd mm1, edi /* save EDI in MM1 */
6830  align 16 /* 16 byte alignment of the loop entry */
6831 L10402:
6832  /* ---, */
6833  movq mm4, [esi] /* load 8 bytes from Src */
6834  movq mm5, mm4 /* save MM4 in MM5 */
6835  add esi, 2 /* move ESI pointer 2 bytes right */
6836  punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
6837  punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
6838  movq mm6, [esi] /* load 8 bytes from Src */
6839  movq mm7, mm6 /* save MM6 in MM7 */
6840  sub esi, 2 /* move ESI pointer back 2 bytes left */
6841  punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
6842  punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
6843  add esi, eax /* move to the next row of Src */
6844  movq mm2, [esi] /* load 8 bytes from Src */
6845  movq mm3, mm2 /* save MM2 in MM3 */
6846  add esi, 2 /* move ESI pointer 2 bytes right */
6847  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6848  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6849  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6850  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6851  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6852  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6853  movq mm2, [esi] /* load 8 bytes from Src */
6854  movq mm3, mm2 /* save MM2 in MM3 */
6855  sub esi, 2 /* move ESI pointer back 2 bytes left */
6856  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6857  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6858  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6859  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6860  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6861  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6862  add esi, eax /* move to the next row of Src */
6863  movq mm2, [esi] /* load 8 bytes from Src */
6864  movq mm3, mm2 /* save MM2 in MM3 */
6865  add esi, 2 /* move ESI pointer 2 bytes right */
6866  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6867  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6868  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6869  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6870  movq mm2, [esi] /* load 8 bytes from Src */
6871  movq mm3, mm2 /* save MM2 in MM3 */
6872  sub esi, 2 /* move ESI pointer back 2 bytes left */
6873  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6874  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6875  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6876  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6877  /* ---, */
6878  movq mm2, mm4 /* copy MM4 into MM2 */
6879  psrlq mm4, 32 /* shift 2 left words to the right */
6880  psubw mm4, mm2 /* MM4 = MM4 - MM2 */
6881  movq mm3, mm6 /* copy MM6 into MM3 */
6882  psrlq mm6, 32 /* shift 2 left words to the right */
6883  psubw mm6, mm3 /* MM6 = MM6 - MM3 */
6884  punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
6885  movq mm2, mm5 /* copy MM5 into MM2 */
6886  psrlq mm5, 32 /* shift 2 left words to the right */
6887  psubw mm5, mm2 /* MM5 = MM5 - MM2 */
6888  movq mm3, mm7 /* copy MM7 into MM3 */
6889  psrlq mm7, 32 /* shift 2 left words to the right */
6890  psubw mm7, mm3 /* MM7 = MM7 - MM3 */
6891  punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
6892  /* Take abs values of MM4 and MM5 */
6893  movq mm6, mm4 /* copy MM4 into MM6 */
6894  movq mm7, mm5 /* copy MM5 into MM7 */
6895  psraw mm6, 15 /* fill MM6 words with word sign bit */
6896  psraw mm7, 15 /* fill MM7 words with word sign bit */
6897  pxor mm4, mm6 /* take 1's complement of only neg words */
6898  pxor mm5, mm7 /* take 1's complement of only neg words */
6899  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
6900  psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
6901  packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
6902  movq [edi], mm4 /* store result in Dest */
6903  /* ---, */
6904  sub esi, eax /* move to the current top row in Src */
6905  sub esi, eax
6906  add esi, 8 /* move Src pointer to the next 8 pixels */
6907  add edi, 8 /* move Dest pointer to the next 8 pixels */
6908  /* ---, */
6909  dec ecx /* decrease loop counter COLUMNS */
6910  jnz L10402 /* check loop termination, proceed if required */
6911  mov esi, ebx /* restore leftmost address of the current Src row */
6912  movd edi, mm1 /* restore leftmost address of the current Dest row */
6913  add esi, eax /* move to the next row in Src */
6914  add edi, eax /* move to the next row in Dest */
6915  dec edx /* decrease loop counter ROWS */
6916  jnz L10400 /* check loop termination, proceed if required */
6917  /* ---, */
6918  emms /* exit MMX state */
6919  popa
6920  }
6921 #else
6922  asm volatile
6923  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6924  "mov %3, %%eax \n\t" /* load columns into EAX */
6925  /* --- */
6926  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
6927  "mov %0, %%edi \n\t" /* load Dest address to EDI */
6928  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
6929  "inc %%edi \n\t" /* 1 byte offset from the left edge */
6930  "mov %2, %%edx \n\t" /* initialize ROWS counter */
6931  "sub $2, %%edx \n\t" /* do not use first and last rows */
6932  /* --- */
6933  ".L10400: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6934  "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
6935  "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
6936  "movd %%edi, %%mm1 \n\t" /* save EDI in MM1 */
6937  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6938  ".L10402: \n\t"
6939  /* --- */
6940  "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
6941  "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
6942  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6943  "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
6944  "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
6945  "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
6946  "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
6947  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6948  "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
6949  "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
6950  "add %%eax, %%esi \n\t" /* move to the next row of Src */
6951  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6952  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6953  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6954  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6955  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6956  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6957  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6958  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6959  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6960  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6961  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6962  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6963  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6964  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6965  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6966  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6967  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6968  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6969  "add %%eax, %%esi \n\t" /* move to the next row of Src */
6970  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6971  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6972  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6973  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6974  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6975  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6976  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6977  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6978  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6979  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6980  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6981  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6982  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6983  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6984  /* --- */
6985  "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
6986  "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
6987  "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
6988  "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
6989  "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
6990  "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
6991  "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
6992  "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
6993  "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
6994  "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
6995  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6996  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6997  "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
6998  "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
6999  /* Take abs values of MM4 and MM5 */
7000  "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7001  "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7002  "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7003  "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7004  "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7005  "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7006  "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7007  "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7008  "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7009  "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7010  /* --- */
7011  "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7012  "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7013  "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7014  /* --- */
7015  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7016  "jnz .L10402 \n\t" /* check loop termination, proceed if required */
7017  "mov %%ebx, %%esi \n\t" /* restore leftmost address of the current Src row */
7018  "movd %%mm1, %%edi \n\t" /* restore leftmost address of the current Dest row */
7019  "add %%eax, %%esi \n\t" /* move to the next row in Src */
7020  "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7021  "dec %%edx \n\t" /* decrease loop counter ROWS */
7022  "jnz .L10400 \n\t" /* check loop termination, proceed if required */
7023  /* --- */
7024  "emms \n\t" /* exit MMX state */
7025  "popa \n\t":"=m" (Dest) /* %0 */
7026  :"m"(Src), /* %1 */
7027  "m"(rows), /* %2 */
7028  "m"(columns) /* %3 */
7029  );
7030 #endif
7031 #endif
7032  return (0);
7033  } else {
7034  /* No non-MMX implementation yet */
7035  return (-1);
7036  }
7037 }
7038 
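/*
 * Reference sketch (plain C, illustrative only) of the result the MMX loop in
 * SDL_imageFilterSobelX produces: for each interior pixel, the absolute
 * horizontal Sobel response (column sums weighted 1-2-1 vertically, right
 * column minus left column), saturated to 255. The MMX path processes 8 output
 * pixels per iteration and leaves the 1-pixel image border untouched, so edge
 * handling of the two versions is not byte-for-byte identical.
 */
static void sobelx_reference(const unsigned char *Src, unsigned char *Dest,
                             int rows, int columns)
{
    int x, y;

    for (y = 1; y < rows - 1; y++) {
        for (x = 1; x < columns - 1; x++) {
            int left  = Src[(y - 1) * columns + (x - 1)]
                      + 2 * Src[y * columns + (x - 1)]
                      + Src[(y + 1) * columns + (x - 1)];
            int right = Src[(y - 1) * columns + (x + 1)]
                      + 2 * Src[y * columns + (x + 1)]
                      + Src[(y + 1) * columns + (x + 1)];
            int g = right - left;                    /* horizontal Sobel response */
            if (g < 0) g = -g;                       /* absolute value */
            if (g > 255) g = 255;                    /* saturate to an unsigned byte */
            Dest[y * columns + x] = (unsigned char)g;
        }
    }
}
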
7052 int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
7053  unsigned char NRightShift)
7054 {
7055  /* Validate input parameters */
7056  if ((Src == NULL) || (Dest == NULL))
7057  return(-1);
7058  if ((columns < 8) || (rows < 3) || (NRightShift > 7))
7059  return (-1);
7060 
7061  if ((SDL_imageFilterMMXdetect())) {
7062 //#ifdef USE_MMX
7063 #if defined(USE_MMX) && defined(i386)
7064 #if !defined(GCC__)
7065  __asm
7066  {
7067  pusha
7068  pxor mm0, mm0 /* zero MM0 */
7069  mov eax, columns /* load columns into EAX */
7070  xor ebx, ebx /* zero EBX */
7071  mov bl, NRightShift /* load NRightShift into BL */
7072  movd mm1, ebx /* copy NRightShift into MM1 */
7073  /* ---, */
7074  mov esi, Src /* ESI = Src row 0 address */
7075  mov edi, Dest /* load Dest address to EDI */
7076  add edi, eax /* EDI = EDI + columns */
7077  inc edi /* 1 byte offset from the left edge */
7078  /* initialize ROWS counter */
7079  sub rows, 2 /* do not use first and last rows */
7080  /* ---, */
7081 L10410:
7082  mov ecx, eax /* initialize COLUMNS counter */
7083  shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
7084  mov ebx, esi /* save ESI in EBX */
7085  mov edx, edi /* save EDI in EDX */
7086  align 16 /* 16 byte alignment of the loop entry */
7087 L10412:
7088  /* ---, */
7089  movq mm4, [esi] /* load 8 bytes from Src */
7090  movq mm5, mm4 /* save MM4 in MM5 */
7091  add esi, 2 /* move ESI pointer 2 bytes right */
7092  punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
7093  punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
7094  psrlw mm4, mm1 /* shift right each pixel NshiftRight times */
7095  psrlw mm5, mm1 /* shift right each pixel NshiftRight times */
7096  movq mm6, [esi] /* load 8 bytes from Src */
7097  movq mm7, mm6 /* save MM6 in MM7 */
7098  sub esi, 2 /* move ESI pointer back 2 bytes left */
7099  punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
7100  punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
7101  psrlw mm6, mm1 /* shift right each pixel NshiftRight times */
7102  psrlw mm7, mm1 /* shift right each pixel NshiftRight times */
7103  add esi, eax /* move to the next row of Src */
7104  movq mm2, [esi] /* load 8 bytes from Src */
7105  movq mm3, mm2 /* save MM2 in MM3 */
7106  add esi, 2 /* move ESI pointer 2 bytes right */
7107  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7108  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7109  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7110  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7111  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7112  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7113  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7114  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7115  movq mm2, [esi] /* load 8 bytes from Src */
7116  movq mm3, mm2 /* save MM2 in MM3 */
7117  sub esi, 2 /* move ESI pointer back 2 bytes left */
7118  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7119  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7120  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7121  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7122  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7123  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7124  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7125  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7126  add esi, eax /* move to the next row of Src */
7127  movq mm2, [esi] /* load 8 bytes from Src */
7128  movq mm3, mm2 /* save MM2 in MM3 */
7129  add esi, 2 /* move ESI pointer 2 bytes right */
7130  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7131  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7132  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7133  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7134  paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7135  paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7136  movq mm2, [esi] /* load 8 bytes from Src */
7137  movq mm3, mm2 /* save MM2 in MM3 */
7138  sub esi, 2 /* move ESI pointer back 2 bytes left */
7139  punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7140  punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7141  psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7142  psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7143  paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7144  paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7145  /* ---, */
7146  movq mm2, mm4 /* copy MM4 into MM2 */
7147  psrlq mm4, 32 /* shift 2 left words to the right */
7148  psubw mm4, mm2 /* MM4 = MM4 - MM2 */
7149  movq mm3, mm6 /* copy MM6 into MM3 */
7150  psrlq mm6, 32 /* shift 2 left words to the right */
7151  psubw mm6, mm3 /* MM6 = MM6 - MM3 */
7152  punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
7153  movq mm2, mm5 /* copy MM5 into MM2 */
7154  psrlq mm5, 32 /* shift 2 left words to the right */
7155  psubw mm5, mm2 /* MM5 = MM5 - MM2 */
7156  movq mm3, mm7 /* copy MM7 into MM3 */
7157  psrlq mm7, 32 /* shift 2 left words to the right */
7158  psubw mm7, mm3 /* MM7 = MM7 - MM3 */
7159  punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
7160  /* Take abs values of MM4 and MM5 */
7161  movq mm6, mm4 /* copy MM4 into MM6 */
7162  movq mm7, mm5 /* copy MM5 into MM7 */
7163  psraw mm6, 15 /* fill MM6 words with word sign bit */
7164  psraw mm7, 15 /* fill MM7 words with word sign bit */
7165  pxor mm4, mm6 /* take 1's complement of only neg words */
7166  pxor mm5, mm7 /* take 1's complement of only neg words */
7167  psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
7168  psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
7169  packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
7170  movq [edi], mm4 /* store result in Dest */
7171  /* ---, */
7172  sub esi, eax /* move to the current top row in Src */
7173  sub esi, eax
7174  add esi, 8 /* move Src pointer to the next 8 pixels */
7175  add edi, 8 /* move Dest pointer to the next 8 pixels */
7176  /* ---, */
7177  dec ecx /* decrease loop counter COLUMNS */
7178  jnz L10412 /* check loop termination, proceed if required */
7179  mov esi, ebx /* restore leftmost address of the current Src row */
7180  mov edi, edx /* restore leftmost address of the current Dest row */
7181  add esi, eax /* move to the next row in Src */
7182  add edi, eax /* move to the next row in Dest */
7183  dec rows /* decrease loop counter ROWS */
7184  jnz L10410 /* check loop termination, proceed if required */
7185  /* ---, */
7186  emms /* exit MMX state */
7187  popa
7188  }
7189 #else
7190  asm volatile
7191  ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
7192  "mov %3, %%eax \n\t" /* load columns into EAX */
7193  "xor %%ebx, %%ebx \n\t" /* zero EBX */
7194  "mov %4, %%bl \n\t" /* load NRightShift into BL */
7195  "movd %%ebx, %%mm1 \n\t" /* copy NRightShift into MM1 */
7196  /* --- */
7197  "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
7198  "mov %0, %%edi \n\t" /* load Dest address to EDI */
7199  "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
7200  "inc %%edi \n\t" /* 1 byte offset from the left edge */
7201  /* initialize ROWS counter */
7202  "subl $2, %2 \n\t" /* do not use first and last rows */
7203  /* --- */
7204  ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
7205  "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
7206  "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
7207  "mov %%edi, %%edx \n\t" /* save EDI in EDX */
7208  ".align 16 \n\t" /* 16 byte alignment of the loop entry */
7209  ".L10412: \n\t"
7210  /* --- */
7211  "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
7212  "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
7213  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7214  "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
7215  "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
7216  "psrlw %%mm1, %%mm4 \n\t" /* shift right each pixel NshiftRight times */
7217  "psrlw %%mm1, %%mm5 \n\t" /* shift right each pixel NshiftRight times */
7218  "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
7219  "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
7220  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7221  "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
7222  "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
7223  "psrlw %%mm1, %%mm6 \n\t" /* shift right each pixel NshiftRight times */
7224  "psrlw %%mm1, %%mm7 \n\t" /* shift right each pixel NshiftRight times */
7225  "add %%eax, %%esi \n\t" /* move to the next row of Src */
7226  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7227  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7228  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7229  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7230  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7231  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7232  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7233  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7234  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7235  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7236  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7237  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7238  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7239  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7240  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7241  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7242  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7243  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7244  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7245  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7246  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7247  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7248  "add %%eax, %%esi \n\t" /* move to the next row of Src */
7249  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7250  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7251  "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7252  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7253  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7254  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7255  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7256  "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7257  "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7258  "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7259  "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7260  "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7261  "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7262  "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7263  "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7264  "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7265  "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7266  "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7267  /* --- */
7268  "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
7269  "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
7270  "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
7271  "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
7272  "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
7273  "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
7274  "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
7275  "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
7276  "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
7277  "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
7278  "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
7279  "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
7280  "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
7281  "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
7282  /* Take abs values of MM4 and MM5 */
7283  "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7284  "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7285  "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7286  "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7287  "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7288  "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7289  "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7290  "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7291  "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7292  "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7293  /* --- */
7294  "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7295  "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7296  "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7297  /* --- */
7298  "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7299  "jnz .L10412 \n\t" /* check loop termination, proceed if required */
7300  "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
7301  "mov %%edx, %%edi \n\t" /* restore most left current row Dest address */
7302  "add %%eax, %%esi \n\t" /* move to the next row in Src */
7303  "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7304  "decl %2 \n\t" /* decrease loop counter ROWS */
7305  "jnz .L10410 \n\t" /* check loop termination, proceed if required */
7306  /* --- */
7307  "emms \n\t" /* exit MMX state */
7308  "popa \n\t":"=m" (Dest) /* %0 */
7309  :"m"(Src), /* %1 */
7310  "m"(rows), /* %2 */
7311  "m"(columns), /* %3 */
7312  "m"(NRightShift) /* %4 */
7313  );
7314 #endif
7315 #endif
7316  return (0);
7317  } else {
7318  /* No non-MMX implementation yet */
7319  return (-1);
7320  }
7321 }
7322 
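As a side note (not part of the library source): the function above has no non-MMX fallback and simply returns -1, so a rough plain-C sketch of what the MMX loop computes per pixel may help. It assumes the standard Sobel-X column weights (1, 2, 1), with every source byte right-shifted by NRightShift before the sums are formed, matching the psrlw instructions above.

/* Hedged sketch only -- not library code.  Plain-C per-pixel equivalent of the
   SobelXShiftRight MMX loop above, assuming standard Sobel-X weights. */
static void sobelx_shiftright_sketch(const unsigned char *Src, unsigned char *Dest,
                                     int rows, int columns, unsigned char NRightShift)
{
    int r, c;
    for (r = 1; r < rows - 1; r++) {
        for (c = 1; c < columns - 1; c++) {
            const unsigned char *p = Src + (long)r * columns + c;
            /* Each source byte is shifted right first, as psrlw does above. */
            int right = (p[-columns + 1] >> NRightShift)
                      + 2 * (p[1] >> NRightShift)
                      + (p[columns + 1] >> NRightShift);
            int left  = (p[-columns - 1] >> NRightShift)
                      + 2 * (p[-1] >> NRightShift)
                      + (p[columns - 1] >> NRightShift);
            int d = right - left;
            if (d < 0) d = -d;    /* absolute value, like the sign-bit trick above */
            Dest[(long)r * columns + c] = (unsigned char)(d > 255 ? 255 : d);
        }
    }
}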
7326 void SDL_imageFilterAlignStack(void)
7327 {
7328 #ifdef USE_MMX
7329 #if !defined(GCC__)
7330  __asm
7331  { /* --- stack alignment --- */
7332  mov ebx, esp /* load ESP into EBX */
7333  sub ebx, 4 /* reserve space on stack for old value of ESP */
7334  and ebx, -32 /* align EBX along a 32 byte boundary */
7335  mov [ebx], esp /* save old value of ESP in stack, behind the bndry */
7336  mov esp, ebx /* align ESP along a 32 byte boundary */
7337  }
7338 #else
7339  asm volatile
7340  ( /* --- stack alignment --- */
7341  "mov %%esp, %%ebx \n\t" /* load ESP into EBX */
7342  "sub $4, %%ebx \n\t" /* reserve space on stack for old value of ESP */
7343  "and $-32, %%ebx \n\t" /* align EBX along a 32 byte boundary */
7344  "mov %%esp, (%%ebx) \n\t" /* save old value of ESP in stack, behind the bndry */
7345  "mov %%ebx, %%esp \n\t" /* align ESP along a 32 byte boundary */
7346  ::);
7347 #endif
7348 #endif
7349 }
7350 
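The inline assembly above rounds ESP down with "and $-32", the classic mask-to-power-of-two alignment trick. For illustration only (this helper is not part of the library), the same rounding can be expressed in portable C for an ordinary pointer:

/* Illustrative sketch of the '& -32' rounding used above, applied to a plain
   pointer instead of the stack pointer (not library code). */
#include <stdint.h>

static void *round_down_to_32(void *p)
{
    uintptr_t addr = (uintptr_t)p;
    return (void *)(addr & ~(uintptr_t)31);   /* clear the low 5 bits: 32-byte boundary */
}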
7354 void SDL_imageFilterRestoreStack(void)
7355 {
7356 #ifdef USE_MMX
7357 #if !defined(GCC__)
7358  __asm
7359  { /* --- restoring old stack --- */
7360  mov ebx, [esp] /* load old value of ESP */
7361  mov esp, ebx /* restore old value of ESP */
7362  }
7363 #else
7364  asm volatile
7365  ( /* --- restoring old stack --- */
7366  "mov (%%esp), %%ebx \n\t" /* load old value of ESP */
7367  "mov %%ebx, %%esp \n\t" /* restore old value of ESP */
7368  ::);
7369 #endif
7370 #endif
7371 }
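Assumption, not stated in this file: the two helpers above appear intended to be used as a pair, aligning the stack before an MMX filter call and restoring it afterwards. A minimal sketch of that pairing, using signatures from the reference list below:

/* Hedged usage sketch (assumed pairing; verify against the library docs). */
static void convolve_with_aligned_stack(unsigned char *src, unsigned char *dst,
                                        int rows, int columns, signed short *kernel)
{
    SDL_imageFilterAlignStack();                 /* align ESP to a 32-byte boundary */
    (void)SDL_imageFilterConvolveKernel3x3Divide(src, dst, rows, columns, kernel, 9);
    SDL_imageFilterRestoreStack();               /* restore the saved ESP */
}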
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
Filter using SobelX: Dij = saturation255( ... )
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using MultByByte: D = saturation255(S * C)
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N, unsigned char C)
Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, unsigned char NRightShift)
Filter using SobelXShiftRight: Dij = saturation255( ... )
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Div: D = S1 / S2.
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftUint: D = ((uint)S << N)
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
void SDL_imageFilterRestoreStack(void)
Restore previously aligned stack.
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
Internal ASM Filter using MultNor: D = S1 * S2.
void SDL_imageFilterMMXon()
Enable MMX check for filter functions and use MMX code if available.
int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... )
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Sub: D = saturation0(S1 - S2)
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftByte: D = (S << N)
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using AbsDiff: D = | S1 - S2 |.
int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitOr: D = S1 | S2.
void SDL_imageFilterMMXoff()
Disable MMX check for filter functions and force use of non-MMX C-based code.
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultNor: D = S1 * S2.
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
Internal MMX Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRight: D = saturation0(S >> N)
int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... )
#define SWAP_32(x)
Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.).
int SDL_imageFilterMMXdetect(void)
MMX detection routine (with override flag).
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby2: D = saturation255(S1/2 * S2)
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByte: D = saturation255(S + C)
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitAnd: D = S1 & S2.
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0.
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter ShiftLeft: D = saturation255(S << N)
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Add: D = saturation255(S1 + S2)
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin, int Nmax)
Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin, unsigned char Tmax)
Filter using ClipToRange: D = (S >= Tmin) && (S <= Tmax) ? S : (S < Tmin ? Tmin : Tmax).
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByteToHalf: D = saturation255(S/2 + C)
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
Filter using BitNegation: D = ~S (bitwise NOT).
int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mean: D = S1/2 + S2/2.
int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mult: D = saturation255(S1 * S2)
void SDL_imageFilterAlignStack(void)
Align stack to a 32-byte boundary.
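To close, a minimal usage sketch of the byte-filter API summarized above; the buffer names are illustrative only, and error handling is reduced to the documented 0 / -1 return codes.

/* Minimal, hedged usage sketch of the filters listed above (illustrative only). */
#include "SDL2_imageFilter.h"

int blend_add(unsigned char *a, unsigned char *b, unsigned char *out, unsigned int len)
{
    if (!SDL_imageFilterMMXdetect()) {
        /* MMX unavailable or disabled via SDL_imageFilterMMXoff(); filters
           that have a C fallback still work and return 0. */
    }
    /* D = saturation255(S1 + S2), as documented for SDL_imageFilterAdd above */
    return SDL_imageFilterAdd(a, b, out, len);   /* 0 on success, -1 on error */
}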