add some NEON 32bpp blitters
[sdl_omap.git] / src / video / SDL_blit_A.c
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2009 Sam Lantinga
4
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 #include "SDL_video.h"
25 #include "SDL_blit.h"
26
27 /*
28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29    Checking if _mm_free is #defined in malloc.h is the only way to
30    determine if the Processor Pack is installed, as far as I can tell.
31 */
32
33 #if SDL_ASSEMBLY_ROUTINES
34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 #    define MMX_ASMBLIT 1
36 #    define GCC_ASMBLIT 1
37 #  elif defined(_MSC_VER) && defined(_M_IX86)
38 #    if (_MSC_VER <= 1200)  
39 #      include <malloc.h>   
40 #      if defined(_mm_free)
41 #          define HAVE_MMINTRIN_H 1
42 #      endif
43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
44 #      define HAVE_MMINTRIN_H 1
45 #    endif
46 #    if HAVE_MMINTRIN_H
47 #      define MMX_ASMBLIT 1
48 #      define MSVC_ASMBLIT 1
49 #    endif
50 #  endif
51 #endif /* SDL_ASSEMBLY_ROUTINES */
52
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
55 #if GCC_ASMBLIT
56 #include "mmx.h"
57 #elif MSVC_ASMBLIT
58 #include <mmintrin.h>
59 #include <mm3dnow.h>
60 #endif
61
62 /* Functions to perform alpha blended blitting */
63
64 #ifdef __ARM_NEON__
65
66 /* NEON optimized blitter callers */
67 #define make_neon_caller(name, neon_name) \
68 extern void neon_name(void *dst, const void *src, int count); \
69 static void name(SDL_BlitInfo *info) \
70 { \
71         int width = info->d_width; \
72         int height = info->d_height; \
73         Uint8 *src = info->s_pixels; \
74         Uint8 *dst = info->d_pixels; \
75         int srcskip = info->s_skip; \
76         int dstskip = info->d_skip; \
77 \
78         while ( height-- ) { \
79             neon_name(dst, src, width); \
80             src += width * 4 + srcskip; \
81             dst += width * 4 + dstskip; \
82         } \
83 }
84
85 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
86 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
87
88 #endif /* __ARM_NEON__ */
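
/* For reference, a rough plain-C equivalent of what one scanline call such as
   neon_ARGBtoXRGBalpha(dst, src, count) is assumed to do: blend 'count' ARGB
   source pixels over XRGB destination pixels using the usual >>8 approximation
   of /255, forcing the destination alpha bits on (the ABGR variant would also
   swap R and B).  The real NEON implementations live in separate assembly
   sources and may differ in detail; this sketch is illustrative only and is
   compiled out. */
#if 0
static void scanline_ARGBtoXRGBalpha_ref(void *dst, const void *src, int count)
{
        const Uint32 *s = (const Uint32 *)src;
        Uint32 *d = (Uint32 *)dst;
        while (count--) {
                Uint32 sp = *s++;
                Uint32 dp = *d;
                Uint32 alpha = sp >> 24;
                /* blend R and B in one register and G in another, the same
                   trick used by the scalar fallbacks later in this file */
                Uint32 s1 = sp & 0x00ff00ff, d1 = dp & 0x00ff00ff;
                Uint32 s2 = sp & 0x0000ff00, d2 = dp & 0x0000ff00;
                d1 = (d1 + (((s1 - d1) * alpha) >> 8)) & 0x00ff00ff;
                d2 = (d2 + (((s2 - d2) * alpha) >> 8)) & 0x0000ff00;
                *d++ = d1 | d2 | 0xff000000;
        }
}
#endif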
89
90 /* N->1 blending with per-surface alpha */
91 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
92 {
93         int width = info->d_width;
94         int height = info->d_height;
95         Uint8 *src = info->s_pixels;
96         int srcskip = info->s_skip;
97         Uint8 *dst = info->d_pixels;
98         int dstskip = info->d_skip;
99         Uint8 *palmap = info->table;
100         SDL_PixelFormat *srcfmt = info->src;
101         SDL_PixelFormat *dstfmt = info->dst;
102         int srcbpp = srcfmt->BytesPerPixel;
103
104         const unsigned A = srcfmt->alpha;
105
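        /* ALPHA_BLEND (from SDL_blit.h) computes dR = dR + ((sR - dR) * A >> 8)
           per channel, the usual >>8 approximation of dividing by 255 */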
106         while ( height-- ) {
107             DUFFS_LOOP4(
108             {
109                 Uint32 Pixel;
110                 unsigned sR;
111                 unsigned sG;
112                 unsigned sB;
113                 unsigned dR;
114                 unsigned dG;
115                 unsigned dB;
116                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
117                 dR = dstfmt->palette->colors[*dst].r;
118                 dG = dstfmt->palette->colors[*dst].g;
119                 dB = dstfmt->palette->colors[*dst].b;
120                 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
121                 dR &= 0xff;
122                 dG &= 0xff;
123                 dB &= 0xff;
124                 /* Pack RGB into 8bit pixel */
125                 if ( palmap == NULL ) {
126                     *dst =((dR>>5)<<(3+2))|
127                           ((dG>>5)<<(2))|
128                           ((dB>>6)<<(0));
129                 } else {
130                     *dst = palmap[((dR>>5)<<(3+2))|
131                                   ((dG>>5)<<(2))  |
132                                   ((dB>>6)<<(0))];
133                 }
134                 dst++;
135                 src += srcbpp;
136             },
137             width);
138             src += srcskip;
139             dst += dstskip;
140         }
141 }
142
143 /* N->1 blending with pixel alpha */
144 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
145 {
146         int width = info->d_width;
147         int height = info->d_height;
148         Uint8 *src = info->s_pixels;
149         int srcskip = info->s_skip;
150         Uint8 *dst = info->d_pixels;
151         int dstskip = info->d_skip;
152         Uint8 *palmap = info->table;
153         SDL_PixelFormat *srcfmt = info->src;
154         SDL_PixelFormat *dstfmt = info->dst;
155         int srcbpp = srcfmt->BytesPerPixel;
156
157         /* FIXME: fix alpha bit field expansion here too? */
158         while ( height-- ) {
159             DUFFS_LOOP4(
160             {
161                 Uint32 Pixel;
162                 unsigned sR;
163                 unsigned sG;
164                 unsigned sB;
165                 unsigned sA;
166                 unsigned dR;
167                 unsigned dG;
168                 unsigned dB;
169                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
170                 dR = dstfmt->palette->colors[*dst].r;
171                 dG = dstfmt->palette->colors[*dst].g;
172                 dB = dstfmt->palette->colors[*dst].b;
173                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
174                 dR &= 0xff;
175                 dG &= 0xff;
176                 dB &= 0xff;
177                 /* Pack RGB into 8bit pixel */
178                 if ( palmap == NULL ) {
179                     *dst =((dR>>5)<<(3+2))|
180                           ((dG>>5)<<(2))|
181                           ((dB>>6)<<(0));
182                 } else {
183                     *dst = palmap[((dR>>5)<<(3+2))|
184                                   ((dG>>5)<<(2))  |
185                                   ((dB>>6)<<(0))  ];
186                 }
187                 dst++;
188                 src += srcbpp;
189             },
190             width);
191             src += srcskip;
192             dst += dstskip;
193         }
194 }
195
196 /* colorkeyed N->1 blending with per-surface alpha */
197 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
198 {
199         int width = info->d_width;
200         int height = info->d_height;
201         Uint8 *src = info->s_pixels;
202         int srcskip = info->s_skip;
203         Uint8 *dst = info->d_pixels;
204         int dstskip = info->d_skip;
205         Uint8 *palmap = info->table;
206         SDL_PixelFormat *srcfmt = info->src;
207         SDL_PixelFormat *dstfmt = info->dst;
208         int srcbpp = srcfmt->BytesPerPixel;
209         Uint32 ckey = srcfmt->colorkey;
210
211         const int A = srcfmt->alpha;
212
213         while ( height-- ) {
214             DUFFS_LOOP(
215             {
216                 Uint32 Pixel;
217                 unsigned sR;
218                 unsigned sG;
219                 unsigned sB;
220                 unsigned dR;
221                 unsigned dG;
222                 unsigned dB;
223                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
224                 if ( Pixel != ckey ) {
225                     dR = dstfmt->palette->colors[*dst].r;
226                     dG = dstfmt->palette->colors[*dst].g;
227                     dB = dstfmt->palette->colors[*dst].b;
228                     ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
229                     dR &= 0xff;
230                     dG &= 0xff;
231                     dB &= 0xff;
232                     /* Pack RGB into 8bit pixel */
233                     if ( palmap == NULL ) {
234                         *dst =((dR>>5)<<(3+2))|
235                               ((dG>>5)<<(2)) |
236                               ((dB>>6)<<(0));
237                     } else {
238                         *dst = palmap[((dR>>5)<<(3+2))|
239                                       ((dG>>5)<<(2))  |
240                                       ((dB>>6)<<(0))  ];
241                     }
242                 }
243                 dst++;
244                 src += srcbpp;
245             },
246             width);
247             src += srcskip;
248             dst += dstskip;
249         }
250 }
251
252 #if GCC_ASMBLIT
253 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
254 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
255 {
256         int width = info->d_width;
257         int height = info->d_height;
258         Uint32 *srcp = (Uint32 *)info->s_pixels;
259         int srcskip = info->s_skip >> 2;
260         Uint32 *dstp = (Uint32 *)info->d_pixels;
261         int dstskip = info->d_skip >> 2;
262         Uint32 dalpha = info->dst->Amask;
263         Uint64 load;
264
265         load = 0x00fefefe00fefefeULL;/* alpha128 mask */
266         movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
267         load = 0x0001010100010101ULL;/* !alpha128 mask */
268         movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
269         movd_m2r(dalpha, mm7); /* dst alpha mask */
270         punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
271         while(height--) {
272                 DUFFS_LOOP_DOUBLE2(
273                 {
274                         Uint32 s = *srcp++;
275                         Uint32 d = *dstp;
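                        /* average s and d per byte without overflow: clear the
                           low bit of every channel, add, halve, then add back
                           the rounding carry (s & d & 0x00010101) and force
                           the destination alpha bits on */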
276                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
277                                    + (s & d & 0x00010101)) | dalpha;
278                 },{
279                         movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
280                         movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
281
282                         movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
283                         movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
284
285                         pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
286                         pand_r2r(mm4, mm5); /* src & mask -> mm5 */
287                         paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
288                         pand_r2r(mm1, mm2); /* src & dst -> mm2 */
289                         psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
290                         pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
291                         paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
292                         
293                         por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
294                         movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
295                         dstp += 2;
296                         srcp += 2;
297                 }, width);
298                 srcp += srcskip;
299                 dstp += dstskip;
300         }
301         emms();
302 }
303
304 /* fast RGB888->(A)RGB888 blending with surface alpha */
305 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
306 {
307         SDL_PixelFormat* df = info->dst;
308         unsigned alpha = info->src->alpha;
309
310         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
311                         /* only call a128 version when R,G,B occupy lower bits */
312                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
313         } else {
314                 int width = info->d_width;
315                 int height = info->d_height;
316                 Uint32 *srcp = (Uint32 *)info->s_pixels;
317                 int srcskip = info->s_skip >> 2;
318                 Uint32 *dstp = (Uint32 *)info->d_pixels;
319                 int dstskip = info->d_skip >> 2;
320
321                 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
322                 /* form the alpha mult */
323                 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
324                 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
325                 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
326                 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
327                 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
328                 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
329                 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
330                         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
331                 movd_m2r(df->Amask, mm7); /* dst alpha mask */
332                 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
333                 
334                 while(height--) {
335                         DUFFS_LOOP_DOUBLE2({
336                                 /* One Pixel Blend */
337                                 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
338                                 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
339                                 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
340                                 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
341
342                                 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
343                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
344                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
345                                 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
346
347                                 packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
348                                 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
349                                 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
350                                 ++srcp;
351                                 ++dstp;
352                         },{
353                                 /* Two Pixels Blend */
354                                 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
355                                 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
356                                 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
357                                 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
358
359                                 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
360                                 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
361                                 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
362                                 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
363
364                                 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
365                                 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
366                                 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
367                                 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
368
369                                 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
370                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
371                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
372                                 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
373
374                                 packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
375                                 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
376                                 
377                                 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
378
379                                 srcp += 2;
380                                 dstp += 2;
381                         }, width);
382                         srcp += srcskip;
383                         dstp += dstskip;
384                 }
385                 emms();
386         }
387 }
388
389 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
390 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
391 {
392         int width = info->d_width;
393         int height = info->d_height;
394         Uint32 *srcp = (Uint32 *)info->s_pixels;
395         int srcskip = info->s_skip >> 2;
396         Uint32 *dstp = (Uint32 *)info->d_pixels;
397         int dstskip = info->d_skip >> 2;
398         SDL_PixelFormat* sf = info->src;
399         Uint32 amask = sf->Amask;
400
401         pxor_r2r(mm6, mm6); /* 0 -> mm6 */
402         /* form multiplication mask */
403         movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
404         punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
405         pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
406         movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
407         pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
408         /* form channel masks */
409         movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
410         packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
411         packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
412         pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
413         /* get alpha channel shift */
414         __asm__ __volatile__ (
415                 "movd %0, %%mm5"
416                 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
417
418         while(height--) {
419             DUFFS_LOOP4({
420                 Uint32 alpha = *srcp & amask;
421                 /* FIXME: Here we special-case opaque alpha since the
422                         compositing used (>>8 instead of /255) doesn't handle
423                         it correctly. Also special-case alpha=0 for speed?
424                         Benchmark this! */
425                 if(alpha == 0) {
426                         /* do nothing */
427                 } else if(alpha == amask) {
428                         /* opaque alpha -- copy RGB, keep dst alpha */
429                         /* using MMX here to free up regular registers for other things */
430                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
431                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
432                         pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
433                         pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
434                         por_r2r(mm1, mm2); /* src | dst -> mm2 */
435                         movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
436                 } else {
437                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
438                         punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
439
440                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
441                         punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
442
443                         __asm__ __volatile__ (
444                                 "movd %0, %%mm4"
445                                 : : "r" (alpha) ); /* 0000A000 -> mm4 */
446                         psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
447                         punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
448                         punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
449                         pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
450
451                         /* blend */                 
452                         psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
453                         pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
454                         psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
455                         paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
456                         
457                         packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
458                         movd_r2m(mm2, *dstp);/* mm2 -> dst */
459                 }
460                 ++srcp;
461                 ++dstp;
462             }, width);
463             srcp += srcskip;
464             dstp += dstskip;
465         }
466         emms();
467 }
468 /* End GCC_ASMBLIT */
469
470 #elif MSVC_ASMBLIT
471 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
472 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
473 {
474         int width = info->d_width;
475         int height = info->d_height;
476         Uint32 *srcp = (Uint32 *)info->s_pixels;
477         int srcskip = info->s_skip >> 2;
478         Uint32 *dstp = (Uint32 *)info->d_pixels;
479         int dstskip = info->d_skip >> 2;
480         Uint32 dalpha = info->dst->Amask;
481
482         __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
483         
484         hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
485         lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
486         dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
487
488         while (height--) {
489                 int n = width;
490                 if ( n & 1 ) {
491                         Uint32 s = *srcp++;
492                         Uint32 d = *dstp;
493                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
494                                    + (s & d & 0x00010101)) | dalpha;
495                         n--;
496                 }
497                 
498                 for (n >>= 1; n > 0; --n) {
499                         dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
500                         dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
501
502                         src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
503                         src2 = src1; /* 2 x src -> src2(ARGBARGB) */
504
505                         dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
506                         src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
507                         src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
508                         src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
509
510                         dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
511                         dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
512                         dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
513                         dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
514                         
515                         *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
516                         dstp += 2;
517                         srcp += 2;
518                 }
519                 
520                 srcp += srcskip;
521                 dstp += dstskip;
522         }
523         _mm_empty();
524 }
525
526 /* fast RGB888->(A)RGB888 blending with surface alpha */
527 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
528 {
529         SDL_PixelFormat* df = info->dst;
530         Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
531         unsigned alpha = info->src->alpha;
532
533         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
534                         /* only call a128 version when R,G,B occupy lower bits */
535                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
536         } else {
537                 int width = info->d_width;
538                 int height = info->d_height;
539                 Uint32 *srcp = (Uint32 *)info->s_pixels;
540                 int srcskip = info->s_skip >> 2;
541                 Uint32 *dstp = (Uint32 *)info->d_pixels;
542                 int dstskip = info->d_skip >> 2;
543                 Uint32 dalpha = df->Amask;
544                 Uint32 amult;
545
546                 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
547                 
548                 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
549                 /* form the alpha mult */
550                 amult = alpha | (alpha << 8);
551                 amult = amult | (amult << 16);
552                 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
553                 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
554                 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
555                         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
556                 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
557                 
558                 while (height--) {
559                         int n = width;
560                         if (n & 1) {
561                                 /* One Pixel Blend */
562                                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
563                                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
564
565                                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
566                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
567
568                                 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
569                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
570                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
571                                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
572                                 
573                                 dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
574                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
575                                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
576
577                                 ++srcp;
578                                 ++dstp;
579                                 
580                                 n--;
581                         }
582
583                         for (n >>= 1; n > 0; --n) {
584                                 /* Two Pixels Blend */
585                                 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
586                                 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
587                                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
588                                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
589
590                                 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
591                                 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
592                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
593                                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
594
595                                 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
596                                 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
597                                 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
598                                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
599
600                                 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
601                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
602                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
603                                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
604                                 
605                                 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
606                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
607
608                                 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
609
610                                 srcp += 2;
611                                 dstp += 2;
612                         }
613                         srcp += srcskip;
614                         dstp += dstskip;
615                 }
616                 _mm_empty();
617         }
618 }
619
620 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
621 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
622 {
623         int width = info->d_width;
624         int height = info->d_height;
625         Uint32 *srcp = (Uint32 *)info->s_pixels;
626         int srcskip = info->s_skip >> 2;
627         Uint32 *dstp = (Uint32 *)info->d_pixels;
628         int dstskip = info->d_skip >> 2;
629         SDL_PixelFormat* sf = info->src;
630         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
631         Uint32 amask = sf->Amask;
632         Uint32 ashift = sf->Ashift;
633         Uint64 multmask;
634
635         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
636
637         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
638         multmask = ~(0xFFFFi64 << (ashift * 2));
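        /* ashift is the alpha position in bits within the packed pixel; after
           unpacking bytes to 16-bit words that position doubles, so this mask
           zeroes the alpha word of the multiplier and the blend below leaves
           the destination alpha untouched */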
639         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
640
641         while(height--) {
642                 DUFFS_LOOP4({
643                 Uint32 alpha = *srcp & amask;
644                 if (alpha == 0) {
645                         /* do nothing */
646                 } else if (alpha == amask) {
647                         /* opaque alpha -- copy RGB, keep dst alpha */
648                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
649                 } else {
650                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
651                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
652
653                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
654                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
655
656                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
657                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
658                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
659                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
660                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
661
662                         /* blend */                 
663                         src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
664                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
665                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
666                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
667                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
668                         
669                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
670                 }
671                 ++srcp;
672                 ++dstp;
673             }, width);
674             srcp += srcskip;
675             dstp += dstskip;
676         }
677         _mm_empty();
678 }
679 /* End MSVC_ASMBLIT */
680
681 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
682
683 #if SDL_ALTIVEC_BLITTERS
684 #if __MWERKS__
685 #pragma altivec_model on
686 #endif
687 #if HAVE_ALTIVEC_H
688 #include <altivec.h>
689 #endif
690 #include <assert.h>
691
692 #if (defined(__MACOSX__) && (__GNUC__ < 4))
693     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
694         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
695     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
696         (vector unsigned short) ( a,b,c,d,e,f,g,h )
697 #else
698     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
699         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
700     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
701         (vector unsigned short) { a,b,c,d,e,f,g,h }
702 #endif
703
704 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
705 #define VECPRINT(msg, v) do { \
706     vector unsigned int tmpvec = (vector unsigned int)(v); \
707     unsigned int *vp = (unsigned int *)&tmpvec; \
708     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
709 } while (0)
710
711 /* the permutation vector that takes the high bytes out of all the appropriate shorts
712     (vector unsigned char)(
713         0x00, 0x10, 0x02, 0x12,
714         0x04, 0x14, 0x06, 0x16,
715         0x08, 0x18, 0x0A, 0x1A,
716         0x0C, 0x1C, 0x0E, 0x1E );
717 */
718 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
719 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
720 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
721 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
722     ? vec_lvsl(0, src) \
723     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
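
/* VEC_U32_24() splats the constant 24, VEC_ALPHA_MASK() builds 0xFF000000 in
   every 32-bit element (the packed-alpha mask), and VEC_ALIGNER() yields the
   permute vector later used to merge two 16-byte loads into an aligned run of
   source pixels. */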
724
725    
726 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
727     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
728     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
729     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
730     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
731     /* valpha2 is 255-alpha */ \
732     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
733     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
734     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
735     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
736     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
737     /* add source and dest */ \
738     vtemp1 = vec_add(vtemp1, vtemp3); \
739     vtemp2 = vec_add(vtemp2, vtemp4); \
740     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
741     vtemp1 = vec_add(vtemp1, v1_16); \
742     vtemp3 = vec_sr(vtemp1, v8_16); \
743     vtemp1 = vec_add(vtemp1, vtemp3); \
744     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
745     vtemp2 = vec_add(vtemp2, v1_16); \
746     vtemp4 = vec_sr(vtemp2, v8_16); \
747     vtemp2 = vec_add(vtemp2, vtemp4); \
748     /* (>>8) and get ARGBARGBARGBARGB */ \
749     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
750 } while (0)
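
/* The (x + 1) + ((x + 1) >> 8) step above, combined with taking the high byte
   of each 16-bit sum via mergePermute, approximates dividing the blended
   products by 255. */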
751  
752 /* Calculate the permute vector used for 32->32 swizzling */
753 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
754                                   const SDL_PixelFormat *dstfmt)
755 {
756     /*
757      * We have to assume that the bits that aren't used by other
758      *  colors are alpha, and it's one complete byte, since some formats
759      *  leave alpha with a zero mask, but we should still swizzle the bits.
760      */
761     /* ARGB */
762     const static struct SDL_PixelFormat default_pixel_format = {
763         NULL, 0, 0,
764         0, 0, 0, 0,
765         16, 8, 0, 24,
766         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
767         0, 0};
768     if (!srcfmt) {
769         srcfmt = &default_pixel_format;
770     }
771     if (!dstfmt) {
772         dstfmt = &default_pixel_format;
773     }
774     const vector unsigned char plus = VECUINT8_LITERAL
775                                             ( 0x00, 0x00, 0x00, 0x00,
776                                               0x04, 0x04, 0x04, 0x04,
777                                               0x08, 0x08, 0x08, 0x08,
778                                               0x0C, 0x0C, 0x0C, 0x0C );
779     vector unsigned char vswiz;
780     vector unsigned int srcvec;
781 #define RESHIFT(X) (3 - ((X) >> 3))
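    /* RESHIFT turns a channel's bit shift into its byte index counted from the
       most significant byte, matching how vec_perm indexes bytes on the
       big-endian PowerPC targets this code is written for */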
782     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
783     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
784     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
785     Uint32 amask;
786     /* Use zero for alpha if either surface doesn't have alpha */
787     if (dstfmt->Amask) {
788         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
789     } else {
790         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
791     }
792 #undef RESHIFT  
793     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
794     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
795     return(vswiz);
796 }
797
798 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
799 {
800     int height = info->d_height;
801     Uint8 *src = (Uint8 *)info->s_pixels;
802     int srcskip = info->s_skip;
803     Uint8 *dst = (Uint8 *)info->d_pixels;
804     int dstskip = info->d_skip;
805     SDL_PixelFormat *srcfmt = info->src;
806
807     vector unsigned char v0 = vec_splat_u8(0);
808     vector unsigned short v8_16 = vec_splat_u16(8);
809     vector unsigned short v1_16 = vec_splat_u16(1);
810     vector unsigned short v2_16 = vec_splat_u16(2);
811     vector unsigned short v3_16 = vec_splat_u16(3);
812     vector unsigned int v8_32 = vec_splat_u32(8);
813     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
814     vector unsigned short v3f = VECUINT16_LITERAL(
815         0x003f, 0x003f, 0x003f, 0x003f,
816         0x003f, 0x003f, 0x003f, 0x003f);
817     vector unsigned short vfc = VECUINT16_LITERAL(
818         0x00fc, 0x00fc, 0x00fc, 0x00fc,
819         0x00fc, 0x00fc, 0x00fc, 0x00fc);
820
821     /* 
822         0x10 - 0x1f is the alpha
823         0x00 - 0x0e evens are the red
824         0x01 - 0x0f odds are zero
825     */
826     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
827         0x10, 0x00, 0x01, 0x01,
828         0x10, 0x02, 0x01, 0x01,
829         0x10, 0x04, 0x01, 0x01,
830         0x10, 0x06, 0x01, 0x01
831     );
832     vector unsigned char vredalpha2 = (vector unsigned char)(
833         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
834     );
835     /*
836         0x00 - 0x0f is ARxx ARxx ARxx ARxx
837         0x11 - 0x1f odds are blue
838     */
839     vector unsigned char vblue1 = VECUINT8_LITERAL(
840         0x00, 0x01, 0x02, 0x11,
841         0x04, 0x05, 0x06, 0x13,
842         0x08, 0x09, 0x0a, 0x15,
843         0x0c, 0x0d, 0x0e, 0x17
844     );
845     vector unsigned char vblue2 = (vector unsigned char)(
846         vec_add((vector unsigned int)vblue1, v8_32)
847     );
848     /*
849         0x00 - 0x0f is ARxB ARxB ARxB ARxB
850         0x10 - 0x0e evens are green
851         0x10 - 0x1e evens are green
852     vector unsigned char vgreen1 = VECUINT8_LITERAL(
853         0x00, 0x01, 0x10, 0x03,
854         0x04, 0x05, 0x12, 0x07,
855         0x08, 0x09, 0x14, 0x0b,
856         0x0c, 0x0d, 0x16, 0x0f
857     );
858     vector unsigned char vgreen2 = (vector unsigned char)(
859         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
860     );
861     vector unsigned char vgmerge = VECUINT8_LITERAL(
862         0x00, 0x02, 0x00, 0x06,
863         0x00, 0x0a, 0x00, 0x0e,
864         0x00, 0x12, 0x00, 0x16,
865         0x00, 0x1a, 0x00, 0x1e);
866     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
867     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
868     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
869
870     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
871     vf800 = vec_sl(vf800, vec_splat_u16(8));
872
873     while(height--) {
874         int extrawidth;
875         vector unsigned char valigner;
876         vector unsigned char vsrc;
877         vector unsigned char voverflow;
878         int width = info->d_width;
879
880 #define ONE_PIXEL_BLEND(condition, widthvar) \
881         while (condition) { \
882             Uint32 Pixel; \
883             unsigned sR, sG, sB, dR, dG, dB, sA; \
884             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
885             if(sA) { \
886                 unsigned short dstpixel = *((unsigned short *)dst); \
887                 dR = (dstpixel >> 8) & 0xf8; \
888                 dG = (dstpixel >> 3) & 0xfc; \
889                 dB = (dstpixel << 3) & 0xf8; \
890                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
891                 *((unsigned short *)dst) = ( \
892                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
893                 ); \
894             } \
895             src += 4; \
896             dst += 2; \
897             widthvar--; \
898         }
899         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
900         extrawidth = (width % 8);
901         valigner = VEC_ALIGNER(src);
902         vsrc = (vector unsigned char)vec_ld(0, src);
903         width -= extrawidth;
904         while (width) {
905             vector unsigned char valpha;
906             vector unsigned char vsrc1, vsrc2;
907             vector unsigned char vdst1, vdst2;
908             vector unsigned short vR, vG, vB;
909             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
910
911             /* Load 8 pixels from src as ARGB */
912             voverflow = (vector unsigned char)vec_ld(15, src);
913             vsrc = vec_perm(vsrc, voverflow, valigner);
914             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
915             src += 16;
916             vsrc = (vector unsigned char)vec_ld(15, src);
917             voverflow = vec_perm(voverflow, vsrc, valigner);
918             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
919             src += 16;
920
921             /* Load 8 pixels from dst as XRGB */
922             voverflow = vec_ld(0, dst);
923             vR = vec_and((vector unsigned short)voverflow, vf800);
924             vB = vec_sl((vector unsigned short)voverflow, v3_16);
925             vG = vec_sl(vB, v2_16);
926             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
927             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
928             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
929             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
930             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
931             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
932
933             /* Alpha blend 8 pixels as ARGB */
934             valpha = vec_perm(vsrc1, v0, valphaPermute);
935             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
936             valpha = vec_perm(vsrc2, v0, valphaPermute);
937             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
938
939             /* Convert 8 pixels to 565 */
940             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
941             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
942             vgpixel = vec_and(vgpixel, vfc);
943             vgpixel = vec_sl(vgpixel, v3_16);
944             vrpixel = vec_sl(vpixel, v1_16);
945             vrpixel = vec_and(vrpixel, vf800);
946             vbpixel = vec_and(vpixel, v3f);
947             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
948             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
949             
950             /* Store 8 pixels */
951             vec_st(vdst1, 0, dst);
952
953             width -= 8;
954             dst += 16;
955         }
956         ONE_PIXEL_BLEND((extrawidth), extrawidth);
957 #undef ONE_PIXEL_BLEND
958         src += srcskip;
959         dst += dstskip;
960     }
961 }
962
963 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
964 {
965     unsigned alpha = info->src->alpha;
966     int height = info->d_height;
967     Uint32 *srcp = (Uint32 *)info->s_pixels;
968     int srcskip = info->s_skip >> 2;
969     Uint32 *dstp = (Uint32 *)info->d_pixels;
970     int dstskip = info->d_skip >> 2;
971     SDL_PixelFormat *srcfmt = info->src;
972     SDL_PixelFormat *dstfmt = info->dst;
973     unsigned sA = srcfmt->alpha;
974     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
975     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
976     Uint32 ckey = info->src->colorkey;
977     vector unsigned char mergePermute;
978     vector unsigned char vsrcPermute;
979     vector unsigned char vdstPermute;
980     vector unsigned char vsdstPermute;
981     vector unsigned char valpha;
982     vector unsigned char valphamask;
983     vector unsigned char vbits;
984     vector unsigned char v0;
985     vector unsigned short v1;
986     vector unsigned short v8;
987     vector unsigned int vckey;
988     vector unsigned int vrgbmask;
989
990     mergePermute = VEC_MERGE_PERMUTE();
991     v0 = vec_splat_u8(0);
992     v1 = vec_splat_u16(1);
993     v8 = vec_splat_u16(8);
994
995     /* set the alpha to 255 on the destination surf */
996     valphamask = VEC_ALPHA_MASK();
997
998     vsrcPermute = calc_swizzle32(srcfmt, NULL);
999     vdstPermute = calc_swizzle32(NULL, dstfmt);
1000     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1001
1002     /* set a vector full of alpha and 255-alpha */
1003     ((unsigned char *)&valpha)[0] = alpha;
1004     valpha = vec_splat(valpha, 0);
1005     vbits = (vector unsigned char)vec_splat_s8(-1);
1006
1007     ckey &= rgbmask;
1008     ((unsigned int *)(char*)&vckey)[0] = ckey;
1009     vckey = vec_splat(vckey, 0);
1010     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1011     vrgbmask = vec_splat(vrgbmask, 0);
1012
1013     while(height--) {
1014         int width = info->d_width;
1015 #define ONE_PIXEL_BLEND(condition, widthvar) \
1016         while (condition) { \
1017             Uint32 Pixel; \
1018             unsigned sR, sG, sB, dR, dG, dB; \
1019             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1020             if(sA && Pixel != ckey) { \
1021                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1022                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1023                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1024                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1025             } \
1026             dstp++; \
1027             srcp++; \
1028             widthvar--; \
1029         }
1030         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1031         if (width > 0) {
1032             int extrawidth = (width % 4);
1033             vector unsigned char valigner = VEC_ALIGNER(srcp);
1034             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1035             width -= extrawidth;
1036             while (width) {
1037                 vector unsigned char vsel;
1038                 vector unsigned char voverflow;
1039                 vector unsigned char vd;
1040                 vector unsigned char vd_orig;
1041
1042                 /* s = *srcp */
1043                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1044                 vs = vec_perm(vs, voverflow, valigner);
1045                 
1046                 /* vsel is set for items that match the key */
1047                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1048                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1049
1050                 /* permute to source format */
1051                 vs = vec_perm(vs, valpha, vsrcPermute);
1052
1053                 /* d = *dstp */
1054                 vd = (vector unsigned char)vec_ld(0, dstp);
1055                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1056
1057                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1058
1059                 /* set the alpha channel to full on */
1060                 vd = vec_or(vd, valphamask);
1061
1062                 /* mask out color key */
1063                 vd = vec_sel(vd, vd_orig, vsel);
1064                 
1065                 /* permute to dest format */
1066                 vd = vec_perm(vd, vbits, vdstPermute);
1067
1068                 /* *dstp = res */
1069                 vec_st((vector unsigned int)vd, 0, dstp);
1070                 
1071                 srcp += 4;
1072                 dstp += 4;
1073                 width -= 4;
1074                 vs = voverflow;
1075             }
1076             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1077         }
1078 #undef ONE_PIXEL_BLEND
1079  
1080         srcp += srcskip;
1081         dstp += dstskip;
1082     }
1083 }
1084
1085
1086 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1087 {
1088     int width = info->d_width;
1089     int height = info->d_height;
1090     Uint32 *srcp = (Uint32 *)info->s_pixels;
1091     int srcskip = info->s_skip >> 2;
1092     Uint32 *dstp = (Uint32 *)info->d_pixels;
1093     int dstskip = info->d_skip >> 2;
1094     SDL_PixelFormat *srcfmt = info->src;
1095     SDL_PixelFormat *dstfmt = info->dst;
1096     vector unsigned char mergePermute;
1097     vector unsigned char valphaPermute;
1098     vector unsigned char vsrcPermute;
1099     vector unsigned char vdstPermute;
1100     vector unsigned char vsdstPermute;
1101     vector unsigned char valphamask;
1102     vector unsigned char vpixelmask;
1103     vector unsigned char v0;
1104     vector unsigned short v1;
1105     vector unsigned short v8;
1106
1107     v0 = vec_splat_u8(0);
1108     v1 = vec_splat_u16(1);
1109     v8 = vec_splat_u16(8);
1110     mergePermute = VEC_MERGE_PERMUTE();
1111     valphamask = VEC_ALPHA_MASK();
1112     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1113     vpixelmask = vec_nor(valphamask, v0);
1114     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1115     vdstPermute = calc_swizzle32(NULL, dstfmt);
1116     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1117
1118         while ( height-- ) {
1119         width = info->d_width;
1120 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1121             Uint32 Pixel; \
1122             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1123             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1124             if(sA) { \
1125               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1126               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1127               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1128             } \
1129             ++srcp; \
1130             ++dstp; \
1131             widthvar--; \
1132         }
1133         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1134         if (width > 0) {
1135             /* vsrcPermute */
1136             /* vdstPermute */
1137             int extrawidth = (width % 4);
1138             vector unsigned char valigner = VEC_ALIGNER(srcp);
1139             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1140             width -= extrawidth;
1141             while (width) {
1142                 vector unsigned char voverflow;
1143                 vector unsigned char vd;
1144                 vector unsigned char valpha;
1145                 vector unsigned char vdstalpha;
1146                 /* s = *srcp */
1147                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1148                 vs = vec_perm(vs, voverflow, valigner);
1149                 vs = vec_perm(vs, v0, vsrcPermute);
1150
1151                 valpha = vec_perm(vs, v0, valphaPermute);
1152                 
1153                 /* d = *dstp */
1154                 vd = (vector unsigned char)vec_ld(0, dstp);
1155                 vd = vec_perm(vd, v0, vsdstPermute);
1156                 vdstalpha = vec_and(vd, valphamask);
1157
1158                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1159
1160                 /* set the alpha to the dest alpha */
1161                 vd = vec_and(vd, vpixelmask);
1162                 vd = vec_or(vd, vdstalpha);
1163                 vd = vec_perm(vd, v0, vdstPermute);
1164
1165                 /* *dstp = res */
1166                 vec_st((vector unsigned int)vd, 0, dstp);
1167                 
1168                 srcp += 4;
1169                 dstp += 4;
1170                 width -= 4;
1171                 vs = voverflow;
1172
1173             }
1174             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1175         }
1176             srcp += srcskip;
1177             dstp += dstskip;
1178 #undef ONE_PIXEL_BLEND
1179         }
1180 }
1181
1182 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1183 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1184 {
1185         int width = info->d_width;
1186         int height = info->d_height;
1187         Uint32 *srcp = (Uint32 *)info->s_pixels;
1188         int srcskip = info->s_skip >> 2;
1189         Uint32 *dstp = (Uint32 *)info->d_pixels;
1190         int dstskip = info->d_skip >> 2;
1191     vector unsigned char mergePermute;
1192     vector unsigned char valphaPermute;
1193     vector unsigned char valphamask;
1194     vector unsigned char vpixelmask;
1195     vector unsigned char v0;
1196     vector unsigned short v1;
1197     vector unsigned short v8;
1198     v0 = vec_splat_u8(0);
1199     v1 = vec_splat_u16(1);
1200     v8 = vec_splat_u16(8);
1201     mergePermute = VEC_MERGE_PERMUTE();
1202     valphamask = VEC_ALPHA_MASK();
1203     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1204     
1205  
1206     vpixelmask = vec_nor(valphamask, v0);
1207         while(height--) {
1208         width = info->d_width;
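        /* scalar per-pixel fallback used for the unaligned head and the tail
           of each row: blend R and B together in one 32-bit register (the
           0xff00ff mask) and G separately, keeping the destination alpha byte */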
1209 #define ONE_PIXEL_BLEND(condition, widthvar) \
1210         while ((condition)) { \
1211             Uint32 dalpha; \
1212             Uint32 d; \
1213             Uint32 s1; \
1214             Uint32 d1; \
1215             Uint32 s = *srcp; \
1216             Uint32 alpha = s >> 24; \
1217             if(alpha) { \
1218               if(alpha == SDL_ALPHA_OPAQUE) { \
1219                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1220               } else { \
1221                 d = *dstp; \
1222                 dalpha = d & 0xff000000; \
1223                 s1 = s & 0xff00ff; \
1224                 d1 = d & 0xff00ff; \
1225                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1226                 s &= 0xff00; \
1227                 d &= 0xff00; \
1228                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1229                 *dstp = d1 | d | dalpha; \
1230               } \
1231             } \
1232             ++srcp; \
1233             ++dstp; \
1234             widthvar--; \
1235             }
1236         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1237         if (width > 0) {
1238             int extrawidth = (width % 4);
1239             vector unsigned char valigner = VEC_ALIGNER(srcp);
1240             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1241             width -= extrawidth;
1242             while (width) {
1243                 vector unsigned char voverflow;
1244                 vector unsigned char vd;
1245                 vector unsigned char valpha;
1246                 vector unsigned char vdstalpha;
1247                 /* s = *srcp */
1248                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1249                 vs = vec_perm(vs, voverflow, valigner);
1250
1251                 valpha = vec_perm(vs, v0, valphaPermute);
1252                 
1253                 /* d = *dstp */
1254                 vd = (vector unsigned char)vec_ld(0, dstp);
1255                 vdstalpha = vec_and(vd, valphamask);
1256
1257                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1258
1259                 /* set the alpha to the dest alpha */
1260                 vd = vec_and(vd, vpixelmask);
1261                 vd = vec_or(vd, vdstalpha);
1262
1263                 /* *dstp = res */
1264                 vec_st((vector unsigned int)vd, 0, dstp);
1265                 
1266                 srcp += 4;
1267                 dstp += 4;
1268                 width -= 4;
1269                 vs = voverflow;
1270             }
1271             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1272         }
1273             srcp += srcskip;
1274             dstp += dstskip;
1275         }
1276 #undef ONE_PIXEL_BLEND
1277 }
1278
1279 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1280 {
1281     /* XXX : 6 */
1282         unsigned alpha = info->src->alpha;
1283     int height = info->d_height;
1284     Uint32 *srcp = (Uint32 *)info->s_pixels;
1285     int srcskip = info->s_skip >> 2;
1286     Uint32 *dstp = (Uint32 *)info->d_pixels;
1287     int dstskip = info->d_skip >> 2;
1288     SDL_PixelFormat *srcfmt = info->src;
1289     SDL_PixelFormat *dstfmt = info->dst;
1290         unsigned sA = srcfmt->alpha;
1291         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1292     vector unsigned char mergePermute;
1293     vector unsigned char vsrcPermute;
1294     vector unsigned char vdstPermute;
1295     vector unsigned char vsdstPermute;
1296     vector unsigned char valpha;
1297     vector unsigned char valphamask;
1298     vector unsigned char vbits;
1299     vector unsigned short v1;
1300     vector unsigned short v8;
1301
1302     mergePermute = VEC_MERGE_PERMUTE();
1303     v1 = vec_splat_u16(1);
1304     v8 = vec_splat_u16(8);
1305
1306     /* set the alpha to 255 on the destination surf */
1307     valphamask = VEC_ALPHA_MASK();
1308
1309     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1310     vdstPermute = calc_swizzle32(NULL, dstfmt);
1311     vsdstPermute = calc_swizzle32(dstfmt, NULL);
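    /* calc_swizzle32() (defined earlier in this AltiVec section) builds
       vec_perm control vectors that shuffle an arbitrary 32-bit channel
       layout into a canonical order and back, so this blitter works for any
       RGBA byte ordering; judging by the vec_perm(vs, valpha, vsrcPermute)
       below, the source permute also pulls the splatted surface alpha into
       the alpha lanes. */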
1312
1313     /* set a vector full of alpha */
1314     ((unsigned char *)&valpha)[0] = alpha;
1315     valpha = vec_splat(valpha, 0);
1316     vbits = (vector unsigned char)vec_splat_s8(-1);
1317
1318     while(height--) {
1319         int width = info->d_width;
1320 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1321             Uint32 Pixel; \
1322             unsigned sR, sG, sB, dR, dG, dB; \
1323             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1324             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1325             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1326             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1327             ++srcp; \
1328             ++dstp; \
1329             widthvar--; \
1330         }
1331         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1332         if (width > 0) {
1333             int extrawidth = (width % 4);
1334             vector unsigned char valigner = VEC_ALIGNER(srcp);
1335             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1336             width -= extrawidth;
1337             while (width) {
1338                 vector unsigned char voverflow;
1339                 vector unsigned char vd;
1340
1341                 /* s = *srcp */
1342                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1343                 vs = vec_perm(vs, voverflow, valigner);
1344                 vs = vec_perm(vs, valpha, vsrcPermute);
1345                 
1346                 /* d = *dstp */
1347                 vd = (vector unsigned char)vec_ld(0, dstp);
1348                 vd = vec_perm(vd, vd, vsdstPermute);
1349
1350                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1351
1352                 /* set the alpha channel to full on */
1353                 vd = vec_or(vd, valphamask);
1354                 vd = vec_perm(vd, vbits, vdstPermute);
1355
1356                 /* *dstp = res */
1357                 vec_st((vector unsigned int)vd, 0, dstp);
1358                 
1359                 srcp += 4;
1360                 dstp += 4;
1361                 width -= 4;
1362                 vs = voverflow;
1363             }
1364             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1365         }
1366 #undef ONE_PIXEL_BLEND
1367  
1368         srcp += srcskip;
1369         dstp += dstskip;
1370     }
1371
1372 }
1373
1374
1375 /* fast RGB888->(A)RGB888 blending */
1376 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1377 {
1378         unsigned alpha = info->src->alpha;
1379     int height = info->d_height;
1380     Uint32 *srcp = (Uint32 *)info->s_pixels;
1381     int srcskip = info->s_skip >> 2;
1382     Uint32 *dstp = (Uint32 *)info->d_pixels;
1383     int dstskip = info->d_skip >> 2;
1384     vector unsigned char mergePermute;
1385     vector unsigned char valpha;
1386     vector unsigned char valphamask;
1387     vector unsigned short v1;
1388     vector unsigned short v8;
1389
1390     mergePermute = VEC_MERGE_PERMUTE();
1391     v1 = vec_splat_u16(1);
1392     v8 = vec_splat_u16(8);
1393
1394     /* set the alpha to 255 on the destination surf */
1395     valphamask = VEC_ALPHA_MASK();
1396
1397     /* set a vector full of alpha */
1398     ((unsigned char *)&valpha)[0] = alpha;
1399     valpha = vec_splat(valpha, 0);
1400
1401     while(height--) {
1402         int width = info->d_width;
1403 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1404             Uint32 s = *srcp; \
1405             Uint32 d = *dstp; \
1406             Uint32 s1 = s & 0xff00ff; \
1407             Uint32 d1 = d & 0xff00ff; \
1408             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1409                  & 0xff00ff; \
1410             s &= 0xff00; \
1411             d &= 0xff00; \
1412             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1413             *dstp = d1 | d | 0xff000000; \
1414             ++srcp; \
1415             ++dstp; \
1416             widthvar--; \
1417         }
1418         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1419         if (width > 0) {
1420             int extrawidth = (width % 4);
1421             vector unsigned char valigner = VEC_ALIGNER(srcp);
1422             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1423             width -= extrawidth;
1424             while (width) {
1425                 vector unsigned char voverflow;
1426                 vector unsigned char vd;
1427
1428                 /* s = *srcp */
1429                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1430                 vs = vec_perm(vs, voverflow, valigner);
1431                 
1432                 /* d = *dstp */
1433                 vd = (vector unsigned char)vec_ld(0, dstp);
1434
1435                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1436
1437                 /* set the alpha channel to full on */
1438                 vd = vec_or(vd, valphamask);
1439
1440                 /* *dstp = res */
1441                 vec_st((vector unsigned int)vd, 0, dstp);
1442                 
1443                 srcp += 4;
1444                 dstp += 4;
1445                 width -= 4;
1446                 vs = voverflow;
1447             }
1448             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1449         }
1450 #undef ONE_PIXEL_BLEND
1451  
1452         srcp += srcskip;
1453         dstp += dstskip;
1454     }
1455 }
1456 #if __MWERKS__
1457 #pragma altivec_model off
1458 #endif
1459 #endif /* SDL_ALTIVEC_BLITTERS */
1460
1461 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
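/* Per channel this computes floor((s + d) / 2): masking the low bit of each
   byte with 0xfe keeps the three per-channel sums from carrying into their
   neighbours within a single 32-bit add, and the (s & d & 0x00010101) term
   adds back the 1 lost when both low bits were set, since
     floor((s + d) / 2) == ((s & ~1) + (d & ~1)) / 2 + (s & d & 1)
   e.g. s = 0x05, d = 0x03:  (4 + 2) / 2 + 1 == 4 == floor(8 / 2). */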
1462 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1463 {
1464         int width = info->d_width;
1465         int height = info->d_height;
1466         Uint32 *srcp = (Uint32 *)info->s_pixels;
1467         int srcskip = info->s_skip >> 2;
1468         Uint32 *dstp = (Uint32 *)info->d_pixels;
1469         int dstskip = info->d_skip >> 2;
1470
1471         while(height--) {
1472             DUFFS_LOOP4({
1473                     Uint32 s = *srcp++;
1474                     Uint32 d = *dstp;
1475                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1476                                + (s & d & 0x00010101)) | 0xff000000;
1477             }, width);
1478             srcp += srcskip;
1479             dstp += dstskip;
1480         }
1481 }
1482
1483 /* fast RGB888->(A)RGB888 blending with surface alpha */
1484 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1485 {
1486         unsigned alpha = info->src->alpha;
1487         if(alpha == 128) {
1488                 BlitRGBtoRGBSurfaceAlpha128(info);
1489         } else {
1490                 int width = info->d_width;
1491                 int height = info->d_height;
1492                 Uint32 *srcp = (Uint32 *)info->s_pixels;
1493                 int srcskip = info->s_skip >> 2;
1494                 Uint32 *dstp = (Uint32 *)info->d_pixels;
1495                 int dstskip = info->d_skip >> 2;
1496                 Uint32 s;
1497                 Uint32 d;
1498                 Uint32 s1;
1499                 Uint32 d1;
1500
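                /* In the two-pixel branch below, red and blue of each pixel
                   are blended in parallel via the 0xff00ff masks, while the
                   green channels of the two adjacent pixels are packed into
                   one 32-bit word (bits 0-7 and 16-23) so both greens are
                   blended with a single multiply as well. */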
1501                 while(height--) {
1502                         DUFFS_LOOP_DOUBLE2({
1503                                 /* One Pixel Blend */
1504                                 s = *srcp;
1505                                 d = *dstp;
1506                                 s1 = s & 0xff00ff;
1507                                 d1 = d & 0xff00ff;
1508                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1509                                      & 0xff00ff;
1510                                 s &= 0xff00;
1511                                 d &= 0xff00;
1512                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1513                                 *dstp = d1 | d | 0xff000000;
1514                                 ++srcp;
1515                                 ++dstp;
1516                         },{
1517                                 /* Two Pixels Blend */
1518                                 s = *srcp;
1519                                 d = *dstp;
1520                                 s1 = s & 0xff00ff;
1521                                 d1 = d & 0xff00ff;
1522                                 d1 += (s1 - d1) * alpha >> 8;
1523                                 d1 &= 0xff00ff;
1524                                      
1525                                 s = ((s & 0xff00) >> 8) | 
1526                                         ((srcp[1] & 0xff00) << 8);
1527                                 d = ((d & 0xff00) >> 8) |
1528                                         ((dstp[1] & 0xff00) << 8);
1529                                 d += (s - d) * alpha >> 8;
1530                                 d &= 0x00ff00ff;
1531                                 
1532                                 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1533                                 ++srcp;
1534                                 
1535                                 s1 = *srcp;
1536                                 d1 = *dstp;
1537                                 s1 &= 0xff00ff;
1538                                 d1 &= 0xff00ff;
1539                                 d1 += (s1 - d1) * alpha >> 8;
1540                                 d1 &= 0xff00ff;
1541                                 
1542                                 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1543                                 ++srcp;
1544                                 ++dstp;
1545                         }, width);
1546                         srcp += srcskip;
1547                         dstp += dstskip;
1548                 }
1549         }
1550 }
1551
1552 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1553 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1554 {
1555         int width = info->d_width;
1556         int height = info->d_height;
1557         Uint32 *srcp = (Uint32 *)info->s_pixels;
1558         int srcskip = info->s_skip >> 2;
1559         Uint32 *dstp = (Uint32 *)info->d_pixels;
1560         int dstskip = info->d_skip >> 2;
1561
1562         while(height--) {
1563             DUFFS_LOOP4({
1564                 Uint32 dalpha;
1565                 Uint32 d;
1566                 Uint32 s1;
1567                 Uint32 d1;
1568                 Uint32 s = *srcp;
1569                 Uint32 alpha = s >> 24;
1570                 /* FIXME: Here we special-case opaque alpha since the
1571                    compositing used (>>8 instead of /255) doesn't handle
1572                    it correctly. Also special-case alpha=0 for speed?
1573                    Benchmark this! */
1574                 if(alpha) {   
1575                   if(alpha == SDL_ALPHA_OPAQUE) {
1576                     *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1577                   } else {
1578                     /*
1579                      * take out the middle component (green), and process
1580                      * the other two in parallel. One multiply less.
1581                      */
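                    /* e.g. alpha = 0x80, s = 0x00ff00ff, d = 0x00000000:
                       s1 = 0x00ff00ff, d1 = 0, (s1 - d1) * alpha = 0x7f807f80,
                       >> 8 gives 0x007f807f, & 0xff00ff gives 0x007f007f, so
                       red and blue both end up at 0x7f from one multiply. */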
1582                     d = *dstp;
1583                     dalpha = d & 0xff000000;
1584                     s1 = s & 0xff00ff;
1585                     d1 = d & 0xff00ff;
1586                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1587                     s &= 0xff00;
1588                     d &= 0xff00;
1589                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1590                     *dstp = d1 | d | dalpha;
1591                   }
1592                 }
1593                 ++srcp;
1594                 ++dstp;
1595             }, width);
1596             srcp += srcskip;
1597             dstp += dstskip;
1598         }
1599 }
1600
1601 #if GCC_ASMBLIT
1602 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1603 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1604 {
1605         int width = info->d_width;
1606         int height = info->d_height;
1607         Uint32 *srcp = (Uint32 *)info->s_pixels;
1608         int srcskip = info->s_skip >> 2;
1609         Uint32 *dstp = (Uint32 *)info->d_pixels;
1610         int dstskip = info->d_skip >> 2;
1611         SDL_PixelFormat* sf = info->src;
1612         Uint32 amask = sf->Amask;
1613
1614         __asm__ (
1615         /* make mm6 all zeros. */
1616         "pxor       %%mm6, %%mm6\n"
1617         
1618         /* Make a mask to preserve the alpha. */
1619         "movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1620         "punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1621         "pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1622         "movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1623         "pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1624
1625         /* form channel masks */
1626         "movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1627         "packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1628         "packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1629         "pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1630         
1631         /* get alpha channel shift */
1632         "movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1633
1634           : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1635
1636         while(height--) {
1637
1638             DUFFS_LOOP4({
1639                 Uint32 alpha;
1640
1641                 __asm__ (
1642                 "prefetch 64(%0)\n"
1643                 "prefetch 64(%1)\n"
1644                         : : "r" (srcp), "r" (dstp) );
1645
1646                 alpha = *srcp & amask;
1647                 /* FIXME: Here we special-case opaque alpha since the
1648                    compositing used (>>8 instead of /255) doesn't handle
1649                    it correctly. Also special-case alpha=0 for speed?
1650                    Benchmark this! */
1651                 if(alpha == 0) {
1652                     /* do nothing */
1653                 }
1654                 else if(alpha == amask) {
1655                         /* opaque alpha -- copy RGB, keep dst alpha */
1656                     /* using MMX here to free up regular registers for other things */
1657                             __asm__ (
1658                     "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1659                     "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1660                     "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1661                     "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1662                     "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1663                     "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1664
1665                      : : "r" (srcp), "r" (dstp) );
1666                 } 
1667
1668                 else {
1669                             __asm__ (
1670                     /* load in the source, and dst. */
1671                     "movd      (%0), %%mm0\n"               /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1672                     "movd      (%1), %%mm1\n"               /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1673
1674                     /* Move the src alpha into mm2 */
1675
1676                     /* if supporting pshufw */
1677                     /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1678                     /*"psrlw     $8, %%mm2\n" */
1679                     
1680                     /* else: */
1681                     "movd       %2,    %%mm2\n"
1682                     "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1683                     "punpcklwd  %%mm2, %%mm2\n"             /* mm2 = 0 0 0 0 |  0 As  0  As */
1684                     "punpckldq  %%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1685                     "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1686
1687                     /* move the colors into words. */
1688                     "punpcklbw %%mm6, %%mm0\n"              /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1689                     "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1690
1691                     /* src - dst */
1692                     "psubw    %%mm1, %%mm0\n"               /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1693
1694                     /* A * (src-dst) */
1695                     "pmullw    %%mm2, %%mm0\n"              /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1696                     "psrlw     $8,    %%mm0\n"              /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1697                     "paddb     %%mm1, %%mm0\n"              /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1698
1699                     "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1700                     
1701                     "movd      %%mm0, (%1)\n"               /* result in mm0 */
1702
1703                      : : "r" (srcp), "r" (dstp), "r" (alpha) );
1704
1705                 }
1706                 ++srcp;
1707                 ++dstp;
1708             }, width);
1709             srcp += srcskip;
1710             dstp += dstskip;
1711         }
1712
1713         __asm__ (
1714         "emms\n"
1715                 :   );
1716 }
1717 /* End GCC_ASMBLIT*/
1718
1719 #elif MSVC_ASMBLIT
1720 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1721 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1722 {
1723         int width = info->d_width;
1724         int height = info->d_height;
1725         Uint32 *srcp = (Uint32 *)info->s_pixels;
1726         int srcskip = info->s_skip >> 2;
1727         Uint32 *dstp = (Uint32 *)info->d_pixels;
1728         int dstskip = info->d_skip >> 2;
1729         SDL_PixelFormat* sf = info->src;
1730         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1731         Uint32 amask = sf->Amask;
1732         Uint32 ashift = sf->Ashift;
1733         Uint64 multmask;
1734         
1735         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1736
1737         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1738         multmask = ~(0xFFFFi64 << (ashift * 2));
1739         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
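        /* For the usual ARGB8888 layout (ashift == 24) multmask becomes
           ~(0xFFFFi64 << 48) == 0x0000FFFFFFFFFFFF, so ANDing the splatted
           alpha with dmask zeroes the multiplier in the alpha word and the
           destination alpha passes through the blend unchanged. */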
1740
1741         while(height--) {
1742             DUFFS_LOOP4({
1743                 Uint32 alpha;
1744
1745                 _m_prefetch(srcp + 16);
1746                 _m_prefetch(dstp + 16);
1747
1748                 alpha = *srcp & amask;
1749                 if (alpha == 0) {
1750                         /* do nothing */
1751                 } else if (alpha == amask) {
1752                         /* copy RGB, keep dst alpha */
1753                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1754                 } else {
1755                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1756                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1757
1758                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1759                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1760
1761                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1762                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1763                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1764                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1765                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1766
1767                         /* blend */                 
1768                         src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1769                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1770                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1771                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1772                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1773                         
1774                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1775                 }
1776                 ++srcp;
1777                 ++dstp;
1778             }, width);
1779             srcp += srcskip;
1780             dstp += dstskip;
1781         }
1782         _mm_empty();
1783 }
1784 /* End MSVC_ASMBLIT */
1785
1786 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1787
1788 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1789
1790 /* blend a single 16 bit pixel at 50% */
1791 #define BLEND16_50(d, s, mask)                                          \
1792         ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1793
1794 /* blend two 16 bit pixels at 50% */
1795 #define BLEND2x16_50(d, s, mask)                                             \
1796         (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1797          + (s & d & (~(mask | mask << 16))))
1798
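/* The masks passed to Blit16to16SurfaceAlpha128() clear the least significant
   bit of each colour field, mirroring the 0x00fefefe trick used for 32-bit
   pixels above: 0xf7de for RGB565 (bits 0, 5 and 11 cleared) and 0xfbde for
   RGB555 (bits 0, 5 and 10 cleared), so one 32-bit add can average two packed
   pixels without the fields carrying into each other. */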
1799 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1800 {
1801         int width = info->d_width;
1802         int height = info->d_height;
1803         Uint16 *srcp = (Uint16 *)info->s_pixels;
1804         int srcskip = info->s_skip >> 1;
1805         Uint16 *dstp = (Uint16 *)info->d_pixels;
1806         int dstskip = info->d_skip >> 1;
1807
1808         while(height--) {
1809                 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1810                         /*
1811                          * Source and destination not aligned, pipeline it.
1812                          * This is mostly a win for big blits but no loss for
1813                          * small ones
1814                          */
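                        /* Because srcp and dstp differ by one halfword here,
                           each aligned 32-bit source read supplies one
                           halfword for the current destination word and one
                           for the next; prev_sw carries that leftover
                           halfword across iterations. */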
1815                         Uint32 prev_sw;
1816                         int w = width;
1817
1818                         /* handle odd destination */
1819                         if((uintptr_t)dstp & 2) {
1820                                 Uint16 d = *dstp, s = *srcp;
1821                                 *dstp = BLEND16_50(d, s, mask);
1822                                 dstp++;
1823                                 srcp++;
1824                                 w--;
1825                         }
1826                         srcp++; /* srcp is now 32-bit aligned */
1827
1828                         /* bootstrap pipeline with first halfword */
1829                         prev_sw = ((Uint32 *)srcp)[-1];
1830
1831                         while(w > 1) {
1832                                 Uint32 sw, dw, s;
1833                                 sw = *(Uint32 *)srcp;
1834                                 dw = *(Uint32 *)dstp;
1835 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1836                                 s = (prev_sw << 16) + (sw >> 16);
1837 #else
1838                                 s = (prev_sw >> 16) + (sw << 16);
1839 #endif
1840                                 prev_sw = sw;
1841                                 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1842                                 dstp += 2;
1843                                 srcp += 2;
1844                                 w -= 2;
1845                         }
1846
1847                         /* final pixel if any */
1848                         if(w) {
1849                                 Uint16 d = *dstp, s;
1850 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1851                                 s = (Uint16)prev_sw;
1852 #else
1853                                 s = (Uint16)(prev_sw >> 16);
1854 #endif
1855                                 *dstp = BLEND16_50(d, s, mask);
1856                                 srcp++;
1857                                 dstp++;
1858                         }
1859                         srcp += srcskip - 1;
1860                         dstp += dstskip;
1861                 } else {
1862                         /* source and destination are aligned */
1863                         int w = width;
1864
1865                         /* first odd pixel? */
1866                         if((uintptr_t)srcp & 2) {
1867                                 Uint16 d = *dstp, s = *srcp;
1868                                 *dstp = BLEND16_50(d, s, mask);
1869                                 srcp++;
1870                                 dstp++;
1871                                 w--;
1872                         }
1873                         /* srcp and dstp are now 32-bit aligned */
1874
1875                         while(w > 1) {
1876                                 Uint32 sw = *(Uint32 *)srcp;
1877                                 Uint32 dw = *(Uint32 *)dstp;
1878                                 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1879                                 srcp += 2;
1880                                 dstp += 2;
1881                                 w -= 2;
1882                         }
1883
1884                         /* last odd pixel? */
1885                         if(w) {
1886                                 Uint16 d = *dstp, s = *srcp;
1887                                 *dstp = BLEND16_50(d, s, mask);
1888                                 srcp++;
1889                                 dstp++;
1890                         }
1891                         srcp += srcskip;
1892                         dstp += dstskip;
1893                 }
1894         }
1895 }
1896
1897 #if GCC_ASMBLIT
1898 /* fast RGB565->RGB565 blending with surface alpha */
1899 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1900 {
1901         unsigned alpha = info->src->alpha;
1902         if(alpha == 128) {
1903                 Blit16to16SurfaceAlpha128(info, 0xf7de);
1904         } else {
1905                 int width = info->d_width;
1906                 int height = info->d_height;
1907                 Uint16 *srcp = (Uint16 *)info->s_pixels;
1908                 int srcskip = info->s_skip >> 1;
1909                 Uint16 *dstp = (Uint16 *)info->d_pixels;
1910                 int dstskip = info->d_skip >> 1;
1911                 Uint32 s, d;
1912                 Uint64 load;
1913           
1914                 alpha &= ~(1+2+4);              /* cut the low 3 alpha bits so the MMX and scalar paths blend identically */
1915                 load = alpha;
1916                 alpha >>= 3;            /* downscale alpha to 5 bits */
1917
1918                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1919                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1920                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1921                 /* position alpha to allow for mullo and mulhi on diff channels
1922                    to reduce the number of operations */
1923                 psllq_i2r(3, mm0);
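                /* each 16-bit lane of mm0 now holds the 8-bit alpha with its
                   low 3 bits cleared, shifted left by 3 (i.e. the 5-bit alpha
                   times 64); pmullw + psrlw(11) on the low-positioned fields
                   and pmulhw (+ psllw 5) on the in-place ones then both work
                   out to (diff * alpha) >> 5. */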
1924           
1925                 /* Setup the 565 color channel masks */
1926                 load = 0x07E007E007E007E0ULL;
1927                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1928                 load = 0x001F001F001F001FULL;
1929                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1930                 while(height--) {
1931                         DUFFS_LOOP_QUATRO2(
1932                         {
1933                                 s = *srcp++;
1934                                 d = *dstp;
1935                                 /*
1936                                  * shift out the middle component (green) to
1937                                  * the high 16 bits, and process all three RGB
1938                                  * components at the same time.
1939                                  */
1940                                 s = (s | s << 16) & 0x07e0f81f;
1941                                 d = (d | d << 16) & 0x07e0f81f;
1942                                 d += (s - d) * alpha >> 5;
1943                                 d &= 0x07e0f81f;
1944                                 *dstp++ = d | d >> 16;
1945                         },{
1946                                 s = *srcp++;
1947                                 d = *dstp;
1948                                 /*
1949                                  * shift out the middle component (green) to
1950                                  * the high 16 bits, and process all three RGB
1951                                  * components at the same time.
1952                                  */
1953                                 s = (s | s << 16) & 0x07e0f81f;
1954                                 d = (d | d << 16) & 0x07e0f81f;
1955                                 d += (s - d) * alpha >> 5;
1956                                 d &= 0x07e0f81f;
1957                                 *dstp++ = d | d >> 16;
1958                                 s = *srcp++;
1959                                 d = *dstp;
1960                                 /*
1961                                  * shift out the middle component (green) to
1962                                  * the high 16 bits, and process all three RGB
1963                                  * components at the same time.
1964                                  */
1965                                 s = (s | s << 16) & 0x07e0f81f;
1966                                 d = (d | d << 16) & 0x07e0f81f;
1967                                 d += (s - d) * alpha >> 5;
1968                                 d &= 0x07e0f81f;
1969                                 *dstp++ = d | d >> 16;
1970                         },{
1971                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1972                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1973
1974                                 /* red -- does not need a mask since the right shift clears
1975                                    the uninteresting bits */
1976                                 movq_r2r(mm2, mm5); /* src -> mm5 */
1977                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
1978                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1979                                 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1980
1981                                 /* blend */
1982                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1983                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1984                                 /* alpha used is actually 11 bits
1985                                    11 + 5 = 16 bits, so the sign bits are lost */
1986                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1987                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1988                                 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1989
1990                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
1991
1992                                 /* green -- process the bits in place */
1993                                 movq_r2r(mm2, mm5); /* src -> mm5 */
1994                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
1995                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1996                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1997
1998                                 /* blend */
1999                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2000                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2001                                 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2002                                    bits are gone and the sign bits present */
2003                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2004                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2005
2006                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2007
2008                                 /* blue */
2009                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2010                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2011                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2012                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2013
2014                                 /* blend */
2015                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2016                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2017                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2018                                    the interesting bits will need to be MASKed */
2019                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2020                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2021                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2022
2023                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2024
2025                                 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2026
2027                                 srcp += 4;
2028                                 dstp += 4;
2029                         }, width);                      
2030                         srcp += srcskip;
2031                         dstp += dstskip;
2032                 }
2033                 emms();
2034         }
2035 }
2036
2037 /* fast RGB555->RGB555 blending with surface alpha */
2038 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2039 {
2040         unsigned alpha = info->src->alpha;
2041         if(alpha == 128) {
2042                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2043         } else {
2044                 int width = info->d_width;
2045                 int height = info->d_height;
2046                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2047                 int srcskip = info->s_skip >> 1;
2048                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2049                 int dstskip = info->d_skip >> 1;
2050                 Uint32 s, d;
2051                 Uint64 load;
2052           
2053                 alpha &= ~(1+2+4);              /* cut the low 3 alpha bits so the MMX and scalar paths blend identically */
2054                 load = alpha;
2055                 alpha >>= 3;            /* downscale alpha to 5 bits */
2056
2057                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2058                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2059                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2060                 /* position alpha to allow for mullo and mulhi on diff channels
2061                    to reduce the number of operations */
2062                 psllq_i2r(3, mm0);
2063
2064                 /* Setup the 555 color channel masks */
2065                 load = 0x03E003E003E003E0ULL;
2066                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2067                 load = 0x001F001F001F001FULL;
2068                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2069                 while(height--) {
2070                         DUFFS_LOOP_QUATRO2(
2071                         {
2072                                 s = *srcp++;
2073                                 d = *dstp;
2074                                 /*
2075                                  * shift out the middle component (green) to
2076                                  * the high 16 bits, and process all three RGB
2077                                  * components at the same time.
2078                                  */
2079                                 s = (s | s << 16) & 0x03e07c1f;
2080                                 d = (d | d << 16) & 0x03e07c1f;
2081                                 d += (s - d) * alpha >> 5;
2082                                 d &= 0x03e07c1f;
2083                                 *dstp++ = d | d >> 16;
2084                         },{
2085                                 s = *srcp++;
2086                                 d = *dstp;
2087                                 /*
2088                                  * shift out the middle component (green) to
2089                                  * the high 16 bits, and process all three RGB
2090                                  * components at the same time.
2091                                  */
2092                                 s = (s | s << 16) & 0x03e07c1f;
2093                                 d = (d | d << 16) & 0x03e07c1f;
2094                                 d += (s - d) * alpha >> 5;
2095                                 d &= 0x03e07c1f;
2096                                 *dstp++ = d | d >> 16;
2097                                 s = *srcp++;
2098                                 d = *dstp;
2099                                 /*
2100                                  * shift out the middle component (green) to
2101                                  * the high 16 bits, and process all three RGB
2102                                  * components at the same time.
2103                                  */
2104                                 s = (s | s << 16) & 0x03e07c1f;
2105                                 d = (d | d << 16) & 0x03e07c1f;
2106                                 d += (s - d) * alpha >> 5;
2107                                 d &= 0x03e07c1f;
2108                                 *dstp++ = d | d >> 16;
2109                         },{
2110                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2111                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2112
2113                                 /* red -- process the bits in place */
2114                                 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2115                                         /* by reusing the GREEN mask we free up another mmx
2116                                            register to accumulate the result */
2117
2118                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2119                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2120                                 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2121                                 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2122
2123                                 /* blend */
2124                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2125                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2126                                 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2127                                    cleared by a MASK below */
2128                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2129                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2130                                 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2131
2132                                 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2133
2134                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2135
2136                                 /* green -- process the bits in place */
2137                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2138                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2139                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2140                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2141
2142                                 /* blend */
2143                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2144                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2145                                 /* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2146                                    bits are gone and the sign bits present */
2147                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2148                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2149
2150                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2151
2152                                 /* blue */
2153                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2154                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2155                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2156                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2157
2158                                 /* blend */
2159                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2160                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2161                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2162                                    the interesting bits will need to be MASKed */
2163                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2164                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2165                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2166
2167                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2168
2169                                 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2170
2171                                 srcp += 4;
2172                                 dstp += 4;
2173                         }, width);                      
2174                         srcp += srcskip;
2175                         dstp += dstskip;
2176                 }
2177                 emms();
2178         }
2179 }
2180 /* End GCC_ASMBLIT */
2181
2182 #elif MSVC_ASMBLIT
2183 /* fast RGB565->RGB565 blending with surface alpha */
2184 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2185 {
2186         unsigned alpha = info->src->alpha;
2187         if(alpha == 128) {
2188                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2189         } else {
2190                 int width = info->d_width;
2191                 int height = info->d_height;
2192                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2193                 int srcskip = info->s_skip >> 1;
2194                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2195                 int dstskip = info->d_skip >> 1;
2196                 Uint32 s, d;
2197           
2198                 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2199
2200                 alpha &= ~(1+2+4);              /* cut the low 3 alpha bits so the MMX and scalar paths blend identically */
2201                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2202                 alpha >>= 3;            /* downscale alpha to 5 bits */
2203
2204                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2205                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2206                 /* position alpha to allow for mullo and mulhi on diff channels
2207                    to reduce the number of operations */
2208                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2209           
2210                 /* Setup the 565 color channel masks */
2211                 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2212                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2213                 
2214                 while(height--) {
2215                         DUFFS_LOOP_QUATRO2(
2216                         {
2217                                 s = *srcp++;
2218                                 d = *dstp;
2219                                 /*
2220                                  * shift out the middle component (green) to
2221                                  * the high 16 bits, and process all three RGB
2222                                  * components at the same time.
2223                                  */
2224                                 s = (s | s << 16) & 0x07e0f81f;
2225                                 d = (d | d << 16) & 0x07e0f81f;
2226                                 d += (s - d) * alpha >> 5;
2227                                 d &= 0x07e0f81f;
2228                                 *dstp++ = (Uint16)(d | d >> 16);
2229                         },{
2230                                 s = *srcp++;
2231                                 d = *dstp;
2232                                 /*
2233                                  * shift out the middle component (green) to
2234                                  * the high 16 bits, and process all three RGB
2235                                  * components at the same time.
2236                                  */
2237                                 s = (s | s << 16) & 0x07e0f81f;
2238                                 d = (d | d << 16) & 0x07e0f81f;
2239                                 d += (s - d) * alpha >> 5;
2240                                 d &= 0x07e0f81f;
2241                                 *dstp++ = (Uint16)(d | d >> 16);
2242                                 s = *srcp++;
2243                                 d = *dstp;
2244                                 /*
2245                                  * shift out the middle component (green) to
2246                                  * the high 16 bits, and process all three RGB
2247                                  * components at the same time.
2248                                  */
2249                                 s = (s | s << 16) & 0x07e0f81f;
2250                                 d = (d | d << 16) & 0x07e0f81f;
2251                                 d += (s - d) * alpha >> 5;
2252                                 d &= 0x07e0f81f;
2253                                 *dstp++ = (Uint16)(d | d >> 16);
2254                         },{
2255                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2256                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2257
2258                                 /* red */
2259                                 src2 = src1;
2260                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2261
2262                                 dst2 = dst1;
2263                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2264
2265                                 /* blend */
2266                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2267                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2268                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2269                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2270                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2271
2272                                 mm_res = dst2; /* RED -> mm_res */
2273
2274                                 /* green -- process the bits in place */
2275                                 src2 = src1;
2276                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2277
2278                                 dst2 = dst1;
2279                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2280
2281                                 /* blend */
2282                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2283                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2284                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2285                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2286
2287                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2288
2289                                 /* blue */
2290                                 src2 = src1;
2291                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2292
2293                                 dst2 = dst1;
2294                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2295
2296                                 /* blend */
2297                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2298                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2299                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2300                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2301                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2302
2303                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2304
2305                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2306
2307                                 srcp += 4;
2308                                 dstp += 4;
2309                         }, width);                      
2310                         srcp += srcskip;
2311                         dstp += dstskip;
2312                 }
2313                 _mm_empty();
2314         }
2315 }
2316
2317 /* fast RGB555->RGB555 blending with surface alpha */
2318 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2319 {
2320         unsigned alpha = info->src->alpha;
2321         if(alpha == 128) {
2322                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2323         } else {
2324                 int width = info->d_width;
2325                 int height = info->d_height;
2326                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2327                 int srcskip = info->s_skip >> 1;
2328                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2329                 int dstskip = info->d_skip >> 1;
2330                 Uint32 s, d;
2331           
2332                 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2333
2334                 alpha &= ~(1+2+4);              /* cut the low 3 alpha bits so the MMX and scalar paths blend identically */
2335                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2336                 alpha >>= 3;            /* downscale alpha to 5 bits */
2337
2338                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2339                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2340                 /* position alpha to allow for mullo and mulhi on diff channels
2341                    to reduce the number of operations */
2342                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2343           
2344                 /* Setup the 555 color channel masks */
2345                 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2346                 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2347                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2348
2349                 while(height--) {
2350                         DUFFS_LOOP_QUATRO2(
2351                         {
2352                                 s = *srcp++;
2353                                 d = *dstp;
2354                                 /*
2355                                  * shift out the middle component (green) to
2356                                  * the high 16 bits, and process all three RGB
2357                                  * components at the same time.
2358                                  */
2359                                 s = (s | s << 16) & 0x03e07c1f;
2360                                 d = (d | d << 16) & 0x03e07c1f;
2361                                 d += (s - d) * alpha >> 5;
2362                                 d &= 0x03e07c1f;
2363                                 *dstp++ = (Uint16)(d | d >> 16);
2364                         },{
2365                                 s = *srcp++;
2366                                 d = *dstp;
2367                                 /*
2368                                  * shift out the middle component (green) to
2369                                  * the high 16 bits, and process all three RGB
2370                                  * components at the same time.
2371                                  */
2372                                 s = (s | s << 16) & 0x03e07c1f;
2373                                 d = (d | d << 16) & 0x03e07c1f;
2374                                 d += (s - d) * alpha >> 5;
2375                                 d &= 0x03e07c1f;
2376                                 *dstp++ = (Uint16)(d | d >> 16);
2377                                 s = *srcp++;
2378                                 d = *dstp;
2379                                 /*
2380                                  * shift out the middle component (green) to
2381                                  * the high 16 bits, and process all three RGB
2382                                  * components at the same time.
2383                                  */
2384                                 s = (s | s << 16) & 0x03e07c1f;
2385                                 d = (d | d << 16) & 0x03e07c1f;
2386                                 d += (s - d) * alpha >> 5;
2387                                 d &= 0x03e07c1f;
2388                                 *dstp++ = (Uint16)(d | d >> 16);
2389                         },{
2390                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2391                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2392
2393                                 /* red -- process the bits in place */
2394                                 src2 = src1;
2395                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2396
2397                                 dst2 = dst1;
2398                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2399
2400                                 /* blend */
2401                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2402                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2403                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2404                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2405                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2406
2407                                 mm_res = dst2; /* RED -> mm_res */
2408                                 
2409                                 /* green -- process the bits in place */
2410                                 src2 = src1;
2411                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2412
2413                                 dst2 = dst1;
2414                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2415
2416                                 /* blend */
2417                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2418                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2419                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2420                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2421
2422                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2423
2424                                 /* blue */
2425                                 src2 = src1; /* src -> src2 */
2426                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2427
2428                                 dst2 = dst1; /* dst -> dst2 */
2429                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2430
2431                                 /* blend */
2432                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2433                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2434                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2435                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2436                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2437
2438                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2439
2440                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2441
2442                                 srcp += 4;
2443                                 dstp += 4;
2444                         }, width);                      
2445                         srcp += srcskip;
2446                         dstp += dstskip;
2447                 }
2448                 _mm_empty();
2449         }
2450 }
2451 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2452
2453 /* fast RGB565->RGB565 blending with surface alpha */
2454 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2455 {
2456         unsigned alpha = info->src->alpha;
2457         if(alpha == 128) {
2458                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2459         } else {
2460                 int width = info->d_width;
2461                 int height = info->d_height;
2462                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2463                 int srcskip = info->s_skip >> 1;
2464                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2465                 int dstskip = info->d_skip >> 1;
2466                 alpha >>= 3;    /* downscale alpha to 5 bits */
2467
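                     /*
                      * Worked example of the single-multiply blend below: a 565
                      * pixel rrrrrggggggbbbbb is spread over a 32-bit word as
                      * 00000gggggg00000rrrrr000000bbbbb (mask 0x07e0f81f), and
                      * the guard bits between the fields let one multiply by the
                      * 5-bit alpha blend all three channels.  With s = 0xFFFF,
                      * d = 0x0000 and alpha = 16:
                      *
                      *   s -> 0x07e0f81f, d -> 0x00000000
                      *   d += (s - d) * 16 >> 5    ->  0x03f07c0f
                      *   d &= 0x07e0f81f           ->  0x03e0780f
                      *   (Uint16)(d | d >> 16)     ->  0x7bef  (a mid grey)
                      */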
2468                 while(height--) {
2469                         DUFFS_LOOP4({
2470                                 Uint32 s = *srcp++;
2471                                 Uint32 d = *dstp;
2472                                 /*
2473                                  * shift out the middle component (green) to
2474                                  * the high 16 bits, and process all three RGB
2475                                  * components at the same time.
2476                                  */
2477                                 s = (s | s << 16) & 0x07e0f81f;
2478                                 d = (d | d << 16) & 0x07e0f81f;
2479                                 d += (s - d) * alpha >> 5;
2480                                 d &= 0x07e0f81f;
2481                                 *dstp++ = (Uint16)(d | d >> 16);
2482                         }, width);
2483                         srcp += srcskip;
2484                         dstp += dstskip;
2485                 }
2486         }
2487 }
2488
2489 /* fast RGB555->RGB555 blending with surface alpha */
2490 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2491 {
2492         unsigned alpha = info->src->alpha;
2493         if(alpha == 128) {
2494                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2495         } else {
2496                 int width = info->d_width;
2497                 int height = info->d_height;
2498                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2499                 int srcskip = info->s_skip >> 1;
2500                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2501                 int dstskip = info->d_skip >> 1;
2502                 alpha >>= 3;            /* downscale alpha to 5 bits */
2503
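                     /*
                      * Same single-multiply blend as the 565 case above, with
                      * the 5-5-5 components spread under the 0x03e07c1f mask
                      * (green in the high word, red and blue in the low word).
                      */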
2504                 while(height--) {
2505                         DUFFS_LOOP4({
2506                                 Uint32 s = *srcp++;
2507                                 Uint32 d = *dstp;
2508                                 /*
2509                                  * shift out the middle component (green) to
2510                                  * the high 16 bits, and process all three RGB
2511                                  * components at the same time.
2512                                  */
2513                                 s = (s | s << 16) & 0x03e07c1f;
2514                                 d = (d | d << 16) & 0x03e07c1f;
2515                                 d += (s - d) * alpha >> 5;
2516                                 d &= 0x03e07c1f;
2517                                 *dstp++ = (Uint16)(d | d >> 16);
2518                         }, width);
2519                         srcp += srcskip;
2520                         dstp += dstskip;
2521                 }
2522         }
2523 }
2524
2525 /* fast ARGB8888->RGB565 blending with pixel alpha */
2526 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2527 {
2528         int width = info->d_width;
2529         int height = info->d_height;
2530         Uint32 *srcp = (Uint32 *)info->s_pixels;
2531         int srcskip = info->s_skip >> 2;
2532         Uint16 *dstp = (Uint16 *)info->d_pixels;
2533         int dstskip = info->d_skip >> 1;
2534
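             /*
              * In the blended case below the ARGB8888 source is reduced to 565
              * precision and spread into the same interleaved layout as the
              * destination (green high, red and blue low):
              *
              *   (s & 0xfc00) << 11    top 6 bits of green -> bits 26..21
              *   (s >> 8) & 0xf800     top 5 bits of red   -> bits 15..11
              *   (s >> 3) & 0x1f       top 5 bits of blue  -> bits  4..0
              *
              * after which the usual  d += (s - d) * alpha >> 5  blend applies,
              * driven by the top 5 bits of the source alpha.
              */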
2535         while(height--) {
2536             DUFFS_LOOP4({
2537                 Uint32 s = *srcp;
2538                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2539                 /* FIXME: Here we special-case opaque alpha since the
2540                    compositing used (>>8 instead of /255) doesn't handle
2541                    it correctly. Also special-case alpha=0 for speed?
2542                    Benchmark this! */
2543                 if(alpha) {   
2544                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2545                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2546                   } else {
2547                     Uint32 d = *dstp;
2548                     /*
2549                      * convert source and destination to G0RAB65565
2550                      * and blend all components at the same time
2551                      */
2552                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2553                       + (s >> 3 & 0x1f);
2554                     d = (d | d << 16) & 0x07e0f81f;
2555                     d += (s - d) * alpha >> 5;
2556                     d &= 0x07e0f81f;
2557                     *dstp = (Uint16)(d | d >> 16);
2558                   }
2559                 }
2560                 srcp++;
2561                 dstp++;
2562             }, width);
2563             srcp += srcskip;
2564             dstp += dstskip;
2565         }
2566 }
2567
2568 /* fast ARGB8888->RGB555 blending with pixel alpha */
2569 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2570 {
2571         int width = info->d_width;
2572         int height = info->d_height;
2573         Uint32 *srcp = (Uint32 *)info->s_pixels;
2574         int srcskip = info->s_skip >> 2;
2575         Uint16 *dstp = (Uint16 *)info->d_pixels;
2576         int dstskip = info->d_skip >> 1;
2577
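             /*
              * Same scheme as the 565 blitter above, reduced to 5-5-5 precision:
              * the top 5 bits of green go to bits 25..21, of red to bits 14..10
              * and of blue to bits 4..0, matching the 0x03e07c1f layout.
              */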
2578         while(height--) {
2579             DUFFS_LOOP4({
2580                 unsigned alpha;
2581                 Uint32 s = *srcp;
2582                 alpha = s >> 27; /* downscale alpha to 5 bits */
2583                 /* FIXME: Here we special-case opaque alpha since the
2584                    compositing used (>>8 instead of /255) doesn't handle
2585                    it correctly. Also special-case alpha=0 for speed?
2586                    Benchmark this! */
2587                 if(alpha) {   
2588                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2589                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2590                   } else {
2591                     Uint32 d = *dstp;
2592                     /*
2593                      * convert source and destination to an interleaved G0RAB 5-5-5 layout
2594                      * and blend all components at the same time
2595                      */
2596                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2597                       + (s >> 3 & 0x1f);
2598                     d = (d | d << 16) & 0x03e07c1f;
2599                     d += (s - d) * alpha >> 5;
2600                     d &= 0x03e07c1f;
2601                     *dstp = (Uint16)(d | d >> 16);
2602                   }
2603                 }
2604                 srcp++;
2605                 dstp++;
2606             }, width);
2607             srcp += srcskip;
2608             dstp += dstskip;
2609         }
2610 }
2611
2612 /* General (slow) N->N blending with per-surface alpha */
2613 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2614 {
2615         int width = info->d_width;
2616         int height = info->d_height;
2617         Uint8 *src = info->s_pixels;
2618         int srcskip = info->s_skip;
2619         Uint8 *dst = info->d_pixels;
2620         int dstskip = info->d_skip;
2621         SDL_PixelFormat *srcfmt = info->src;
2622         SDL_PixelFormat *dstfmt = info->dst;
2623         int srcbpp = srcfmt->BytesPerPixel;
2624         int dstbpp = dstfmt->BytesPerPixel;
2625         unsigned sA = srcfmt->alpha;
2626         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2627
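             /*
              * DISEMBLE_RGB/ASSEMBLE_RGBA and ALPHA_BLEND come from SDL_blit.h;
              * ALPHA_BLEND is roughly
              *
              *   dR = (((sR - dR) * sA) >> 8) + dR;   (likewise for G and B)
              *
              * i.e. the same interpolation as the 16bpp blitters, but done per
              * channel at 8-bit precision.
              */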
2628         if(sA) {
2629           while ( height-- ) {
2630             DUFFS_LOOP4(
2631             {
2632                 Uint32 Pixel;
2633                 unsigned sR;
2634                 unsigned sG;
2635                 unsigned sB;
2636                 unsigned dR;
2637                 unsigned dG;
2638                 unsigned dB;
2639                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2640                 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2641                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2642                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2643                 src += srcbpp;
2644                 dst += dstbpp;
2645             },
2646             width);
2647             src += srcskip;
2648             dst += dstskip;
2649           }
2650         }
2651 }
2652
2653 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2654 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2655 {
2656         int width = info->d_width;
2657         int height = info->d_height;
2658         Uint8 *src = info->s_pixels;
2659         int srcskip = info->s_skip;
2660         Uint8 *dst = info->d_pixels;
2661         int dstskip = info->d_skip;
2662         SDL_PixelFormat *srcfmt = info->src;
2663         SDL_PixelFormat *dstfmt = info->dst;
2664         Uint32 ckey = srcfmt->colorkey;
2665         int srcbpp = srcfmt->BytesPerPixel;
2666         int dstbpp = dstfmt->BytesPerPixel;
2667         unsigned sA = srcfmt->alpha;
2668         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2669
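             /*
              * Fast path: when both surfaces are 16bpp 565, reuse the interleaved
              * 0x07e0f81f blend and simply skip pixels matching the colorkey.
              */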
2670         if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2671             Uint16 *src16 = (Uint16 *)src;
2672             Uint16 *dst16 = (Uint16 *)dst;
2673             sA >>= 3;   /* downscale alpha to 5 bits */
2674             while ( height-- ) {
2675                 DUFFS_LOOP4(
2676                 {
2677                     Uint32 s;
2678                     Uint32 d;
2679                     s = *src16;
2680                     if(sA && s != ckey) {
2681                         d = *dst16;
2682                         s = (s | s << 16) & 0x07e0f81f;
2683                         d = (d | d << 16) & 0x07e0f81f;
2684                         d += (s - d) * sA >> 5;
2685                         d &= 0x07e0f81f;
2686                         *dst16 = (Uint16)(d | d >> 16);
2687                     }
2688                     src16++;
2689                     dst16++;
2690                 },
2691                 width);
2692                 src16 += srcskip / 2;
2693                 dst16 += dstskip / 2;
2694             }
2695             return;
2696         }
2697
2698         while ( height-- ) {
2699             DUFFS_LOOP4(
2700             {
2701                 Uint32 Pixel;
2702                 unsigned sR;
2703                 unsigned sG;
2704                 unsigned sB;
2705                 unsigned dR;
2706                 unsigned dG;
2707                 unsigned dB;
2708                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2709                 if(sA && Pixel != ckey) {
2710                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2711                     DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2712                     ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2713                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2714                 }
2715                 src += srcbpp;
2716                 dst += dstbpp;
2717             },
2718             width);
2719             src += srcskip;
2720             dst += dstskip;
2721         }
2722 }
2723
2724 /* General (slow) N->N blending with pixel alpha */
2725 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2726 {
2727         int width = info->d_width;
2728         int height = info->d_height;
2729         Uint8 *src = info->s_pixels;
2730         int srcskip = info->s_skip;
2731         Uint8 *dst = info->d_pixels;
2732         int dstskip = info->d_skip;
2733         SDL_PixelFormat *srcfmt = info->src;
2734         SDL_PixelFormat *dstfmt = info->dst;
2735
2736         int  srcbpp;
2737         int  dstbpp;
2738
2739         /* Set up some basic variables */
2740         srcbpp = srcfmt->BytesPerPixel;
2741         dstbpp = dstfmt->BytesPerPixel;
2742
2743         /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2744            quite right. for <8bpp source alpha, it gets them very wrong
2745            (check all macros!)
2746            It is unclear whether there is a good general solution that doesn't
2747            need a branch (or a divide). */
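             /*
              * Concretely: with sA = 255 the >>8 blend gives
              *   dR = (((sR - dR) * 255) >> 8) + dR
              * which lands one short of sR whenever sR > dR, so a fully opaque
              * source never quite replaces the destination; dividing by 255 (or
              * special-casing sA == 255 as the 16bpp blitters above do) would
              * avoid that.
              */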
2748         while ( height-- ) {
2749             DUFFS_LOOP4(
2750             {
2751                 Uint32 Pixel;
2752                 unsigned sR;
2753                 unsigned sG;
2754                 unsigned sB;
2755                 unsigned dR;
2756                 unsigned dG;
2757                 unsigned dB;
2758                 unsigned sA;
2759                 unsigned dA;
2760                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2761                 if(sA) {
2762                   DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2763                   ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2764                   ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2765                 }
2766                 src += srcbpp;
2767                 dst += dstbpp;
2768             },
2769             width);
2770             src += srcskip;
2771             dst += dstskip;
2772         }
2773 }
2774
2775
2776 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2777 {
2778     SDL_PixelFormat *sf = surface->format;
2779     SDL_PixelFormat *df = surface->map->dst->format;
2780
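         /*
          * Rough selection order: surfaces without a source Amask use the
          * per-surface-alpha blitters (with colorkey variants when
          * SDL_SRCCOLORKEY is set); everything else uses per-pixel alpha.
          * Within each group a format-specific fast path (565/555, matching
          * 32bpp RGB, plus the MMX/AltiVec/NEON variants where available) is
          * preferred before falling back to the generic BlitNtoN* routines.
          */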
2781     if(sf->Amask == 0) {
2782         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2783             if(df->BytesPerPixel == 1)
2784                 return BlitNto1SurfaceAlphaKey;
2785             else
2786 #if SDL_ALTIVEC_BLITTERS
2787         if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2788             !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2789             return Blit32to32SurfaceAlphaKeyAltivec;
2790         else
2791 #endif
2792             return BlitNtoNSurfaceAlphaKey;
2793         } else {
2794             /* Per-surface alpha blits */
2795             switch(df->BytesPerPixel) {
2796             case 1:
2797                 return BlitNto1SurfaceAlpha;
2798
2799             case 2:
2800                 if(surface->map->identity) {
2801                     if(df->Gmask == 0x7e0)
2802                     {
2803 #if MMX_ASMBLIT
2804                 if(SDL_HasMMX())
2805                         return Blit565to565SurfaceAlphaMMX;
2806                 else
2807 #endif
2808                         return Blit565to565SurfaceAlpha;
2809                     }
2810                     else if(df->Gmask == 0x3e0)
2811                     {
2812 #if MMX_ASMBLIT
2813                 if(SDL_HasMMX())
2814                         return Blit555to555SurfaceAlphaMMX;
2815                 else
2816 #endif
2817                         return Blit555to555SurfaceAlpha;
2818                     }
2819                 }
2820                 return BlitNtoNSurfaceAlpha;
2821
2822             case 4:
2823                 if(sf->Rmask == df->Rmask
2824                    && sf->Gmask == df->Gmask
2825                    && sf->Bmask == df->Bmask
2826                    && sf->BytesPerPixel == 4)
2827                 {
2828 #if MMX_ASMBLIT
2829                         if(sf->Rshift % 8 == 0
2830                            && sf->Gshift % 8 == 0
2831                            && sf->Bshift % 8 == 0
2832                            && SDL_HasMMX())
2833                             return BlitRGBtoRGBSurfaceAlphaMMX;
2834 #endif
2835                         if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2836                         {
2837 #if SDL_ALTIVEC_BLITTERS
2838                                 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2839                                         && SDL_HasAltiVec())
2840                                         return BlitRGBtoRGBSurfaceAlphaAltivec;
2841 #endif
2842                                 return BlitRGBtoRGBSurfaceAlpha;
2843                         }
2844                 }
2845 #if SDL_ALTIVEC_BLITTERS
2846                 if((sf->BytesPerPixel == 4) &&
2847                    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2848                         return Blit32to32SurfaceAlphaAltivec;
2849                 else
2850 #endif
2851                         return BlitNtoNSurfaceAlpha;
2852
2853             case 3:
2854             default:
2855                 return BlitNtoNSurfaceAlpha;
2856             }
2857         }
2858     } else {
2859         /* Per-pixel alpha blits */
2860         switch(df->BytesPerPixel) {
2861         case 1:
2862             return BlitNto1PixelAlpha;
2863
2864         case 2:
2865 #if SDL_ALTIVEC_BLITTERS
2866         if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2867            df->Gmask == 0x7e0 &&
2868            df->Bmask == 0x1f && SDL_HasAltiVec())
2869             return Blit32to565PixelAlphaAltivec;
2870         else
2871 #endif
2872             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2873                && sf->Gmask == 0xff00
2874                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2875                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2876                 if(df->Gmask == 0x7e0)
2877                     return BlitARGBto565PixelAlpha;
2878                 else if(df->Gmask == 0x3e0)
2879                     return BlitARGBto555PixelAlpha;
2880             }
2881             return BlitNtoNPixelAlpha;
2882
2883         case 4:
2884             if(sf->Rmask == df->Rmask
2885                && sf->Gmask == df->Gmask
2886                && sf->Bmask == df->Bmask
2887                && sf->BytesPerPixel == 4)
2888             {
2889 #if MMX_ASMBLIT
2890                 if(sf->Rshift % 8 == 0
2891                    && sf->Gshift % 8 == 0
2892                    && sf->Bshift % 8 == 0
2893                    && sf->Ashift % 8 == 0
2894                    && sf->Aloss == 0)
2895                 {
2896                         if(SDL_Has3DNow())
2897                                 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2898                         if(SDL_HasMMX())
2899                                 return BlitRGBtoRGBPixelAlphaMMX;
2900                 }
2901 #endif
2902                 if(sf->Amask == 0xff000000)
2903                 {
2904 #if SDL_ALTIVEC_BLITTERS
2905                         if(!(surface->map->dst->flags & SDL_HWSURFACE)
2906                                 && SDL_HasAltiVec())
2907                                 return BlitRGBtoRGBPixelAlphaAltivec;
2908 #endif
2909 #ifdef __ARM_NEON__
2910                         return BlitARGBtoXRGBalpha_neon;
2911 #endif
2912                         return BlitRGBtoRGBPixelAlpha;
2913                 }
2914             }
2915 #ifdef __ARM_NEON__
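                 /* red and blue are swapped between source and destination
                    (ABGR8888-style vs ARGB8888-style masks); the swizzling NEON
                    blitter handles the conversion while applying per-pixel alpha */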
2916             if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 &&
2917                 ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) ||
2918                  (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000)))
2919             {
2920                 return BlitABGRtoXRGBalpha_neon;
2921             }
2922 #endif
2923 #if SDL_ALTIVEC_BLITTERS
2924             if (sf->Amask && sf->BytesPerPixel == 4 &&
2925                 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2926                 return Blit32to32PixelAlphaAltivec;
2927             else
2928 #endif
2929                 return BlitNtoNPixelAlpha;
2930
2931         case 3:
2932         default:
2933             return BlitNtoNPixelAlpha;
2934         }
2935     }
2936 }
2937