refactor NEON blit checks
[sdl_omap.git] / src / video / SDL_blit_A.c
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2009 Sam Lantinga
4
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 #include "SDL_video.h"
25 #include "SDL_blit.h"
26
27 /*
28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29    Checking if _mm_free is #defined in malloc.h is the only way to
30    determine if the Processor Pack is installed, as far as I can tell.
31 */
32
/*
 * Select which MMX blitter flavour this file builds.  With GCC on
 * i386/x86-64 both MMX_ASMBLIT and GCC_ASMBLIT are defined; with MSVC on
 * 32-bit x86 MMX_ASMBLIT and MSVC_ASMBLIT are defined, but only when
 * mmintrin.h is considered available (HAVE_MMINTRIN_H).  Nothing is
 * defined when SDL_ASSEMBLY_ROUTINES is off.
 */
33 #if SDL_ASSEMBLY_ROUTINES
34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 #    define MMX_ASMBLIT 1
36 #    define GCC_ASMBLIT 1
37 #  elif defined(_MSC_VER) && defined(_M_IX86)
/* VC6 or older: _mm_free being defined by malloc.h is used as the probe */
38 #    if (_MSC_VER <= 1200)  
39 #      include <malloc.h>   
40 #      if defined(_mm_free)
41 #          define HAVE_MMINTRIN_H 1
42 #      endif
43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
44 #      define HAVE_MMINTRIN_H 1
45 #    endif
46 #    if HAVE_MMINTRIN_H
47 #      define MMX_ASMBLIT 1
48 #      define MSVC_ASMBLIT 1
49 #    endif
50 #  endif
51 #endif /* SDL_ASSEMBLY_ROUTINES */
52
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
55 #if GCC_ASMBLIT
56 #include "mmx.h"
57 #elif MSVC_ASMBLIT
58 #include <mmintrin.h>
59 #include <mm3dnow.h>
60 #endif
61
62 /* Functions to perform alpha blended blitting */
63
64 #ifdef __ARM_NEON__
65
66 /* NEON optimized blitter callers */
/*
 * Generate a static SDL blitter `name` that forwards each row of the blit
 * to the external routine `neon_name(dst, src, count)`.  After every row
 * both pointers advance by 4 bytes per pixel plus the row skip taken from
 * the SDL_BlitInfo.
 */
#define make_neon_caller(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count); \
static void name(SDL_BlitInfo *info) \
{ \
        const int pixels = info->d_width; \
        int rows = info->d_height; \
        Uint8 *src_row = info->s_pixels; \
        Uint8 *dst_row = info->d_pixels; \
        const int src_step = pixels * 4 + info->s_skip; \
        const int dst_step = pixels * 4 + info->d_skip; \
\
        for ( ; rows != 0; --rows ) { \
            neon_name(dst_row, src_row, pixels); \
            src_row += src_step; \
            dst_row += dst_step; \
        } \
}
84
/*
 * Same as make_neon_caller, but for routines that take a fourth argument:
 * the per-surface alpha read from info->src->alpha, passed unchanged for
 * every row.
 */
#define make_neon_callerS(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
static void name(SDL_BlitInfo *info) \
{ \
        const int pixels = info->d_width; \
        int rows = info->d_height; \
        Uint8 *src_row = info->s_pixels; \
        Uint8 *dst_row = info->d_pixels; \
        const int src_step = pixels * 4 + info->s_skip; \
        const int dst_step = pixels * 4 + info->d_skip; \
        const unsigned surface_alpha = info->src->alpha; \
\
        for ( ; rows != 0; --rows ) { \
            neon_name(dst_row, src_row, pixels, surface_alpha); \
            src_row += src_step; \
            dst_row += dst_step; \
        } \
}
103
/*
 * Instantiate the static SDL_BlitInfo wrappers around the four external
 * NEON row routines: two without and two ("S") with a surface alpha argument.
 */
104 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
105 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
106 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
107 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
108
109 #endif /* __ARM_NEON__ */
110
111 /* N->1 blending with per-surface alpha */
112 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
113 {
114         int width = info->d_width;
115         int height = info->d_height;
116         Uint8 *src = info->s_pixels;
117         int srcskip = info->s_skip;
118         Uint8 *dst = info->d_pixels;
119         int dstskip = info->d_skip;
120         Uint8 *palmap = info->table;
121         SDL_PixelFormat *srcfmt = info->src;
122         SDL_PixelFormat *dstfmt = info->dst;
123         int srcbpp = srcfmt->BytesPerPixel;
124
125         const unsigned A = srcfmt->alpha;
126
127         while ( height-- ) {
128             DUFFS_LOOP4(
129             {
130                 Uint32 Pixel;
131                 unsigned sR;
132                 unsigned sG;
133                 unsigned sB;
134                 unsigned dR;
135                 unsigned dG;
136                 unsigned dB;
137                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
138                 dR = dstfmt->palette->colors[*dst].r;
139                 dG = dstfmt->palette->colors[*dst].g;
140                 dB = dstfmt->palette->colors[*dst].b;
141                 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
142                 dR &= 0xff;
143                 dG &= 0xff;
144                 dB &= 0xff;
145                 /* Pack RGB into 8bit pixel */
146                 if ( palmap == NULL ) {
147                     *dst =((dR>>5)<<(3+2))|
148                           ((dG>>5)<<(2))|
149                           ((dB>>6)<<(0));
150                 } else {
151                     *dst = palmap[((dR>>5)<<(3+2))|
152                                   ((dG>>5)<<(2))  |
153                                   ((dB>>6)<<(0))];
154                 }
155                 dst++;
156                 src += srcbpp;
157             },
158             width);
159             src += srcskip;
160             dst += dstskip;
161         }
162 }
163
164 /* N->1 blending with pixel alpha */
165 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
166 {
167         int width = info->d_width;
168         int height = info->d_height;
169         Uint8 *src = info->s_pixels;
170         int srcskip = info->s_skip;
171         Uint8 *dst = info->d_pixels;
172         int dstskip = info->d_skip;
173         Uint8 *palmap = info->table;
174         SDL_PixelFormat *srcfmt = info->src;
175         SDL_PixelFormat *dstfmt = info->dst;
176         int srcbpp = srcfmt->BytesPerPixel;
177
178         /* FIXME: fix alpha bit field expansion here too? */
179         while ( height-- ) {
180             DUFFS_LOOP4(
181             {
182                 Uint32 Pixel;
183                 unsigned sR;
184                 unsigned sG;
185                 unsigned sB;
186                 unsigned sA;
187                 unsigned dR;
188                 unsigned dG;
189                 unsigned dB;
190                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
191                 dR = dstfmt->palette->colors[*dst].r;
192                 dG = dstfmt->palette->colors[*dst].g;
193                 dB = dstfmt->palette->colors[*dst].b;
194                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
195                 dR &= 0xff;
196                 dG &= 0xff;
197                 dB &= 0xff;
198                 /* Pack RGB into 8bit pixel */
199                 if ( palmap == NULL ) {
200                     *dst =((dR>>5)<<(3+2))|
201                           ((dG>>5)<<(2))|
202                           ((dB>>6)<<(0));
203                 } else {
204                     *dst = palmap[((dR>>5)<<(3+2))|
205                                   ((dG>>5)<<(2))  |
206                                   ((dB>>6)<<(0))  ];
207                 }
208                 dst++;
209                 src += srcbpp;
210             },
211             width);
212             src += srcskip;
213             dst += dstskip;
214         }
215 }
216
217 /* colorkeyed N->1 blending with per-surface alpha */
218 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
219 {
220         int width = info->d_width;
221         int height = info->d_height;
222         Uint8 *src = info->s_pixels;
223         int srcskip = info->s_skip;
224         Uint8 *dst = info->d_pixels;
225         int dstskip = info->d_skip;
226         Uint8 *palmap = info->table;
227         SDL_PixelFormat *srcfmt = info->src;
228         SDL_PixelFormat *dstfmt = info->dst;
229         int srcbpp = srcfmt->BytesPerPixel;
230         Uint32 ckey = srcfmt->colorkey;
231
232         const int A = srcfmt->alpha;
233
234         while ( height-- ) {
235             DUFFS_LOOP(
236             {
237                 Uint32 Pixel;
238                 unsigned sR;
239                 unsigned sG;
240                 unsigned sB;
241                 unsigned dR;
242                 unsigned dG;
243                 unsigned dB;
244                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
245                 if ( Pixel != ckey ) {
246                     dR = dstfmt->palette->colors[*dst].r;
247                     dG = dstfmt->palette->colors[*dst].g;
248                     dB = dstfmt->palette->colors[*dst].b;
249                     ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
250                     dR &= 0xff;
251                     dG &= 0xff;
252                     dB &= 0xff;
253                     /* Pack RGB into 8bit pixel */
254                     if ( palmap == NULL ) {
255                         *dst =((dR>>5)<<(3+2))|
256                               ((dG>>5)<<(2)) |
257                               ((dB>>6)<<(0));
258                     } else {
259                         *dst = palmap[((dR>>5)<<(3+2))|
260                                       ((dG>>5)<<(2))  |
261                                       ((dB>>6)<<(0))  ];
262                     }
263                 }
264                 dst++;
265                 src += srcbpp;
266             },
267             width);
268             src += srcskip;
269             dst += dstskip;
270         }
271 }
272
273 #if GCC_ASMBLIT
274 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * GCC inline-asm (mmx.h) version.  Each 32-bit destination pixel becomes
 *   (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
 * ORed with the destination format's Amask, i.e. a per-byte average of the
 * low three bytes of source and destination.  The first DUFFS_LOOP_DOUBLE2
 * body handles a single pixel in plain C; the second applies the same
 * formula to two pixels at once in MMX registers.
 * The skips are shifted right by 2 because the pointers step in Uint32
 * units.  Register roles: mm4 = 0x00fefefe mask, mm3 = 0x00010101 mask,
 * mm7 = Amask duplicated into both halves.
 */
275 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
276 {
277         int width = info->d_width;
278         int height = info->d_height;
279         Uint32 *srcp = (Uint32 *)info->s_pixels;
280         int srcskip = info->s_skip >> 2;
281         Uint32 *dstp = (Uint32 *)info->d_pixels;
282         int dstskip = info->d_skip >> 2;
283         Uint32 dalpha = info->dst->Amask;
284         Uint64 load;
285
286         load = 0x00fefefe00fefefeULL;/* alpha128 mask */
287         movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
288         load = 0x0001010100010101ULL;/* !alpha128 mask */
289         movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
290         movd_m2r(dalpha, mm7); /* dst alpha mask */
291         punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
292         while(height--) {
293                 DUFFS_LOOP_DOUBLE2(
294                 {
295                         Uint32 s = *srcp++;
296                         Uint32 d = *dstp;
297                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
298                                    + (s & d & 0x00010101)) | dalpha;
299                 },{
300                         movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
301                         movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
302
303                         movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
304                         movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
305
306                         pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
307                         pand_r2r(mm4, mm5); /* src & mask -> mm5 */
308                         paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
309                         pand_r2r(mm1, mm2); /* src & dst -> mm2 */
310                         psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
311                         pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
312                         paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
313                         
314                         por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
315                         movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
316                         dstp += 2;
317                         srcp += 2;
318                 }, width);
319                 srcp += srcskip;
320                 dstp += dstskip;
321         }
        /* leave MMX state before returning */
322         emms();
323 }
324
325 /* fast RGB888->(A)RGB888 blending with surface alpha */
326 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
327 {
328         SDL_PixelFormat* df = info->dst;
329         unsigned alpha = info->src->alpha;
330
331         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
332                         /* only call a128 version when R,G,B occupy lower bits */
333                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
334         } else {
335                 int width = info->d_width;
336                 int height = info->d_height;
337                 Uint32 *srcp = (Uint32 *)info->s_pixels;
338                 int srcskip = info->s_skip >> 2;
339                 Uint32 *dstp = (Uint32 *)info->d_pixels;
340                 int dstskip = info->d_skip >> 2;
341
342                 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
343                 /* form the alpha mult */
344                 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
345                 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
346                 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
347                 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
348                 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
349                 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
350                 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
351                         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
352                 movd_m2r(df->Amask, mm7); /* dst alpha mask */
353                 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
354                 
355                 while(height--) {
356                         DUFFS_LOOP_DOUBLE2({
357                                 /* One Pixel Blend */
358                                 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
359                                 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
360                                 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
361                                 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
362
363                                 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
364                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
365                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
366                                 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
367
368                                 packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
369                                 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
370                                 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
371                                 ++srcp;
372                                 ++dstp;
373                         },{
374                                 /* Two Pixels Blend */
375                                 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
376                                 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
377                                 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
378                                 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
379
380                                 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
381                                 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
382                                 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
383                                 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
384
385                                 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
386                                 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
387                                 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
388                                 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
389
390                                 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
391                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
392                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
393                                 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
394
395                                 packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
396                                 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
397                                 
398                                 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
399
400                                 srcp += 2;
401                                 dstp += 2;
402                         }, width);
403                         srcp += srcskip;
404                         dstp += dstskip;
405                 }
406                 emms();
407         }
408 }
409
410 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC inline-asm (mmx.h) version.  For every pixel the source alpha is
 * isolated with the source format's Amask:
 *   - alpha == 0:      destination is left untouched;
 *   - alpha == Amask:  source colour bits are copied, the destination's
 *                      remaining (alpha) bits are kept;
 *   - otherwise:       dst + (((src - dst) * alpha) >> 8) per 16-bit lane,
 *                      with the multiplier zeroed in the alpha lane so the
 *                      destination alpha survives the add.
 * Register roles set up before the loop: mm6 = 0, mm7 = multiplier mask,
 * mm0 = channel mask, mm3 = inverse channel mask, mm5 = Ashift.  These
 * must stay live across all loop iterations.
 */
411 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
412 {
413         int width = info->d_width;
414         int height = info->d_height;
415         Uint32 *srcp = (Uint32 *)info->s_pixels;
416         int srcskip = info->s_skip >> 2;
417         Uint32 *dstp = (Uint32 *)info->d_pixels;
418         int dstskip = info->d_skip >> 2;
419         SDL_PixelFormat* sf = info->src;
420         Uint32 amask = sf->Amask;
421
422         pxor_r2r(mm6, mm6); /* 0 -> mm6 */
423         /* form multiplication mask */
424         movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
425         punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
426         pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
427         movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
428         pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
429         /* form channel masks */
430         movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
431         packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
432         packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
433         pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
434         /* get alpha channel shift */
435         __asm__ __volatile__ (
436                 "movd %0, %%mm5"
437                 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
438
439         while(height--) {
440             DUFFS_LOOP4({
441                 Uint32 alpha = *srcp & amask;
442                 /* FIXME: Here we special-case opaque alpha since the
443                         compositioning used (>>8 instead of /255) doesn't handle
444                         it correctly. Also special-case alpha=0 for speed?
445                         Benchmark this! */
446                 if(alpha == 0) {
447                         /* do nothing */
448                 } else if(alpha == amask) {
449                         /* opaque alpha -- copy RGB, keep dst alpha */
450                         /* using MMX here to free up regular registers for other things */
451                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
452                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
453                         pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
454                         pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
455                         por_r2r(mm1, mm2); /* src | dst -> mm2 */
456                         movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
457                 } else {
458                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
459                         punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
460
461                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
462                         punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
463
464                         __asm__ __volatile__ (
465                                 "movd %0, %%mm4"
466                                 : : "r" (alpha) ); /* 0000A000 -> mm4 */
467                         psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
468                         punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
469                         punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
470                         pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
471
472                         /* blend */                 
473                         psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
474                         pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
475                         psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
476                         paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
477                         
478                         packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
479                         movd_r2m(mm2, *dstp);/* mm2 -> dst */
480                 }
481                 ++srcp;
482                 ++dstp;
483             }, width);
484             srcp += srcskip;
485             dstp += dstskip;
486         }
        /* leave MMX state before returning */
487         emms();
488 }
489 /* End GCC_ASMBLIT */
490
491 #elif MSVC_ASMBLIT
492 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * MSVC intrinsics version.  Each destination pixel becomes
 *   (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
 * ORed with the destination format's Amask.  When the row width is odd the
 * first pixel is done in plain C; the remaining pixels are processed two
 * at a time through __m64 values with the same formula.
 * The skips are shifted right by 2 because the pointers step in Uint32
 * units.
 */
493 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
494 {
495         int width = info->d_width;
496         int height = info->d_height;
497         Uint32 *srcp = (Uint32 *)info->s_pixels;
498         int srcskip = info->s_skip >> 2;
499         Uint32 *dstp = (Uint32 *)info->d_pixels;
500         int dstskip = info->d_skip >> 2;
501         Uint32 dalpha = info->dst->Amask;
502
503         __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
504         
505         hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
506         lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
507         dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
508
509         while (height--) {
510                 int n = width;
                /* odd width: blend one pixel in scalar code first */
511                 if ( n & 1 ) {
512                         Uint32 s = *srcp++;
513                         Uint32 d = *dstp;
514                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
515                                    + (s & d & 0x00010101)) | dalpha;
516                         n--;
517                 }
518                 
                /* the remaining (even) count is handled as pixel pairs */
519                 for (n >>= 1; n > 0; --n) {
520                         dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
521                         dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
522
523                         src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
524                         src2 = src1; /* 2 x src -> src2(ARGBARGB) */
525
526                         dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
527                         src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
528                         src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
529                         src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
530
531                         dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
532                         dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
533                         dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
534                         dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
535                         
536                         *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
537                         dstp += 2;
538                         srcp += 2;
539                 }
540                 
541                 srcp += srcskip;
542                 dstp += dstskip;
543         }
        /* leave MMX state before returning */
544         _mm_empty();
545 }
546
547 /* fast RGB888->(A)RGB888 blending with surface alpha */
548 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
549 {
550         SDL_PixelFormat* df = info->dst;
551         Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
552         unsigned alpha = info->src->alpha;
553
554         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
555                         /* only call a128 version when R,G,B occupy lower bits */
556                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
557         } else {
558                 int width = info->d_width;
559                 int height = info->d_height;
560                 Uint32 *srcp = (Uint32 *)info->s_pixels;
561                 int srcskip = info->s_skip >> 2;
562                 Uint32 *dstp = (Uint32 *)info->d_pixels;
563                 int dstskip = info->d_skip >> 2;
564                 Uint32 dalpha = df->Amask;
565                 Uint32 amult;
566
567                 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
568                 
569                 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
570                 /* form the alpha mult */
571                 amult = alpha | (alpha << 8);
572                 amult = amult | (amult << 16);
573                 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
574                 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
575                 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
576                         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
577                 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
578                 
579                 while (height--) {
580                         int n = width;
581                         if (n & 1) {
582                                 /* One Pixel Blend */
583                                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
584                                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
585
586                                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
587                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
588
589                                 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
590                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
591                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
592                                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
593                                 
594                                 dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
595                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
596                                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
597
598                                 ++srcp;
599                                 ++dstp;
600                                 
601                                 n--;
602                         }
603
604                         for (n >>= 1; n > 0; --n) {
605                                 /* Two Pixels Blend */
606                                 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
607                                 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
608                                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
609                                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
610
611                                 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
612                                 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
613                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
614                                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
615
616                                 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
617                                 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
618                                 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
619                                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
620
621                                 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
622                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
623                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
624                                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
625                                 
626                                 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
627                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
628
629                                 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
630
631                                 srcp += 2;
632                                 dstp += 2;
633                         }
634                         srcp += srcskip;
635                         dstp += dstskip;
636                 }
637                 _mm_empty();
638         }
639 }
640
641 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MSVC intrinsics version.  For every pixel the source alpha is isolated
 * with the source format's Amask:
 *   - alpha == 0:      destination is left untouched;
 *   - alpha == Amask:  source R/G/B bits are copied, the destination's
 *                      other bits are kept;
 *   - otherwise:       dst + (((src - dst) * alpha) >> 8) per 16-bit lane,
 *                      with the multiplier masked by dmask so the
 *                      destination alpha survives the add.
 * dmask clears the 16-bit lane at bit offset ashift * 2, which is where
 * the alpha byte ends up after bytes are widened to 16-bit lanes.
 */
642 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
643 {
644         int width = info->d_width;
645         int height = info->d_height;
646         Uint32 *srcp = (Uint32 *)info->s_pixels;
647         int srcskip = info->s_skip >> 2;
648         Uint32 *dstp = (Uint32 *)info->d_pixels;
649         int dstskip = info->d_skip >> 2;
650         SDL_PixelFormat* sf = info->src;
651         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
652         Uint32 amask = sf->Amask;
653         Uint32 ashift = sf->Ashift;
654         Uint64 multmask;
655
656         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
657
658         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
        /* NOTE(review): the i64 literal suffix is MSVC-specific */
659         multmask = ~(0xFFFFi64 << (ashift * 2));
660         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
661
662         while(height--) {
663                 DUFFS_LOOP4({
664                 Uint32 alpha = *srcp & amask;
665                 if (alpha == 0) {
666                         /* do nothing */
667                 } else if (alpha == amask) {
668                         /* opaque alpha -- copy RGB, keep dst alpha */
669                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
670                 } else {
671                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
672                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
673
674                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
675                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
676
677                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
678                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
679                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
680                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
681                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
682
683                         /* blend */                 
684                         src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
685                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
686                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
687                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
688                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
689                         
690                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
691                 }
692                 ++srcp;
693                 ++dstp;
694             }, width);
695             srcp += srcskip;
696             dstp += dstskip;
697         }
        /* leave MMX state before returning */
698         _mm_empty();
699 }
700 /* End MSVC_ASMBLIT */
701
702 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
703
704 #if SDL_ALTIVEC_BLITTERS
705 #if __MWERKS__
706 #pragma altivec_model on
707 #endif
708 #if HAVE_ALTIVEC_H
709 #include <altivec.h>
710 #endif
711 #include <assert.h>
712
/*
 * Vector literal syntax: older Mac OS X GCC (before 4.x) takes the
 * parenthesised AltiVec form, everything else takes the brace form.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif

/*
 * Low four address bits of pointer expression x: non-zero when x is not
 * 16-byte aligned.  The argument is parenthesised so that an expression
 * such as UNALIGNED_PTR(p + 1) tests the address of p + 1 instead of
 * casting p first and then adding one byte.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)

/* Debug helper: print the four 32-bit words of vector v, labelled msg. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", (msg), vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permutation vector that takes the high bytes out of all the appropriate shorts 
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built as {0,1,2,...,15} (vec_lvsl of a NULL pointer) plus 0x000F in
   every short.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 24 in every 32-bit element (12 + 12) */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* all-ones shifted left by 24 in every 32-bit element, i.e. 0xFF000000 */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/*
 * Permute vector used to realign source data loaded as two vectors
 * (previous block, vec_ld(15, src) block).  For an unaligned src it is
 * vec_lvsl(0, src); for an aligned src it is vec_lvsl(8, src) + 8, which
 * the callers rely on to select the second vector.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, (src)) \
    : vec_add(vec_lvsl(8, (src)), vec_splat_u8(8)))
745
746    
    /*
     * VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16)
     *
     * Alpha-blend the bytes of vs over the bytes of vd and store the
     * result back into vd (vs and valpha are only read):
     *
     *     t  = vs * valpha + vd * (255 - valpha)     (16-bit products)
     *     t  = (t + 1) + ((t + 1) >> 8)
     *     vd = high byte of every t, via mergePermute
     *
     * vs, vd, valpha : vector unsigned char (source, dest, per-byte alpha)
     * mergePermute   : permute vector that interleaves the high bytes of
     *                  the even-byte and odd-byte products
     *                  (callers pass VEC_MERGE_PERMUTE())
     * v1_16, v8_16   : vector unsigned short; callers pass splats of 1
     *                  and 8
     *
     * The macro declares its own temporaries inside a do { } while (0)
     * block, so it can be used as a single statement.
     */
747 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
748     /* vtemp1 contains source AAGGAAGGAAGGAAGG (even bytes * alpha) */ \
749     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
750     /* vtemp2 contains source RRBBRRBBRRBBRRBB (odd bytes * alpha) */ \
751     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
752     /* valpha2 is 255-alpha (bitwise complement of valpha) */ \
753     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
754     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
755     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
756     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
757     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
758     /* add source and dest */ \
759     vtemp1 = vec_add(vtemp1, vtemp3); \
760     vtemp2 = vec_add(vtemp2, vtemp4); \
761     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8); vtemp3 is reused as scratch */ \
762     vtemp1 = vec_add(vtemp1, v1_16); \
763     vtemp3 = vec_sr(vtemp1, v8_16); \
764     vtemp1 = vec_add(vtemp1, vtemp3); \
765     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8); vtemp4 is reused as scratch */ \
766     vtemp2 = vec_add(vtemp2, v1_16); \
767     vtemp4 = vec_sr(vtemp2, v8_16); \
768     vtemp2 = vec_add(vtemp2, vtemp4); \
769     /* (>>8) and get ARGBARGBARGBARGB: keep only the high byte of each short */ \
770     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
771 } while (0)
772  
773 /* Calculate the permute vector used for 32->32 swizzling */
774 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
775                                   const SDL_PixelFormat *dstfmt)
776 {
777     /*
778      * We have to assume that the bits that aren't used by other
779      *  colors is alpha, and it's one complete byte, since some formats
780      *  leave alpha with a zero mask, but we should still swizzle the bits.
781      */
782     /* ARGB */
783     const static struct SDL_PixelFormat default_pixel_format = {
784         NULL, 0, 0,
785         0, 0, 0, 0,
786         16, 8, 0, 24,
787         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
788         0, 0};
789     if (!srcfmt) {
790         srcfmt = &default_pixel_format;
791     }
792     if (!dstfmt) {
793         dstfmt = &default_pixel_format;
794     }
795     const vector unsigned char plus = VECUINT8_LITERAL
796                                             ( 0x00, 0x00, 0x00, 0x00,
797                                               0x04, 0x04, 0x04, 0x04,
798                                               0x08, 0x08, 0x08, 0x08,
799                                               0x0C, 0x0C, 0x0C, 0x0C );
800     vector unsigned char vswiz;
801     vector unsigned int srcvec;
802 #define RESHIFT(X) (3 - ((X) >> 3))
803     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
804     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
805     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
806     Uint32 amask;
807     /* Use zero for alpha if either surface doesn't have alpha */
808     if (dstfmt->Amask) {
809         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
810     } else {
811         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
812     }
813 #undef RESHIFT  
814     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
815     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
816     return(vswiz);
817 }
818
819 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
820 {
821     int height = info->d_height;
822     Uint8 *src = (Uint8 *)info->s_pixels;
823     int srcskip = info->s_skip;
824     Uint8 *dst = (Uint8 *)info->d_pixels;
825     int dstskip = info->d_skip;
826     SDL_PixelFormat *srcfmt = info->src;
827
828     vector unsigned char v0 = vec_splat_u8(0);
829     vector unsigned short v8_16 = vec_splat_u16(8);
830     vector unsigned short v1_16 = vec_splat_u16(1);
831     vector unsigned short v2_16 = vec_splat_u16(2);
832     vector unsigned short v3_16 = vec_splat_u16(3);
833     vector unsigned int v8_32 = vec_splat_u32(8);
834     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
835     vector unsigned short v3f = VECUINT16_LITERAL(
836         0x003f, 0x003f, 0x003f, 0x003f,
837         0x003f, 0x003f, 0x003f, 0x003f);
838     vector unsigned short vfc = VECUINT16_LITERAL(
839         0x00fc, 0x00fc, 0x00fc, 0x00fc,
840         0x00fc, 0x00fc, 0x00fc, 0x00fc);
841
842     /* 
843         0x10 - 0x1f is the alpha
844         0x00 - 0x0e evens are the red
845         0x01 - 0x0f odds are zero
846     */
847     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
848         0x10, 0x00, 0x01, 0x01,
849         0x10, 0x02, 0x01, 0x01,
850         0x10, 0x04, 0x01, 0x01,
851         0x10, 0x06, 0x01, 0x01
852     );
853     vector unsigned char vredalpha2 = (vector unsigned char)(
854         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
855     );
856     /*
857         0x00 - 0x0f is ARxx ARxx ARxx ARxx
858         0x11 - 0x0f odds are blue
859     */
860     vector unsigned char vblue1 = VECUINT8_LITERAL(
861         0x00, 0x01, 0x02, 0x11,
862         0x04, 0x05, 0x06, 0x13,
863         0x08, 0x09, 0x0a, 0x15,
864         0x0c, 0x0d, 0x0e, 0x17
865     );
866     vector unsigned char vblue2 = (vector unsigned char)(
867         vec_add((vector unsigned int)vblue1, v8_32)
868     );
869     /*
870         0x00 - 0x0f is ARxB ARxB ARxB ARxB
871         0x10 - 0x0e evens are green
872     */
873     vector unsigned char vgreen1 = VECUINT8_LITERAL(
874         0x00, 0x01, 0x10, 0x03,
875         0x04, 0x05, 0x12, 0x07,
876         0x08, 0x09, 0x14, 0x0b,
877         0x0c, 0x0d, 0x16, 0x0f
878     );
879     vector unsigned char vgreen2 = (vector unsigned char)(
880         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
881     );
882     vector unsigned char vgmerge = VECUINT8_LITERAL(
883         0x00, 0x02, 0x00, 0x06,
884         0x00, 0x0a, 0x00, 0x0e,
885         0x00, 0x12, 0x00, 0x16,
886         0x00, 0x1a, 0x00, 0x1e);
887     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
888     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
889     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
890
891     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
892     vf800 = vec_sl(vf800, vec_splat_u16(8));
893
894     while(height--) {
895         int extrawidth;
896         vector unsigned char valigner;
897         vector unsigned char vsrc;
898         vector unsigned char voverflow;
899         int width = info->d_width;
900
901 #define ONE_PIXEL_BLEND(condition, widthvar) \
902         while (condition) { \
903             Uint32 Pixel; \
904             unsigned sR, sG, sB, dR, dG, dB, sA; \
905             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
906             if(sA) { \
907                 unsigned short dstpixel = *((unsigned short *)dst); \
908                 dR = (dstpixel >> 8) & 0xf8; \
909                 dG = (dstpixel >> 3) & 0xfc; \
910                 dB = (dstpixel << 3) & 0xf8; \
911                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
912                 *((unsigned short *)dst) = ( \
913                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
914                 ); \
915             } \
916             src += 4; \
917             dst += 2; \
918             widthvar--; \
919         }
920         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
921         extrawidth = (width % 8);
922         valigner = VEC_ALIGNER(src);
923         vsrc = (vector unsigned char)vec_ld(0, src);
924         width -= extrawidth;
925         while (width) {
926             vector unsigned char valpha;
927             vector unsigned char vsrc1, vsrc2;
928             vector unsigned char vdst1, vdst2;
929             vector unsigned short vR, vG, vB;
930             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
931
932             /* Load 8 pixels from src as ARGB */
933             voverflow = (vector unsigned char)vec_ld(15, src);
934             vsrc = vec_perm(vsrc, voverflow, valigner);
935             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
936             src += 16;
937             vsrc = (vector unsigned char)vec_ld(15, src);
938             voverflow = vec_perm(voverflow, vsrc, valigner);
939             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
940             src += 16;
941
942             /* Load 8 pixels from dst as XRGB */
943             voverflow = vec_ld(0, dst);
944             vR = vec_and((vector unsigned short)voverflow, vf800);
945             vB = vec_sl((vector unsigned short)voverflow, v3_16);
946             vG = vec_sl(vB, v2_16);
947             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
948             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
949             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
950             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
951             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
952             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
953
954             /* Alpha blend 8 pixels as ARGB */
955             valpha = vec_perm(vsrc1, v0, valphaPermute);
956             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
957             valpha = vec_perm(vsrc2, v0, valphaPermute);
958             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
959
960             /* Convert 8 pixels to 565 */
961             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
962             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
963             vgpixel = vec_and(vgpixel, vfc);
964             vgpixel = vec_sl(vgpixel, v3_16);
965             vrpixel = vec_sl(vpixel, v1_16);
966             vrpixel = vec_and(vrpixel, vf800);
967             vbpixel = vec_and(vpixel, v3f);
968             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
969             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
970             
971             /* Store 8 pixels */
972             vec_st(vdst1, 0, dst);
973
974             width -= 8;
975             dst += 16;
976         }
977         ONE_PIXEL_BLEND((extrawidth), extrawidth);
978 #undef ONE_PIXEL_BLEND
979         src += srcskip;
980         dst += dstskip;
981     }
982 }
983
984 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
985 {
986     unsigned alpha = info->src->alpha;
987     int height = info->d_height;
988     Uint32 *srcp = (Uint32 *)info->s_pixels;
989     int srcskip = info->s_skip >> 2;
990     Uint32 *dstp = (Uint32 *)info->d_pixels;
991     int dstskip = info->d_skip >> 2;
992     SDL_PixelFormat *srcfmt = info->src;
993     SDL_PixelFormat *dstfmt = info->dst;
994     unsigned sA = srcfmt->alpha;
995     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
996     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
997     Uint32 ckey = info->src->colorkey;
998     vector unsigned char mergePermute;
999     vector unsigned char vsrcPermute;
1000     vector unsigned char vdstPermute;
1001     vector unsigned char vsdstPermute;
1002     vector unsigned char valpha;
1003     vector unsigned char valphamask;
1004     vector unsigned char vbits;
1005     vector unsigned char v0;
1006     vector unsigned short v1;
1007     vector unsigned short v8;
1008     vector unsigned int vckey;
1009     vector unsigned int vrgbmask;
1010
1011     mergePermute = VEC_MERGE_PERMUTE();
1012     v0 = vec_splat_u8(0);
1013     v1 = vec_splat_u16(1);
1014     v8 = vec_splat_u16(8);
1015
1016     /* set the alpha to 255 on the destination surf */
1017     valphamask = VEC_ALPHA_MASK();
1018
1019     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1020     vdstPermute = calc_swizzle32(NULL, dstfmt);
1021     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1022
1023     /* set a vector full of alpha and 255-alpha */
1024     ((unsigned char *)&valpha)[0] = alpha;
1025     valpha = vec_splat(valpha, 0);
1026     vbits = (vector unsigned char)vec_splat_s8(-1);
1027
1028     ckey &= rgbmask;
1029     ((unsigned int *)(char*)&vckey)[0] = ckey;
1030     vckey = vec_splat(vckey, 0);
1031     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1032     vrgbmask = vec_splat(vrgbmask, 0);
1033
1034     while(height--) {
1035         int width = info->d_width;
1036 #define ONE_PIXEL_BLEND(condition, widthvar) \
1037         while (condition) { \
1038             Uint32 Pixel; \
1039             unsigned sR, sG, sB, dR, dG, dB; \
1040             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1041             if(sA && Pixel != ckey) { \
1042                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1043                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1044                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1045                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1046             } \
1047             dstp++; \
1048             srcp++; \
1049             widthvar--; \
1050         }
1051         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1052         if (width > 0) {
1053             int extrawidth = (width % 4);
1054             vector unsigned char valigner = VEC_ALIGNER(srcp);
1055             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1056             width -= extrawidth;
1057             while (width) {
1058                 vector unsigned char vsel;
1059                 vector unsigned char voverflow;
1060                 vector unsigned char vd;
1061                 vector unsigned char vd_orig;
1062
1063                 /* s = *srcp */
1064                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1065                 vs = vec_perm(vs, voverflow, valigner);
1066                 
1067                 /* vsel is set for items that match the key */
1068                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1069                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1070
1071                 /* permute to source format */
1072                 vs = vec_perm(vs, valpha, vsrcPermute);
1073
1074                 /* d = *dstp */
1075                 vd = (vector unsigned char)vec_ld(0, dstp);
1076                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1077
1078                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1079
1080                 /* set the alpha channel to full on */
1081                 vd = vec_or(vd, valphamask);
1082
1083                 /* mask out color key */
1084                 vd = vec_sel(vd, vd_orig, vsel);
1085                 
1086                 /* permute to dest format */
1087                 vd = vec_perm(vd, vbits, vdstPermute);
1088
1089                 /* *dstp = res */
1090                 vec_st((vector unsigned int)vd, 0, dstp);
1091                 
1092                 srcp += 4;
1093                 dstp += 4;
1094                 width -= 4;
1095                 vs = voverflow;
1096             }
1097             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1098         }
1099 #undef ONE_PIXEL_BLEND
1100  
1101         srcp += srcskip;
1102         dstp += dstskip;
1103     }
1104 }
1105
1106
     /*
      * Blend a 32-bit source with per-pixel alpha onto a 32-bit destination;
      * both channel layouts are taken from info->src and info->dst.
      *
      * Each row is processed as: single pixels until dstp is 16-byte aligned,
      * groups of four pixels with AltiVec, then the remaining (width % 4)
      * pixels one at a time.  In both paths the destination's own alpha
      * value is written back unchanged.
      */
1107 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1108 {
1109     int width = info->d_width;
1110     int height = info->d_height;
1111     Uint32 *srcp = (Uint32 *)info->s_pixels;
1112     int srcskip = info->s_skip >> 2;
1113     Uint32 *dstp = (Uint32 *)info->d_pixels;
1114     int dstskip = info->d_skip >> 2;
1115     SDL_PixelFormat *srcfmt = info->src;
1116     SDL_PixelFormat *dstfmt = info->dst;
1117     vector unsigned char mergePermute;
1118     vector unsigned char valphaPermute;
1119     vector unsigned char vsrcPermute;
1120     vector unsigned char vdstPermute;
1121     vector unsigned char vsdstPermute;
1122     vector unsigned char valphamask;
1123     vector unsigned char vpixelmask;
1124     vector unsigned char v0;
1125     vector unsigned short v1;
1126     vector unsigned short v8;
1127
1128     v0 = vec_splat_u8(0);
1129     v1 = vec_splat_u16(1);
1130     v8 = vec_splat_u16(8);
1131     mergePermute = VEC_MERGE_PERMUTE();
1132     valphamask = VEC_ALPHA_MASK();
          /* {0,0,0,0,4,4,4,4,...}: replicate byte 0 of every pixel */
1133     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
          /* complement of the alpha mask */
1134     vpixelmask = vec_nor(valphamask, v0);
          /* src format -> ARGB, ARGB -> dst format, dst format -> ARGB */
1135     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1136     vdstPermute = calc_swizzle32(NULL, dstfmt);
1137     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1138
1139         while ( height-- ) {
1140         width = info->d_width;
          /* scalar blend of one pixel per iteration while `condition` holds; \
             pixels with zero source alpha are left untouched */
1141 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1142             Uint32 Pixel; \
1143             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1144             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1145             if(sA) { \
1146               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1147               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1148               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1149             } \
1150             ++srcp; \
1151             ++dstp; \
1152             widthvar--; \
1153         }
          /* advance until dstp is 16-byte aligned */
1154         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1155         if (width > 0) {
1156             /* vsrcPermute */
1157             /* vdstPermute */
1158             int extrawidth = (width % 4);
1159             vector unsigned char valigner = VEC_ALIGNER(srcp);
1160             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1161             width -= extrawidth;
1162             while (width) {
1163                 vector unsigned char voverflow;
1164                 vector unsigned char vd;
1165                 vector unsigned char valpha;
1166                 vector unsigned char vdstalpha;
1167                 /* s = *srcp */
1168                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1169                 vs = vec_perm(vs, voverflow, valigner);
1170                 vs = vec_perm(vs, v0, vsrcPermute);
1171
                      /* spread each pixel's alpha byte over its four bytes */
1172                 valpha = vec_perm(vs, v0, valphaPermute);
1173                 
1174                 /* d = *dstp */
1175                 vd = (vector unsigned char)vec_ld(0, dstp);
1176                 vd = vec_perm(vd, v0, vsdstPermute);
                      /* remember the destination alpha before blending */
1177                 vdstalpha = vec_and(vd, valphamask);
1178
1179                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1180
1181                 /* set the alpha to the dest alpha */
1182                 vd = vec_and(vd, vpixelmask);
1183                 vd = vec_or(vd, vdstalpha);
                      /* back to the destination's channel order */
1184                 vd = vec_perm(vd, v0, vdstPermute);
1185
1186                 /* *dstp = res */
1187                 vec_st((vector unsigned int)vd, 0, dstp);
1188                 
1189                 srcp += 4;
1190                 dstp += 4;
1191                 width -= 4;
                      /* the block just loaded becomes the "previous" block */
1192                 vs = voverflow;
1193
1194             }
                  /* leftover pixels of the row */
1195             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1196         }
1197             srcp += srcskip;
1198             dstp += dstskip;
1199 #undef ONE_PIXEL_BLEND
1200         }
1201 }
1202
1203 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
     /*
      * No channel swizzling is done: alpha is read from bits 24..31 of each
      * source pixel and the destination's top byte is preserved.
      *
      * Each row is processed as: single pixels until dstp is 16-byte aligned,
      * groups of four pixels with AltiVec, then the remaining (width % 4)
      * pixels one at a time.
      */
1204 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1205 {
1206         int width = info->d_width;
1207         int height = info->d_height;
1208         Uint32 *srcp = (Uint32 *)info->s_pixels;
1209         int srcskip = info->s_skip >> 2;
1210         Uint32 *dstp = (Uint32 *)info->d_pixels;
1211         int dstskip = info->d_skip >> 2;
1212     vector unsigned char mergePermute;
1213     vector unsigned char valphaPermute;
1214     vector unsigned char valphamask;
1215     vector unsigned char vpixelmask;
1216     vector unsigned char v0;
1217     vector unsigned short v1;
1218     vector unsigned short v8;
1219     v0 = vec_splat_u8(0);
1220     v1 = vec_splat_u16(1);
1221     v8 = vec_splat_u16(8);
1222     mergePermute = VEC_MERGE_PERMUTE();
1223     valphamask = VEC_ALPHA_MASK();
          /* {0,0,0,0,4,4,4,4,...}: replicate byte 0 of every pixel */
1224     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1225     
1226  
          /* complement of the alpha mask */
1227     vpixelmask = vec_nor(valphamask, v0);
1228         while(height--) {
1229         width = info->d_width;
          /* scalar path: skip alpha 0, copy RGB for opaque alpha, otherwise \
             blend red/blue together and green separately */
1230 #define ONE_PIXEL_BLEND(condition, widthvar) \
1231         while ((condition)) { \
1232             Uint32 dalpha; \
1233             Uint32 d; \
1234             Uint32 s1; \
1235             Uint32 d1; \
1236             Uint32 s = *srcp; \
1237             Uint32 alpha = s >> 24; \
1238             if(alpha) { \
1239               if(alpha == SDL_ALPHA_OPAQUE) { \
1240                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1241               } else { \
1242                 d = *dstp; \
1243                 dalpha = d & 0xff000000; \
1244                 s1 = s & 0xff00ff; \
1245                 d1 = d & 0xff00ff; \
1246                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1247                 s &= 0xff00; \
1248                 d &= 0xff00; \
1249                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1250                 *dstp = d1 | d | dalpha; \
1251               } \
1252             } \
1253             ++srcp; \
1254             ++dstp; \
1255             widthvar--; \
1256             }
          /* advance until dstp is 16-byte aligned */
1257         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1258         if (width > 0) {
1259             int extrawidth = (width % 4);
1260             vector unsigned char valigner = VEC_ALIGNER(srcp);
1261             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1262             width -= extrawidth;
1263             while (width) {
1264                 vector unsigned char voverflow;
1265                 vector unsigned char vd;
1266                 vector unsigned char valpha;
1267                 vector unsigned char vdstalpha;
1268                 /* s = *srcp */
1269                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1270                 vs = vec_perm(vs, voverflow, valigner);
1271
                      /* spread each pixel's alpha byte over its four bytes */
1272                 valpha = vec_perm(vs, v0, valphaPermute);
1273                 
1274                 /* d = *dstp */
1275                 vd = (vector unsigned char)vec_ld(0, dstp);
                      /* remember the destination alpha before blending */
1276                 vdstalpha = vec_and(vd, valphamask);
1277
1278                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1279
1280                 /* set the alpha to the dest alpha */
1281                 vd = vec_and(vd, vpixelmask);
1282                 vd = vec_or(vd, vdstalpha);
1283
1284                 /* *dstp = res */
1285                 vec_st((vector unsigned int)vd, 0, dstp);
1286                 
1287                 srcp += 4;
1288                 dstp += 4;
1289                 width -= 4;
                      /* the block just loaded becomes the "previous" block */
1290                 vs = voverflow;
1291             }
                  /* leftover pixels of the row */
1292             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1293         }
1294             srcp += srcskip;
1295             dstp += dstskip;
1296         }
1297 #undef ONE_PIXEL_BLEND
1298 }
1299
     /*
      * Blend a 32-bit source onto a 32-bit destination using the single
      * alpha value stored in the source format (info->src->alpha); channel
      * layouts are taken from info->src and info->dst.
      *
      * Each row is processed as: single pixels until dstp is 16-byte aligned,
      * groups of four pixels with AltiVec, then the remaining (width % 4)
      * pixels one at a time.  The scalar path writes alpha SDL_ALPHA_OPAQUE
      * when the destination format has an alpha mask and 0 otherwise.
      */
1300 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1301 {
1302     /* XXX : 6 */
1303         unsigned alpha = info->src->alpha;
1304     int height = info->d_height;
1305     Uint32 *srcp = (Uint32 *)info->s_pixels;
1306     int srcskip = info->s_skip >> 2;
1307     Uint32 *dstp = (Uint32 *)info->d_pixels;
1308     int dstskip = info->d_skip >> 2;
1309     SDL_PixelFormat *srcfmt = info->src;
1310     SDL_PixelFormat *dstfmt = info->dst;
1311         unsigned sA = srcfmt->alpha;
1312         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1313     vector unsigned char mergePermute;
1314     vector unsigned char vsrcPermute;
1315     vector unsigned char vdstPermute;
1316     vector unsigned char vsdstPermute;
1317     vector unsigned char valpha;
1318     vector unsigned char valphamask;
1319     vector unsigned char vbits;
1320     vector unsigned short v1;
1321     vector unsigned short v8;
1322
1323     mergePermute = VEC_MERGE_PERMUTE();
1324     v1 = vec_splat_u16(1);
1325     v8 = vec_splat_u16(8);
1326
1327     /* set the alpha to 255 on the destination surf */
1328     valphamask = VEC_ALPHA_MASK();
1329
          /* src format -> ARGB, ARGB -> dst format, dst format -> ARGB */
1330     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1331     vdstPermute = calc_swizzle32(NULL, dstfmt);
1332     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1333
1334     /* set a vector full of alpha and 255-alpha */
1335     ((unsigned char *)&valpha)[0] = alpha;
1336     valpha = vec_splat(valpha, 0);
          /* all bits set */
1337     vbits = (vector unsigned char)vec_splat_s8(-1);
1338
1339     while(height--) {
1340         int width = info->d_width;
          /* scalar blend of one pixel per iteration while `condition` holds */
1341 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1342             Uint32 Pixel; \
1343             unsigned sR, sG, sB, dR, dG, dB; \
1344             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1345             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1346             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1347             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1348             ++srcp; \
1349             ++dstp; \
1350             widthvar--; \
1351         }
          /* advance until dstp is 16-byte aligned */
1352         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1353         if (width > 0) {
1354             int extrawidth = (width % 4);
1355             vector unsigned char valigner = VEC_ALIGNER(srcp);
1356             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1357             width -= extrawidth;
1358             while (width) {
1359                 vector unsigned char voverflow;
1360                 vector unsigned char vd;
1361
1362                 /* s = *srcp */
1363                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1364                 vs = vec_perm(vs, voverflow, valigner);
                      /* swizzle the source; valpha is the second permute operand */
1365                 vs = vec_perm(vs, valpha, vsrcPermute);
1366                 
1367                 /* d = *dstp */
1368                 vd = (vector unsigned char)vec_ld(0, dstp);
1369                 vd = vec_perm(vd, vd, vsdstPermute);
1370
1371                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1372
1373                 /* set the alpha channel to full on */
1374                 vd = vec_or(vd, valphamask);
                      /* back to the destination's channel order */
1375                 vd = vec_perm(vd, vbits, vdstPermute);
1376
1377                 /* *dstp = res */
1378                 vec_st((vector unsigned int)vd, 0, dstp);
1379                 
1380                 srcp += 4;
1381                 dstp += 4;
1382                 width -= 4;
                      /* the block just loaded becomes the "previous" block */
1383                 vs = voverflow;
1384             }
                  /* leftover pixels of the row */
1385             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1386         }
1387 #undef ONE_PIXEL_BLEND
1388  
1389         srcp += srcskip;
1390         dstp += dstskip;
1391     }
1392
1393 }
1394
1395
1396 /* fast RGB888->(A)RGB888 blending */
     /*
      * Uses the single alpha value info->src->alpha for every pixel and does
      * no channel swizzling; the top byte of every written pixel is forced
      * to 0xff.
      *
      * Each row is processed as: single pixels until dstp is 16-byte aligned,
      * groups of four pixels with AltiVec, then the remaining (width % 4)
      * pixels one at a time.
      */
1397 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1398 {
1399         unsigned alpha = info->src->alpha;
1400     int height = info->d_height;
1401     Uint32 *srcp = (Uint32 *)info->s_pixels;
1402     int srcskip = info->s_skip >> 2;
1403     Uint32 *dstp = (Uint32 *)info->d_pixels;
1404     int dstskip = info->d_skip >> 2;
1405     vector unsigned char mergePermute;
1406     vector unsigned char valpha;
1407     vector unsigned char valphamask;
1408     vector unsigned short v1;
1409     vector unsigned short v8;
1410
1411     mergePermute = VEC_MERGE_PERMUTE();
1412     v1 = vec_splat_u16(1);
1413     v8 = vec_splat_u16(8);
1414
1415     /* set the alpha to 255 on the destination surf */
1416     valphamask = VEC_ALPHA_MASK();
1417
1418     /* set a vector full of alpha and 255-alpha */
1419     ((unsigned char *)&valpha)[0] = alpha;
1420     valpha = vec_splat(valpha, 0);
1421
1422     while(height--) {
1423         int width = info->d_width;
          /* scalar path: blend red/blue together and green separately, \
             then force the top byte to 0xff */
1424 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1425             Uint32 s = *srcp; \
1426             Uint32 d = *dstp; \
1427             Uint32 s1 = s & 0xff00ff; \
1428             Uint32 d1 = d & 0xff00ff; \
1429             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1430                  & 0xff00ff; \
1431             s &= 0xff00; \
1432             d &= 0xff00; \
1433             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1434             *dstp = d1 | d | 0xff000000; \
1435             ++srcp; \
1436             ++dstp; \
1437             widthvar--; \
1438         }
          /* advance until dstp is 16-byte aligned */
1439         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1440         if (width > 0) {
1441             int extrawidth = (width % 4);
1442             vector unsigned char valigner = VEC_ALIGNER(srcp);
1443             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1444             width -= extrawidth;
1445             while (width) {
1446                 vector unsigned char voverflow;
1447                 vector unsigned char vd;
1448
1449                 /* s = *srcp */
1450                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1451                 vs = vec_perm(vs, voverflow, valigner);
1452                 
1453                 /* d = *dstp */
1454                 vd = (vector unsigned char)vec_ld(0, dstp);
1455
1456                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1457
1458                 /* set the alpha channel to full on */
1459                 vd = vec_or(vd, valphamask);
1460
1461                 /* *dstp = res */
1462                 vec_st((vector unsigned int)vd, 0, dstp);
1463                 
1464                 srcp += 4;
1465                 dstp += 4;
1466                 width -= 4;
                      /* the block just loaded becomes the "previous" block */
1467                 vs = voverflow;
1468             }
                  /* leftover pixels of the row */
1469             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1470         }
1471 #undef ONE_PIXEL_BLEND
1472  
1473         srcp += srcskip;
1474         dstp += dstskip;
1475     }
1476 }
1477 #if __MWERKS__
1478 #pragma altivec_model off
1479 #endif
1480 #endif /* SDL_ALTIVEC_BLITTERS */
1481
1482 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1483 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1484 {
1485         int width = info->d_width;
1486         int height = info->d_height;
1487         Uint32 *srcp = (Uint32 *)info->s_pixels;
1488         int srcskip = info->s_skip >> 2;
1489         Uint32 *dstp = (Uint32 *)info->d_pixels;
1490         int dstskip = info->d_skip >> 2;
1491
1492         while(height--) {
1493             DUFFS_LOOP4({
1494                     Uint32 s = *srcp++;
1495                     Uint32 d = *dstp;
1496                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1497                                + (s & d & 0x00010101)) | 0xff000000;
1498             }, width);
1499             srcp += srcskip;
1500             dstp += dstskip;
1501         }
1502 }
1503
1504 /* fast RGB888->(A)RGB888 blending with surface alpha */
1505 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1506 {
1507         unsigned alpha = info->src->alpha;
1508         if(alpha == 128) {
1509                 BlitRGBtoRGBSurfaceAlpha128(info);
1510         } else {
1511                 int width = info->d_width;
1512                 int height = info->d_height;
1513                 Uint32 *srcp = (Uint32 *)info->s_pixels;
1514                 int srcskip = info->s_skip >> 2;
1515                 Uint32 *dstp = (Uint32 *)info->d_pixels;
1516                 int dstskip = info->d_skip >> 2;
1517                 Uint32 s;
1518                 Uint32 d;
1519                 Uint32 s1;
1520                 Uint32 d1;
1521
1522                 while(height--) {
1523                         DUFFS_LOOP_DOUBLE2({
1524                                 /* One Pixel Blend */
1525                                 s = *srcp;
1526                                 d = *dstp;
1527                                 s1 = s & 0xff00ff;
1528                                 d1 = d & 0xff00ff;
1529                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1530                                      & 0xff00ff;
1531                                 s &= 0xff00;
1532                                 d &= 0xff00;
1533                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1534                                 *dstp = d1 | d | 0xff000000;
1535                                 ++srcp;
1536                                 ++dstp;
1537                         },{
1538                                 /* Two Pixels Blend */
1539                                 s = *srcp;
1540                                 d = *dstp;
1541                                 s1 = s & 0xff00ff;
1542                                 d1 = d & 0xff00ff;
1543                                 d1 += (s1 - d1) * alpha >> 8;
1544                                 d1 &= 0xff00ff;
1545                                      
1546                                 s = ((s & 0xff00) >> 8) | 
1547                                         ((srcp[1] & 0xff00) << 8);
1548                                 d = ((d & 0xff00) >> 8) |
1549                                         ((dstp[1] & 0xff00) << 8);
1550                                 d += (s - d) * alpha >> 8;
1551                                 d &= 0x00ff00ff;
1552                                 
1553                                 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1554                                 ++srcp;
1555                                 
1556                                 s1 = *srcp;
1557                                 d1 = *dstp;
1558                                 s1 &= 0xff00ff;
1559                                 d1 &= 0xff00ff;
1560                                 d1 += (s1 - d1) * alpha >> 8;
1561                                 d1 &= 0xff00ff;
1562                                 
1563                                 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1564                                 ++srcp;
1565                                 ++dstp;
1566                         }, width);
1567                         srcp += srcskip;
1568                         dstp += dstskip;
1569                 }
1570         }
1571 }
1572
1573 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1574 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1575 {
1576         int width = info->d_width;
1577         int height = info->d_height;
1578         Uint32 *srcp = (Uint32 *)info->s_pixels;
1579         int srcskip = info->s_skip >> 2;
1580         Uint32 *dstp = (Uint32 *)info->d_pixels;
1581         int dstskip = info->d_skip >> 2;
1582
1583         while(height--) {
1584             DUFFS_LOOP4({
1585                 Uint32 dalpha;
1586                 Uint32 d;
1587                 Uint32 s1;
1588                 Uint32 d1;
1589                 Uint32 s = *srcp;
1590                 Uint32 alpha = s >> 24;
1591                 /* FIXME: Here we special-case opaque alpha since the
1592                    compositioning used (>>8 instead of /255) doesn't handle
1593                    it correctly. Also special-case alpha=0 for speed?
1594                    Benchmark this! */
1595                 if(alpha) {   
1596                   if(alpha == SDL_ALPHA_OPAQUE) {
1597                     *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1598                   } else {
1599                     /*
1600                      * take out the middle component (green), and process
1601                      * the other two in parallel. One multiply less.
1602                      */
1603                     d = *dstp;
1604                     dalpha = d & 0xff000000;
1605                     s1 = s & 0xff00ff;
1606                     d1 = d & 0xff00ff;
1607                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1608                     s &= 0xff00;
1609                     d &= 0xff00;
1610                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1611                     *dstp = d1 | d | dalpha;
1612                   }
1613                 }
1614                 ++srcp;
1615                 ++dstp;
1616             }, width);
1617             srcp += srcskip;
1618             dstp += dstskip;
1619         }
1620 }
1621
1622 #if GCC_ASMBLIT
1623 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1624 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1625 {
1626         int width = info->d_width;
1627         int height = info->d_height;
1628         Uint32 *srcp = (Uint32 *)info->s_pixels;
1629         int srcskip = info->s_skip >> 2;
1630         Uint32 *dstp = (Uint32 *)info->d_pixels;
1631         int dstskip = info->d_skip >> 2;
1632         SDL_PixelFormat* sf = info->src;
1633         Uint32 amask = sf->Amask;
1634
1635         __asm__ (
1636         /* make mm6 all zeros. */
1637         "pxor       %%mm6, %%mm6\n"
1638         
1639         /* Make a mask to preserve the alpha. */
1640         "movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1641         "punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1642         "pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1643         "movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1644         "pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1645
1646         /* form channel masks */
1647         "movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1648         "packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1649         "packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1650         "pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1651         
1652         /* get alpha channel shift */
1653         "movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1654
1655           : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1656
1657         while(height--) {
1658
1659             DUFFS_LOOP4({
1660                 Uint32 alpha;
1661
1662                 __asm__ (
1663                 "prefetch 64(%0)\n"
1664                 "prefetch 64(%1)\n"
1665                         : : "r" (srcp), "r" (dstp) );
1666
1667                 alpha = *srcp & amask;
1668                 /* FIXME: Here we special-case opaque alpha since the
1669                    compositioning used (>>8 instead of /255) doesn't handle
1670                    it correctly. Also special-case alpha=0 for speed?
1671                    Benchmark this! */
1672                 if(alpha == 0) {
1673                     /* do nothing */
1674                 }
1675                 else if(alpha == amask) {
1676                         /* opaque alpha -- copy RGB, keep dst alpha */
1677                     /* using MMX here to free up regular registers for other things */
1678                             __asm__ (
1679                     "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1680                     "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1681                     "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1682                     "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1683                     "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1684                     "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1685
1686                      : : "r" (srcp), "r" (dstp) );
1687                 } 
1688
1689                 else {
1690                             __asm__ (
1691                     /* load in the source, and dst. */
1692                     "movd      (%0), %%mm0\n"               /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1693                     "movd      (%1), %%mm1\n"               /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1694
1695                     /* Move the src alpha into mm2 */
1696
1697                     /* if supporting pshufw */
1698                     /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1699                     /*"psrlw     $8, %%mm2\n" */
1700                     
1701                     /* else: */
1702                     "movd       %2,    %%mm2\n"
1703                     "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1704                     "punpcklwd  %%mm2, %%mm2\n"             /* mm2 = 0 0 0 0 |  0 As  0  As */
1705                     "punpckldq  %%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1706                     "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1707
1708                     /* move the colors into words. */
1709                     "punpcklbw %%mm6, %%mm0\n"              /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1710                     "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1711
1712                     /* src - dst */
1713                     "psubw    %%mm1, %%mm0\n"               /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1714
1715                     /* A * (src-dst) */
1716                     "pmullw    %%mm2, %%mm0\n"              /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1717                     "psrlw     $8,    %%mm0\n"              /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1718                     "paddb     %%mm1, %%mm0\n"              /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1719
1720                     "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1721                     
1722                     "movd      %%mm0, (%1)\n"               /* result in mm0 */
1723
1724                      : : "r" (srcp), "r" (dstp), "r" (alpha) );
1725
1726                 }
1727                 ++srcp;
1728                 ++dstp;
1729             }, width);
1730             srcp += srcskip;
1731             dstp += dstskip;
1732         }
1733
1734         __asm__ (
1735         "emms\n"
1736                 :   );
1737 }
1738 /* End GCC_ASMBLIT*/
1739
1740 #elif MSVC_ASMBLIT
1741 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1742 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1743 {
1744         int width = info->d_width;
1745         int height = info->d_height;
1746         Uint32 *srcp = (Uint32 *)info->s_pixels;
1747         int srcskip = info->s_skip >> 2;
1748         Uint32 *dstp = (Uint32 *)info->d_pixels;
1749         int dstskip = info->d_skip >> 2;
1750         SDL_PixelFormat* sf = info->src;
1751         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1752         Uint32 amask = sf->Amask;
1753         Uint32 ashift = sf->Ashift;
1754         Uint64 multmask;
1755         
1756         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1757
1758         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1759         multmask = ~(0xFFFFi64 << (ashift * 2));
1760         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1761
1762         while(height--) {
1763             DUFFS_LOOP4({
1764                 Uint32 alpha;
1765
1766                 _m_prefetch(srcp + 16);
1767                 _m_prefetch(dstp + 16);
1768
1769                 alpha = *srcp & amask;
1770                 if (alpha == 0) {
1771                         /* do nothing */
1772                 } else if (alpha == amask) {
1773                         /* copy RGB, keep dst alpha */
1774                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1775                 } else {
1776                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1777                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1778
1779                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1780                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1781
1782                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1783                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1784                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1785                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1786                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1787
1788                         /* blend */                 
1789                         src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1790                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1791                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1792                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1793                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1794                         
1795                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1796                 }
1797                 ++srcp;
1798                 ++dstp;
1799             }, width);
1800             srcp += srcskip;
1801             dstp += dstskip;
1802         }
1803         _mm_empty();
1804 }
1805 /* End MSVC_ASMBLIT */
1806
1807 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1808
1809 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1810
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every colour bit except the lowest bit of each channel
 * (e.g. 0xf7de for RGB565).  The masked values are averaged and the carry
 * dropped by the masking (both low bits set) is added back.
 * Arguments are evaluated more than once and must be free of side effects.
 */
#define BLEND16_50(d, s, mask)                                          \
        (((((s) & (mask)) + ((d) & (mask))) >> 1)                       \
         + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * The 16 bit channel mask replicated into both halves of a 32 bit word.
 * The cast keeps the shift unsigned: shifting an int-promoted 16 bit mask
 * with its top bit set left by 16 would overflow a signed int.
 */
#define BLEND2x16_MASK(mask)                                            \
        ((Uint32)(mask) | ((Uint32)(mask) << 16))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 * Same scheme as BLEND16_50, but each operand is halved before the
 * addition so the sum cannot overflow 32 bits.
 */
#define BLEND2x16_50(d, s, mask)                                        \
        ((((s) & BLEND2x16_MASK(mask)) >> 1)                            \
         + (((d) & BLEND2x16_MASK(mask)) >> 1)                          \
         + ((s) & (d) & ~BLEND2x16_MASK(mask)))
1819
1820 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1821 {
1822         int width = info->d_width;
1823         int height = info->d_height;
1824         Uint16 *srcp = (Uint16 *)info->s_pixels;
1825         int srcskip = info->s_skip >> 1;
1826         Uint16 *dstp = (Uint16 *)info->d_pixels;
1827         int dstskip = info->d_skip >> 1;
1828
1829         while(height--) {
1830                 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1831                         /*
1832                          * Source and destination not aligned, pipeline it.
1833                          * This is mostly a win for big blits but no loss for
1834                          * small ones
1835                          */
1836                         Uint32 prev_sw;
1837                         int w = width;
1838
1839                         /* handle odd destination */
1840                         if((uintptr_t)dstp & 2) {
1841                                 Uint16 d = *dstp, s = *srcp;
1842                                 *dstp = BLEND16_50(d, s, mask);
1843                                 dstp++;
1844                                 srcp++;
1845                                 w--;
1846                         }
1847                         srcp++; /* srcp is now 32-bit aligned */
1848
1849                         /* bootstrap pipeline with first halfword */
1850                         prev_sw = ((Uint32 *)srcp)[-1];
1851
1852                         while(w > 1) {
1853                                 Uint32 sw, dw, s;
1854                                 sw = *(Uint32 *)srcp;
1855                                 dw = *(Uint32 *)dstp;
1856 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1857                                 s = (prev_sw << 16) + (sw >> 16);
1858 #else
1859                                 s = (prev_sw >> 16) + (sw << 16);
1860 #endif
1861                                 prev_sw = sw;
1862                                 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1863                                 dstp += 2;
1864                                 srcp += 2;
1865                                 w -= 2;
1866                         }
1867
1868                         /* final pixel if any */
1869                         if(w) {
1870                                 Uint16 d = *dstp, s;
1871 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1872                                 s = (Uint16)prev_sw;
1873 #else
1874                                 s = (Uint16)(prev_sw >> 16);
1875 #endif
1876                                 *dstp = BLEND16_50(d, s, mask);
1877                                 srcp++;
1878                                 dstp++;
1879                         }
1880                         srcp += srcskip - 1;
1881                         dstp += dstskip;
1882                 } else {
1883                         /* source and destination are aligned */
1884                         int w = width;
1885
1886                         /* first odd pixel? */
1887                         if((uintptr_t)srcp & 2) {
1888                                 Uint16 d = *dstp, s = *srcp;
1889                                 *dstp = BLEND16_50(d, s, mask);
1890                                 srcp++;
1891                                 dstp++;
1892                                 w--;
1893                         }
1894                         /* srcp and dstp are now 32-bit aligned */
1895
1896                         while(w > 1) {
1897                                 Uint32 sw = *(Uint32 *)srcp;
1898                                 Uint32 dw = *(Uint32 *)dstp;
1899                                 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1900                                 srcp += 2;
1901                                 dstp += 2;
1902                                 w -= 2;
1903                         }
1904
1905                         /* last odd pixel? */
1906                         if(w) {
1907                                 Uint16 d = *dstp, s = *srcp;
1908                                 *dstp = BLEND16_50(d, s, mask);
1909                                 srcp++;
1910                                 dstp++;
1911                         }
1912                         srcp += srcskip;
1913                         dstp += dstskip;
1914                 }
1915         }
1916 }
1917
1918 #if GCC_ASMBLIT
1919 /* fast RGB565->RGB565 blending with surface alpha */
1920 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1921 {
1922         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1923         if(alpha == 128) {
1924                 Blit16to16SurfaceAlpha128(info, 0xf7de);
1925         } else {
1926                 int width = info->d_width;
1927                 int height = info->d_height;
1928                 Uint16 *srcp = (Uint16 *)info->s_pixels;
1929                 int srcskip = info->s_skip >> 1;
1930                 Uint16 *dstp = (Uint16 *)info->d_pixels;
1931                 int dstskip = info->d_skip >> 1;
1932                 Uint32 s, d;
1933                 Uint64 load;
1934           
1935                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
1936                 load = alpha;
1937                 alpha >>= 3;            /* downscale alpha to 5 bits */
1938
1939                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1940                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1941                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1942                 /* position alpha to allow for mullo and mulhi on diff channels
1943                    to reduce the number of operations */
1944                 psllq_i2r(3, mm0);
1945           
1946                 /* Setup the 565 color channel masks */
1947                 load = 0x07E007E007E007E0ULL;
1948                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1949                 load = 0x001F001F001F001FULL;
1950                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1951                 while(height--) {
1952                         DUFFS_LOOP_QUATRO2(
1953                         {
1954                                 s = *srcp++;
1955                                 d = *dstp;
1956                                 /*
1957                                  * shift out the middle component (green) to
1958                                  * the high 16 bits, and process all three RGB
1959                                  * components at the same time.
1960                                  */
1961                                 s = (s | s << 16) & 0x07e0f81f;
1962                                 d = (d | d << 16) & 0x07e0f81f;
1963                                 d += (s - d) * alpha >> 5;
1964                                 d &= 0x07e0f81f;
1965                                 *dstp++ = d | d >> 16;
1966                         },{
1967                                 s = *srcp++;
1968                                 d = *dstp;
1969                                 /*
1970                                  * shift out the middle component (green) to
1971                                  * the high 16 bits, and process all three RGB
1972                                  * components at the same time.
1973                                  */
1974                                 s = (s | s << 16) & 0x07e0f81f;
1975                                 d = (d | d << 16) & 0x07e0f81f;
1976                                 d += (s - d) * alpha >> 5;
1977                                 d &= 0x07e0f81f;
1978                                 *dstp++ = d | d >> 16;
1979                                 s = *srcp++;
1980                                 d = *dstp;
1981                                 /*
1982                                  * shift out the middle component (green) to
1983                                  * the high 16 bits, and process all three RGB
1984                                  * components at the same time.
1985                                  */
1986                                 s = (s | s << 16) & 0x07e0f81f;
1987                                 d = (d | d << 16) & 0x07e0f81f;
1988                                 d += (s - d) * alpha >> 5;
1989                                 d &= 0x07e0f81f;
1990                                 *dstp++ = d | d >> 16;
1991                         },{
1992                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1993                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1994
1995                                 /* red -- does not need a mask since the right shift clears
1996                                    the uninteresting bits */
1997                                 movq_r2r(mm2, mm5); /* src -> mm5 */
1998                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
1999                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2000                                 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2001
2002                                 /* blend */
2003                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2004                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2005                                 /* alpha used is actually 11 bits
2006                                    11 + 5 = 16 bits, so the sign bits are lost */
2007                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2008                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2009                                 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2010
2011                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2012
2013                                 /* green -- process the bits in place */
2014                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2015                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2016                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2017                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2018
2019                                 /* blend */
2020                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2021                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2022                                 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2023                                    bits are gone and the sign bits present */
2024                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2025                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2026
2027                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2028
2029                                 /* blue */
2030                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2031                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2032                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2033                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2034
2035                                 /* blend */
2036                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2037                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2038                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2039                                    the interesting bits will need to be MASKed */
2040                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2041                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2042                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2043
2044                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2045
2046                                 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2047
2048                                 srcp += 4;
2049                                 dstp += 4;
2050                         }, width);                      
2051                         srcp += srcskip;
2052                         dstp += dstskip;
2053                 }
2054                 emms();
2055         }
2056 }
2057
2058 /* fast RGB555->RGB555 blending with surface alpha */
2059 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2060 {
2061         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2062         if(alpha == 128) {
2063                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2064         } else {
2065                 int width = info->d_width;
2066                 int height = info->d_height;
2067                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2068                 int srcskip = info->s_skip >> 1;
2069                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2070                 int dstskip = info->d_skip >> 1;
2071                 Uint32 s, d;
2072                 Uint64 load;
2073           
2074                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2075                 load = alpha;
2076                 alpha >>= 3;            /* downscale alpha to 5 bits */
2077
2078                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2079                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2080                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2081                 /* position alpha to allow for mullo and mulhi on diff channels
2082                    to reduce the number of operations */
2083                 psllq_i2r(3, mm0);
2084
2085                 /* Setup the 555 color channel masks */
2086                 load = 0x03E003E003E003E0ULL;
2087                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2088                 load = 0x001F001F001F001FULL;
2089                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2090                 while(height--) {
2091                         DUFFS_LOOP_QUATRO2(
2092                         {
2093                                 s = *srcp++;
2094                                 d = *dstp;
2095                                 /*
2096                                  * shift out the middle component (green) to
2097                                  * the high 16 bits, and process all three RGB
2098                                  * components at the same time.
2099                                  */
2100                                 s = (s | s << 16) & 0x03e07c1f;
2101                                 d = (d | d << 16) & 0x03e07c1f;
2102                                 d += (s - d) * alpha >> 5;
2103                                 d &= 0x03e07c1f;
2104                                 *dstp++ = d | d >> 16;
2105                         },{
2106                                 s = *srcp++;
2107                                 d = *dstp;
2108                                 /*
2109                                  * shift out the middle component (green) to
2110                                  * the high 16 bits, and process all three RGB
2111                                  * components at the same time.
2112                                  */
2113                                 s = (s | s << 16) & 0x03e07c1f;
2114                                 d = (d | d << 16) & 0x03e07c1f;
2115                                 d += (s - d) * alpha >> 5;
2116                                 d &= 0x03e07c1f;
2117                                 *dstp++ = d | d >> 16;
2118                                 s = *srcp++;
2119                                 d = *dstp;
2120                                 /*
2121                                  * shift out the middle component (green) to
2122                                  * the high 16 bits, and process all three RGB
2123                                  * components at the same time.
2124                                  */
2125                                 s = (s | s << 16) & 0x03e07c1f;
2126                                 d = (d | d << 16) & 0x03e07c1f;
2127                                 d += (s - d) * alpha >> 5;
2128                                 d &= 0x03e07c1f;
2129                                 *dstp++ = d | d >> 16;
2130                         },{
2131                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2132                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2133
2134                                 /* red -- process the bits in place */
2135                                 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2136                                         /* by reusing the GREEN mask we free up another mmx
2137                                            register to accumulate the result */
2138
2139                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2140                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2141                                 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2142                                 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2143
2144                                 /* blend */
2145                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2146                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2147                                 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2148                                    cleared by a MASK below */
2149                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2150                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2151                                 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2152
2153                                 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2154
2155                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2156
2157                                 /* green -- process the bits in place */
2158                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2159                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2160                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2161                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2162
2163                                 /* blend */
2164                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2165                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2166                                 /* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2167                                    bits are gone and the sign bits present */
2168                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2169                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2170
2171                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2172
2173                                 /* blue */
2174                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2175                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2176                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2177                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2178
2179                                 /* blend */
2180                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2181                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2182                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2183                                    the interesting bits will need to be MASKed */
2184                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2185                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2186                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2187
2188                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2189
2190                                 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2191
2192                                 srcp += 4;
2193                                 dstp += 4;
2194                         }, width);                      
2195                         srcp += srcskip;
2196                         dstp += dstskip;
2197                 }
2198                 emms();
2199         }
2200 }
2201 /* End GCC_ASMBLIT */
2202
2203 #elif MSVC_ASMBLIT
2204 /* fast RGB565->RGB565 blending with surface alpha
      *
      * MMX-intrinsics variant, compiled in the MSVC_ASMBLIT branch.
      * alpha == 128 is delegated to Blit16to16SurfaceAlpha128(); any other
      * alpha is blended row by row through DUFFS_LOOP_QUATRO2, whose three
      * bodies handle one pixel, two pixels (both scalar) and four pixels
      * (one __m64 load/store per surface) respectively.
      */
2205 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2206 {
2207         unsigned alpha = info->src->alpha; /* per-surface alpha taken from the source format */
2208         if(alpha == 128) {
2209                 Blit16to16SurfaceAlpha128(info, 0xf7de); /* 0xf7de: every 5-6-5 field with its lowest bit cleared */
2210         } else {
2211                 int width = info->d_width;
2212                 int height = info->d_height;
2213                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2214                 int srcskip = info->s_skip >> 1; /* halved because srcp advances in Uint16 units (assumes s_skip is in bytes -- TODO confirm) */
2215                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2216                 int dstskip = info->d_skip >> 1; /* same conversion for the destination */
2217                 Uint32 s, d; /* scratch pixels for the scalar one- and two-pixel bodies */
2218           
2219                 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2220
2221                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour as the scalar bodies, which only use alpha >> 3 */
2222                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2223                 alpha >>= 3;            /* downscale alpha to 5 bits */
2224
2225                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2226                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2227                 /* position alpha to allow for mullo and mulhi on diff channels
2228                    to reduce the number of operations; the 64-bit shift keeps
                        the four 16-bit lanes separate as long as alpha < 8192
                        (TODO confirm alpha <= 255) */
2229                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2230           
2231                 /* Setup the 565 color channel masks */
2232                 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2233                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2234                 
2235                 while(height--) {
2236                         DUFFS_LOOP_QUATRO2(
2237                         { /* first body: one pixel, scalar */
2238                                 s = *srcp++;
2239                                 d = *dstp;
2240                                 /*
2241                                  * shift out the middle component (green) to
2242                                  * the high 16 bits, and process all three RGB
2243                                  * components at the same time.
2244                                  */
2245                                 s = (s | s << 16) & 0x07e0f81f;
2246                                 d = (d | d << 16) & 0x07e0f81f;
2247                                 d += (s - d) * alpha >> 5;
2248                                 d &= 0x07e0f81f;
2249                                 *dstp++ = (Uint16)(d | d >> 16);
2250                         },{ /* second body: two pixels, scalar (the same blend twice) */
2251                                 s = *srcp++;
2252                                 d = *dstp;
2253                                 /*
2254                                  * shift out the middle component (green) to
2255                                  * the high 16 bits, and process all three RGB
2256                                  * components at the same time.
2257                                  */
2258                                 s = (s | s << 16) & 0x07e0f81f;
2259                                 d = (d | d << 16) & 0x07e0f81f;
2260                                 d += (s - d) * alpha >> 5;
2261                                 d &= 0x07e0f81f;
2262                                 *dstp++ = (Uint16)(d | d >> 16);
2263                                 s = *srcp++;
2264                                 d = *dstp;
2265                                 /*
2266                                  * shift out the middle component (green) to
2267                                  * the high 16 bits, and process all three RGB
2268                                  * components at the same time.
2269                                  */
2270                                 s = (s | s << 16) & 0x07e0f81f;
2271                                 d = (d | d << 16) & 0x07e0f81f;
2272                                 d += (s - d) * alpha >> 5;
2273                                 d &= 0x07e0f81f;
2274                                 *dstp++ = (Uint16)(d | d >> 16);
2275                         },{ /* third body: four pixels, MMX, one channel at a time */
2276                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2277                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2278
2279                                 /* red -- shifted down to the low 5 bits (it is the top field, so no mask is needed) */
2280                                 src2 = src1;
2281                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2282
2283                                 dst2 = dst1;
2284                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2285
2286                                 /* blend */
2287                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2288                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2289                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2290                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2291                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2; bits above the 5-bit field fall off the lane here */
2292
2293                                 mm_res = dst2; /* RED -> mm_res */
2294
2295                                 /* green -- process the bits in place */
2296                                 src2 = src1;
2297                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2298
2299                                 dst2 = dst1;
2300                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2301
2302                                 /* blend */
2303                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2304                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2305                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2306                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2307
2308                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2309
2310                                 /* blue */
2311                                 src2 = src1;
2312                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2313
2314                                 dst2 = dst1;
2315                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2316
2317                                 /* blend */
2318                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2319                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2320                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2321                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2322                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2323
2324                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2325
2326                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2327
2328                                 srcp += 4;
2329                                 dstp += 4;
2330                         }, width);                      
2331                         srcp += srcskip; /* step over the part of the row outside the blit */
2332                         dstp += dstskip;
2333                 }
2334                 _mm_empty(); /* leave MMX state once all rows are done */
2335         }
2336 }
2337
2338 /* fast RGB555->RGB555 blending with surface alpha
      *
      * MMX-intrinsics variant, compiled in the MSVC_ASMBLIT branch.
      * alpha == 128 is delegated to Blit16to16SurfaceAlpha128(); any other
      * alpha is blended row by row through DUFFS_LOOP_QUATRO2, whose three
      * bodies handle one pixel, two pixels (both scalar) and four pixels
      * (one __m64 load/store per surface) respectively.
      */
2339 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2340 {
2341         unsigned alpha = info->src->alpha; /* per-surface alpha taken from the source format */
2342         if(alpha == 128) {
2343                 Blit16to16SurfaceAlpha128(info, 0xfbde); /* 0xfbde: every 5-5-5 field with its lowest bit cleared */
2344         } else {
2345                 int width = info->d_width;
2346                 int height = info->d_height;
2347                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2348                 int srcskip = info->s_skip >> 1; /* halved because srcp advances in Uint16 units (assumes s_skip is in bytes -- TODO confirm) */
2349                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2350                 int dstskip = info->d_skip >> 1; /* same conversion for the destination */
2351                 Uint32 s, d; /* scratch pixels for the scalar one- and two-pixel bodies */
2352           
2353                 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2354
2355                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour as the scalar bodies, which only use alpha >> 3 */
2356                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2357                 alpha >>= 3;            /* downscale alpha to 5 bits */
2358
2359                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2360                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2361                 /* position alpha to allow for mullo and mulhi on diff channels
2362                    to reduce the number of operations; the 64-bit shift keeps
                        the four 16-bit lanes separate as long as alpha < 8192
                        (TODO confirm alpha <= 255) */
2363                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2364           
2365                 /* Setup the 555 color channel masks */
2366                 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2367                 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2368                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2369
2370                 while(height--) {
2371                         DUFFS_LOOP_QUATRO2(
2372                         { /* first body: one pixel, scalar */
2373                                 s = *srcp++;
2374                                 d = *dstp;
2375                                 /*
2376                                  * shift out the middle component (green) to
2377                                  * the high 16 bits, and process all three RGB
2378                                  * components at the same time.
2379                                  */
2380                                 s = (s | s << 16) & 0x03e07c1f;
2381                                 d = (d | d << 16) & 0x03e07c1f;
2382                                 d += (s - d) * alpha >> 5;
2383                                 d &= 0x03e07c1f;
2384                                 *dstp++ = (Uint16)(d | d >> 16);
2385                         },{ /* second body: two pixels, scalar (the same blend twice) */
2386                                 s = *srcp++;
2387                                 d = *dstp;
2388                                 /*
2389                                  * shift out the middle component (green) to
2390                                  * the high 16 bits, and process all three RGB
2391                                  * components at the same time.
2392                                  */
2393                                 s = (s | s << 16) & 0x03e07c1f;
2394                                 d = (d | d << 16) & 0x03e07c1f;
2395                                 d += (s - d) * alpha >> 5;
2396                                 d &= 0x03e07c1f;
2397                                 *dstp++ = (Uint16)(d | d >> 16);
2398                                 s = *srcp++;
2399                                 d = *dstp;
2400                                 /*
2401                                  * shift out the middle component (green) to
2402                                  * the high 16 bits, and process all three RGB
2403                                  * components at the same time.
2404                                  */
2405                                 s = (s | s << 16) & 0x03e07c1f;
2406                                 d = (d | d << 16) & 0x03e07c1f;
2407                                 d += (s - d) * alpha >> 5;
2408                                 d &= 0x03e07c1f;
2409                                 *dstp++ = (Uint16)(d | d >> 16);
2410                         },{ /* third body: four pixels, MMX, one channel at a time */
2411                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2412                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2413
2414                                 /* red -- process the bits in place */
2415                                 src2 = src1;
2416                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2417
2418                                 dst2 = dst1;
2419                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2420
2421                                 /* blend */
2422                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2423                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2424                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2425                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2426                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2; drops the product bits left below the red field */
2427
2428                                 mm_res = dst2; /* RED -> mm_res */
2429                                 
2430                                 /* green -- process the bits in place */
2431                                 src2 = src1;
2432                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2433
2434                                 dst2 = dst1;
2435                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2436
2437                                 /* blend */
2438                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2439                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2440                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2441                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2442
2443                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2444
2445                                 /* blue */
2446                                 src2 = src1; /* src -> src2 */
2447                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2448
2449                                 dst2 = dst1; /* dst -> dst2 */
2450                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2451
2452                                 /* blend */
2453                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2454                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2455                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2456                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2457                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2458
2459                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2460
2461                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2462
2463                                 srcp += 4;
2464                                 dstp += 4;
2465                         }, width);                      
2466                         srcp += srcskip; /* step over the part of the row outside the blit */
2467                         dstp += dstskip;
2468                 }
2469                 _mm_empty(); /* leave MMX state once all rows are done */
2470         }
2471 }
2472 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2473
2474 /* fast RGB565->RGB565 blending with surface alpha */
2475 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2476 {
2477         unsigned alpha = info->src->alpha;
2478         if(alpha == 128) {
2479                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2480         } else {
2481                 int width = info->d_width;
2482                 int height = info->d_height;
2483                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2484                 int srcskip = info->s_skip >> 1;
2485                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2486                 int dstskip = info->d_skip >> 1;
2487                 alpha >>= 3;    /* downscale alpha to 5 bits */
2488
2489                 while(height--) {
2490                         DUFFS_LOOP4({
2491                                 Uint32 s = *srcp++;
2492                                 Uint32 d = *dstp;
2493                                 /*
2494                                  * shift out the middle component (green) to
2495                                  * the high 16 bits, and process all three RGB
2496                                  * components at the same time.
2497                                  */
2498                                 s = (s | s << 16) & 0x07e0f81f;
2499                                 d = (d | d << 16) & 0x07e0f81f;
2500                                 d += (s - d) * alpha >> 5;
2501                                 d &= 0x07e0f81f;
2502                                 *dstp++ = (Uint16)(d | d >> 16);
2503                         }, width);
2504                         srcp += srcskip;
2505                         dstp += dstskip;
2506                 }
2507         }
2508 }
2509
2510 /* fast RGB555->RGB555 blending with surface alpha */
2511 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2512 {
2513         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2514         if(alpha == 128) {
2515                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2516         } else {
2517                 int width = info->d_width;
2518                 int height = info->d_height;
2519                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2520                 int srcskip = info->s_skip >> 1;
2521                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2522                 int dstskip = info->d_skip >> 1;
2523                 alpha >>= 3;            /* downscale alpha to 5 bits */
2524
2525                 while(height--) {
2526                         DUFFS_LOOP4({
2527                                 Uint32 s = *srcp++;
2528                                 Uint32 d = *dstp;
2529                                 /*
2530                                  * shift out the middle component (green) to
2531                                  * the high 16 bits, and process all three RGB
2532                                  * components at the same time.
2533                                  */
2534                                 s = (s | s << 16) & 0x03e07c1f;
2535                                 d = (d | d << 16) & 0x03e07c1f;
2536                                 d += (s - d) * alpha >> 5;
2537                                 d &= 0x03e07c1f;
2538                                 *dstp++ = (Uint16)(d | d >> 16);
2539                         }, width);
2540                         srcp += srcskip;
2541                         dstp += dstskip;
2542                 }
2543         }
2544 }
2545
2546 /* fast ARGB8888->RGB565 blending with pixel alpha */
2547 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2548 {
2549         int width = info->d_width;
2550         int height = info->d_height;
2551         Uint32 *srcp = (Uint32 *)info->s_pixels;
2552         int srcskip = info->s_skip >> 2;
2553         Uint16 *dstp = (Uint16 *)info->d_pixels;
2554         int dstskip = info->d_skip >> 1;
2555
2556         while(height--) {
2557             DUFFS_LOOP4({
2558                 Uint32 s = *srcp;
2559                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2560                 /* FIXME: Here we special-case opaque alpha since the
2561                    compositioning used (>>8 instead of /255) doesn't handle
2562                    it correctly. Also special-case alpha=0 for speed?
2563                    Benchmark this! */
2564                 if(alpha) {   
2565                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2566                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2567                   } else {
2568                     Uint32 d = *dstp;
2569                     /*
2570                      * convert source and destination to G0RAB65565
2571                      * and blend all components at the same time
2572                      */
2573                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2574                       + (s >> 3 & 0x1f);
2575                     d = (d | d << 16) & 0x07e0f81f;
2576                     d += (s - d) * alpha >> 5;
2577                     d &= 0x07e0f81f;
2578                     *dstp = (Uint16)(d | d >> 16);
2579                   }
2580                 }
2581                 srcp++;
2582                 dstp++;
2583             }, width);
2584             srcp += srcskip;
2585             dstp += dstskip;
2586         }
2587 }
2588
2589 /* fast ARGB8888->RGB555 blending with pixel alpha */
2590 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2591 {
2592         int width = info->d_width;
2593         int height = info->d_height;
2594         Uint32 *srcp = (Uint32 *)info->s_pixels;
2595         int srcskip = info->s_skip >> 2;
2596         Uint16 *dstp = (Uint16 *)info->d_pixels;
2597         int dstskip = info->d_skip >> 1;
2598
2599         while(height--) {
2600             DUFFS_LOOP4({
2601                 unsigned alpha;
2602                 Uint32 s = *srcp;
2603                 alpha = s >> 27; /* downscale alpha to 5 bits */
2604                 /* FIXME: Here we special-case opaque alpha since the
2605                    compositioning used (>>8 instead of /255) doesn't handle
2606                    it correctly. Also special-case alpha=0 for speed?
2607                    Benchmark this! */
2608                 if(alpha) {   
2609                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2610                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2611                   } else {
2612                     Uint32 d = *dstp;
2613                     /*
2614                      * convert source and destination to G0RAB65565
2615                      * and blend all components at the same time
2616                      */
2617                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2618                       + (s >> 3 & 0x1f);
2619                     d = (d | d << 16) & 0x03e07c1f;
2620                     d += (s - d) * alpha >> 5;
2621                     d &= 0x03e07c1f;
2622                     *dstp = (Uint16)(d | d >> 16);
2623                   }
2624                 }
2625                 srcp++;
2626                 dstp++;
2627             }, width);
2628             srcp += srcskip;
2629             dstp += dstskip;
2630         }
2631 }
2632
2633 /* General (slow) N->N blending with per-surface alpha */
2634 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2635 {
2636         int width = info->d_width;
2637         int height = info->d_height;
2638         Uint8 *src = info->s_pixels;
2639         int srcskip = info->s_skip;
2640         Uint8 *dst = info->d_pixels;
2641         int dstskip = info->d_skip;
2642         SDL_PixelFormat *srcfmt = info->src;
2643         SDL_PixelFormat *dstfmt = info->dst;
2644         int srcbpp = srcfmt->BytesPerPixel;
2645         int dstbpp = dstfmt->BytesPerPixel;
2646         unsigned sA = srcfmt->alpha;
2647         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2648
2649         if(sA) {
2650           while ( height-- ) {
2651             DUFFS_LOOP4(
2652             {
2653                 Uint32 Pixel;
2654                 unsigned sR;
2655                 unsigned sG;
2656                 unsigned sB;
2657                 unsigned dR;
2658                 unsigned dG;
2659                 unsigned dB;
2660                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2661                 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2662                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2663                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2664                 src += srcbpp;
2665                 dst += dstbpp;
2666             },
2667             width);
2668             src += srcskip;
2669             dst += dstskip;
2670           }
2671         }
2672 }
2673
2674 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2675 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2676 {
2677         int width = info->d_width;
2678         int height = info->d_height;
2679         Uint8 *src = info->s_pixels;
2680         int srcskip = info->s_skip;
2681         Uint8 *dst = info->d_pixels;
2682         int dstskip = info->d_skip;
2683         SDL_PixelFormat *srcfmt = info->src;
2684         SDL_PixelFormat *dstfmt = info->dst;
2685         Uint32 ckey = srcfmt->colorkey;
2686         int srcbpp = srcfmt->BytesPerPixel;
2687         int dstbpp = dstfmt->BytesPerPixel;
2688         unsigned sA = srcfmt->alpha;
2689         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2690
2691         if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2692             Uint16 *src16 = (Uint16 *)src;
2693             Uint16 *dst16 = (Uint16 *)dst;
2694             sA >>= 3;   /* downscale alpha to 5 bits */
2695             while ( height-- ) {
2696                 DUFFS_LOOP4(
2697                 {
2698                     Uint32 s;
2699                     Uint32 d;
2700                     s = *src16;
2701                     if(sA && s != ckey) {
2702                         d = *dst16;
2703                         s = (s | s << 16) & 0x07e0f81f;
2704                         d = (d | d << 16) & 0x07e0f81f;
2705                         d += (s - d) * sA >> 5;
2706                         d &= 0x07e0f81f;
2707                         *dst16 = (Uint16)(d | d >> 16);
2708                     }
2709                     src16++;
2710                     dst16++;
2711                 },
2712                 width);
2713                 src16 += srcskip / 2;
2714                 dst16 += dstskip / 2;
2715             }
2716             return;
2717         }
2718
2719         while ( height-- ) {
2720             DUFFS_LOOP4(
2721             {
2722                 Uint32 Pixel;
2723                 unsigned sR;
2724                 unsigned sG;
2725                 unsigned sB;
2726                 unsigned dR;
2727                 unsigned dG;
2728                 unsigned dB;
2729                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2730                 if(sA && Pixel != ckey) {
2731                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2732                     DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2733                     ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2734                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2735                 }
2736                 src += srcbpp;
2737                 dst += dstbpp;
2738             },
2739             width);
2740             src += srcskip;
2741             dst += dstskip;
2742         }
2743 }
2744
2745 /* General (slow) N->N blending with pixel alpha */
2746 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2747 {
2748         int width = info->d_width;
2749         int height = info->d_height;
2750         Uint8 *src = info->s_pixels;
2751         int srcskip = info->s_skip;
2752         Uint8 *dst = info->d_pixels;
2753         int dstskip = info->d_skip;
2754         SDL_PixelFormat *srcfmt = info->src;
2755         SDL_PixelFormat *dstfmt = info->dst;
2756
2757         int  srcbpp;
2758         int  dstbpp;
2759
2760         /* Set up some basic variables */
2761         srcbpp = srcfmt->BytesPerPixel;
2762         dstbpp = dstfmt->BytesPerPixel;
2763
2764         /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2765            quite right. for <8bpp source alpha, it gets them very wrong
2766            (check all macros!)
2767            It is unclear whether there is a good general solution that doesn't
2768            need a branch (or a divide). */
2769         while ( height-- ) {
2770             DUFFS_LOOP4(
2771             {
2772                 Uint32 Pixel;
2773                 unsigned sR;
2774                 unsigned sG;
2775                 unsigned sB;
2776                 unsigned dR;
2777                 unsigned dG;
2778                 unsigned dB;
2779                 unsigned sA;
2780                 unsigned dA;
2781                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2782                 if(sA) {
2783                   DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2784                   ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2785                   ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2786                 }
2787                 src += srcbpp;
2788                 dst += dstbpp;
2789             },
2790             width);
2791             src += srcskip;
2792             dst += dstskip;
2793         }
2794 }
2795
2796
2797 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2798 {
2799     SDL_PixelFormat *sf = surface->format;
2800     SDL_PixelFormat *df = surface->map->dst->format;
2801
2802     if(sf->Amask == 0) {
2803         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2804             if(df->BytesPerPixel == 1)
2805                 return BlitNto1SurfaceAlphaKey;
2806             else
2807 #if SDL_ALTIVEC_BLITTERS
2808         if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2809             !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2810             return Blit32to32SurfaceAlphaKeyAltivec;
2811         else
2812 #endif
2813             return BlitNtoNSurfaceAlphaKey;
2814         } else {
2815             /* Per-surface alpha blits */
2816             switch(df->BytesPerPixel) {
2817             case 1:
2818                 return BlitNto1SurfaceAlpha;
2819
2820             case 2:
2821                 if(surface->map->identity) {
2822                     if(df->Gmask == 0x7e0)
2823                     {
2824 #if MMX_ASMBLIT
2825                 if(SDL_HasMMX())
2826                         return Blit565to565SurfaceAlphaMMX;
2827                 else
2828 #endif
2829                         return Blit565to565SurfaceAlpha;
2830                     }
2831                     else if(df->Gmask == 0x3e0)
2832                     {
2833 #if MMX_ASMBLIT
2834                 if(SDL_HasMMX())
2835                         return Blit555to555SurfaceAlphaMMX;
2836                 else
2837 #endif
2838                         return Blit555to555SurfaceAlpha;
2839                     }
2840                 }
2841                 return BlitNtoNSurfaceAlpha;
2842
2843             case 4:
2844                 if(sf->Rmask == df->Rmask
2845                    && sf->Gmask == df->Gmask
2846                    && sf->Bmask == df->Bmask
2847                    && sf->BytesPerPixel == 4)
2848                 {
2849 #if MMX_ASMBLIT
2850                         if(sf->Rshift % 8 == 0
2851                            && sf->Gshift % 8 == 0
2852                            && sf->Bshift % 8 == 0
2853                            && SDL_HasMMX())
2854                             return BlitRGBtoRGBSurfaceAlphaMMX;
2855 #endif
2856 #ifdef __ARM_NEON__
2857                         if(sf->Rshift % 8 == 0
2858                            && sf->Gshift % 8 == 0
2859                            && sf->Bshift % 8 == 0)
2860                         {
2861                                 return BlitARGBtoXRGBalphaS_neon;
2862                         }
2863 #endif
2864                         if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2865                         {
2866 #if SDL_ALTIVEC_BLITTERS
2867                                 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2868                                         && SDL_HasAltiVec())
2869                                         return BlitRGBtoRGBSurfaceAlphaAltivec;
2870 #endif
2871                                 return BlitRGBtoRGBSurfaceAlpha;
2872                         }
2873                 }
2874 #ifdef __ARM_NEON__
2875                 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2876                     && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2877                 {
2878                         return BlitABGRtoXRGBalphaS_neon;
2879                 }
2880 #endif
2881 #if SDL_ALTIVEC_BLITTERS
2882                 if((sf->BytesPerPixel == 4) &&
2883                    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2884                         return Blit32to32SurfaceAlphaAltivec;
2885                 else
2886 #endif
2887                         return BlitNtoNSurfaceAlpha;
2888
2889             case 3:
2890             default:
2891                 return BlitNtoNSurfaceAlpha;
2892             }
2893         }
2894     } else {
2895         /* Per-pixel alpha blits */
2896         switch(df->BytesPerPixel) {
2897         case 1:
2898             return BlitNto1PixelAlpha;
2899
2900         case 2:
2901 #if SDL_ALTIVEC_BLITTERS
2902         if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2903            df->Gmask == 0x7e0 &&
2904            df->Bmask == 0x1f && SDL_HasAltiVec())
2905             return Blit32to565PixelAlphaAltivec;
2906         else
2907 #endif
2908             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2909                && sf->Gmask == 0xff00
2910                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2911                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2912                 if(df->Gmask == 0x7e0)
2913                     return BlitARGBto565PixelAlpha;
2914                 else if(df->Gmask == 0x3e0)
2915                     return BlitARGBto555PixelAlpha;
2916             }
2917             return BlitNtoNPixelAlpha;
2918
2919         case 4:
2920             if(sf->Rmask == df->Rmask
2921                && sf->Gmask == df->Gmask
2922                && sf->Bmask == df->Bmask
2923                && sf->BytesPerPixel == 4)
2924             {
2925 #if MMX_ASMBLIT
2926                 if(sf->Rshift % 8 == 0
2927                    && sf->Gshift % 8 == 0
2928                    && sf->Bshift % 8 == 0
2929                    && sf->Ashift % 8 == 0
2930                    && sf->Aloss == 0)
2931                 {
2932                         if(SDL_Has3DNow())
2933                                 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2934                         if(SDL_HasMMX())
2935                                 return BlitRGBtoRGBPixelAlphaMMX;
2936                 }
2937 #endif
2938 #ifdef __ARM_NEON__
2939                 if(sf->Rshift % 8 == 0
2940                    && sf->Gshift % 8 == 0
2941                    && sf->Bshift % 8 == 0
2942                    && sf->Ashift % 8 == 0)
2943                 {
2944                         return BlitARGBtoXRGBalpha_neon;
2945                 }
2946 #endif
2947                 if(sf->Amask == 0xff000000)
2948                 {
2949 #if SDL_ALTIVEC_BLITTERS
2950                         if(!(surface->map->dst->flags & SDL_HWSURFACE)
2951                                 && SDL_HasAltiVec())
2952                                 return BlitRGBtoRGBPixelAlphaAltivec;
2953 #endif
2954                         return BlitRGBtoRGBPixelAlpha;
2955                 }
2956             }
2957 #ifdef __ARM_NEON__
2958             if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2959                 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2960                 && sf->Amask == 0xff000000)
2961             {
2962                 return BlitABGRtoXRGBalpha_neon;
2963             }
2964 #endif
2965 #if SDL_ALTIVEC_BLITTERS
2966             if (sf->Amask && sf->BytesPerPixel == 4 &&
2967                 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2968                 return Blit32to32PixelAlphaAltivec;
2969             else
2970 #endif
2971                 return BlitNtoNPixelAlpha;
2972
2973         case 3:
2974         default:
2975             return BlitNtoNPixelAlpha;
2976         }
2977     }
2978 }
2979