tune the preloads a bit
[sdl_omap.git] / src / video / SDL_blit_A.c
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2009 Sam Lantinga
4
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 #include "SDL_video.h"
25 #include "SDL_blit.h"
26
27 /*
28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29    Checking if _mm_free is #defined in malloc.h is the only way to
30    determine if the Processor Pack is installed, as far as I can tell.
31 */
32
/*
 * Select which hand-optimized x86 blitters get compiled.
 *   GCC_ASMBLIT  - GCC on i386/x86-64
 *   MSVC_ASMBLIT - 32-bit MSVC when mmintrin.h is available
 * MMX_ASMBLIT is defined whenever either of the two is.
 */
#if SDL_ASSEMBLY_ROUTINES
#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#    define MMX_ASMBLIT 1
#    define GCC_ASMBLIT 1
#  elif defined(_MSC_VER) && defined(_M_IX86)
#    if (_MSC_VER > 1200)
       /* Anything newer than VC6 always has mmintrin.h */
#      define HAVE_MMINTRIN_H 1
#    else
       /* VC6 or older: probe malloc.h for the _mm_free macro */
#      include <malloc.h>
#      ifdef _mm_free
#          define HAVE_MMINTRIN_H 1
#      endif
#    endif
#    if HAVE_MMINTRIN_H
#      define MMX_ASMBLIT 1
#      define MSVC_ASMBLIT 1
#    endif
#  endif
#endif /* SDL_ASSEMBLY_ROUTINES */
52
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
55 #if GCC_ASMBLIT
56 #include "mmx.h"
57 #elif MSVC_ASMBLIT
58 #include <mmintrin.h>
59 #include <mm3dnow.h>
60 #endif
61
62 /* Functions to perform alpha blended blitting */
63
#ifdef __ARM_NEON__

/*
 * Wrappers that adapt externally defined, row-at-a-time NEON routines to the
 * SDL_BlitInfo blitter signature.
 */

/*
 * make_neon_caller(name, neon_name)
 *   Declares the external routine neon_name(dst, src, count) and defines a
 *   static blitter `name` that invokes it once per row.  The source row is
 *   stepped assuming 4 bytes per pixel; the destination row is stepped using
 *   the destination format's BytesPerPixel.
 */
#define make_neon_caller(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count); \
static void name(SDL_BlitInfo *info) \
{ \
        const int cols = info->d_width; \
        int rows = info->d_height; \
        Uint8 *srow = info->s_pixels; \
        Uint8 *drow = info->d_pixels; \
        const int dst_bpp = info->dst->BytesPerPixel; \
        const int src_pitch = cols * 4 + info->s_skip; \
        const int dst_pitch = cols * dst_bpp + info->d_skip; \
\
        while ( rows-- ) { \
            /* hint the start of the next destination row before blending */ \
            __builtin_prefetch(drow + dst_pitch); \
            neon_name(drow, srow, cols); \
            srow += src_pitch; \
            drow += dst_pitch; \
        } \
}

/*
 * make_neon_callerS(name, neon_name)
 *   Same idea for routines that take an extra per-surface alpha argument:
 *   neon_name(dst, src, count, alpha).  Both rows are stepped assuming
 *   4 bytes per pixel; alpha is read from the source format.
 */
#define make_neon_callerS(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
static void name(SDL_BlitInfo *info) \
{ \
        const int cols = info->d_width; \
        int rows = info->d_height; \
        Uint8 *srow = info->s_pixels; \
        Uint8 *drow = info->d_pixels; \
        const int src_skip = info->s_skip; \
        const int dst_skip = info->d_skip; \
        unsigned surf_alpha = info->src->alpha; \
\
        while ( rows-- ) { \
            neon_name(drow, srow, cols, surf_alpha); \
            srow += cols * 4 + src_skip; \
            drow += cols * 4 + dst_skip; \
        } \
}

/* per-pixel alpha blitters */
make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
/* per-surface alpha blitters */
make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)

#endif /* __ARM_NEON__ */
114
115 /* N->1 blending with per-surface alpha */
116 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
117 {
118         int width = info->d_width;
119         int height = info->d_height;
120         Uint8 *src = info->s_pixels;
121         int srcskip = info->s_skip;
122         Uint8 *dst = info->d_pixels;
123         int dstskip = info->d_skip;
124         Uint8 *palmap = info->table;
125         SDL_PixelFormat *srcfmt = info->src;
126         SDL_PixelFormat *dstfmt = info->dst;
127         int srcbpp = srcfmt->BytesPerPixel;
128
129         const unsigned A = srcfmt->alpha;
130
131         while ( height-- ) {
132             DUFFS_LOOP4(
133             {
134                 Uint32 Pixel;
135                 unsigned sR;
136                 unsigned sG;
137                 unsigned sB;
138                 unsigned dR;
139                 unsigned dG;
140                 unsigned dB;
141                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
142                 dR = dstfmt->palette->colors[*dst].r;
143                 dG = dstfmt->palette->colors[*dst].g;
144                 dB = dstfmt->palette->colors[*dst].b;
145                 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
146                 dR &= 0xff;
147                 dG &= 0xff;
148                 dB &= 0xff;
149                 /* Pack RGB into 8bit pixel */
150                 if ( palmap == NULL ) {
151                     *dst =((dR>>5)<<(3+2))|
152                           ((dG>>5)<<(2))|
153                           ((dB>>6)<<(0));
154                 } else {
155                     *dst = palmap[((dR>>5)<<(3+2))|
156                                   ((dG>>5)<<(2))  |
157                                   ((dB>>6)<<(0))];
158                 }
159                 dst++;
160                 src += srcbpp;
161             },
162             width);
163             src += srcskip;
164             dst += dstskip;
165         }
166 }
167
168 /* N->1 blending with pixel alpha */
169 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
170 {
171         int width = info->d_width;
172         int height = info->d_height;
173         Uint8 *src = info->s_pixels;
174         int srcskip = info->s_skip;
175         Uint8 *dst = info->d_pixels;
176         int dstskip = info->d_skip;
177         Uint8 *palmap = info->table;
178         SDL_PixelFormat *srcfmt = info->src;
179         SDL_PixelFormat *dstfmt = info->dst;
180         int srcbpp = srcfmt->BytesPerPixel;
181
182         /* FIXME: fix alpha bit field expansion here too? */
183         while ( height-- ) {
184             DUFFS_LOOP4(
185             {
186                 Uint32 Pixel;
187                 unsigned sR;
188                 unsigned sG;
189                 unsigned sB;
190                 unsigned sA;
191                 unsigned dR;
192                 unsigned dG;
193                 unsigned dB;
194                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
195                 dR = dstfmt->palette->colors[*dst].r;
196                 dG = dstfmt->palette->colors[*dst].g;
197                 dB = dstfmt->palette->colors[*dst].b;
198                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
199                 dR &= 0xff;
200                 dG &= 0xff;
201                 dB &= 0xff;
202                 /* Pack RGB into 8bit pixel */
203                 if ( palmap == NULL ) {
204                     *dst =((dR>>5)<<(3+2))|
205                           ((dG>>5)<<(2))|
206                           ((dB>>6)<<(0));
207                 } else {
208                     *dst = palmap[((dR>>5)<<(3+2))|
209                                   ((dG>>5)<<(2))  |
210                                   ((dB>>6)<<(0))  ];
211                 }
212                 dst++;
213                 src += srcbpp;
214             },
215             width);
216             src += srcskip;
217             dst += dstskip;
218         }
219 }
220
221 /* colorkeyed N->1 blending with per-surface alpha */
222 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
223 {
224         int width = info->d_width;
225         int height = info->d_height;
226         Uint8 *src = info->s_pixels;
227         int srcskip = info->s_skip;
228         Uint8 *dst = info->d_pixels;
229         int dstskip = info->d_skip;
230         Uint8 *palmap = info->table;
231         SDL_PixelFormat *srcfmt = info->src;
232         SDL_PixelFormat *dstfmt = info->dst;
233         int srcbpp = srcfmt->BytesPerPixel;
234         Uint32 ckey = srcfmt->colorkey;
235
236         const int A = srcfmt->alpha;
237
238         while ( height-- ) {
239             DUFFS_LOOP(
240             {
241                 Uint32 Pixel;
242                 unsigned sR;
243                 unsigned sG;
244                 unsigned sB;
245                 unsigned dR;
246                 unsigned dG;
247                 unsigned dB;
248                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
249                 if ( Pixel != ckey ) {
250                     dR = dstfmt->palette->colors[*dst].r;
251                     dG = dstfmt->palette->colors[*dst].g;
252                     dB = dstfmt->palette->colors[*dst].b;
253                     ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
254                     dR &= 0xff;
255                     dG &= 0xff;
256                     dB &= 0xff;
257                     /* Pack RGB into 8bit pixel */
258                     if ( palmap == NULL ) {
259                         *dst =((dR>>5)<<(3+2))|
260                               ((dG>>5)<<(2)) |
261                               ((dB>>6)<<(0));
262                     } else {
263                         *dst = palmap[((dR>>5)<<(3+2))|
264                                       ((dG>>5)<<(2))  |
265                                       ((dB>>6)<<(0))  ];
266                     }
267                 }
268                 dst++;
269                 src += srcbpp;
270             },
271             width);
272             src += srcskip;
273             dst += dstskip;
274         }
275 }
276
277 #if GCC_ASMBLIT
278 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
279 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
280 {
281         int width = info->d_width;
282         int height = info->d_height;
283         Uint32 *srcp = (Uint32 *)info->s_pixels;
284         int srcskip = info->s_skip >> 2;
285         Uint32 *dstp = (Uint32 *)info->d_pixels;
286         int dstskip = info->d_skip >> 2;
287         Uint32 dalpha = info->dst->Amask;
288         Uint64 load;
289
290         load = 0x00fefefe00fefefeULL;/* alpha128 mask */
291         movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
292         load = 0x0001010100010101ULL;/* !alpha128 mask */
293         movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
294         movd_m2r(dalpha, mm7); /* dst alpha mask */
295         punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
296         while(height--) {
297                 DUFFS_LOOP_DOUBLE2(
298                 {
299                         Uint32 s = *srcp++;
300                         Uint32 d = *dstp;
301                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
302                                    + (s & d & 0x00010101)) | dalpha;
303                 },{
304                         movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
305                         movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
306
307                         movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
308                         movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
309
310                         pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
311                         pand_r2r(mm4, mm5); /* src & mask -> mm5 */
312                         paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
313                         pand_r2r(mm1, mm2); /* src & dst -> mm2 */
314                         psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
315                         pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
316                         paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
317                         
318                         por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
319                         movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
320                         dstp += 2;
321                         srcp += 2;
322                 }, width);
323                 srcp += srcskip;
324                 dstp += dstskip;
325         }
326         emms();
327 }
328
329 /* fast RGB888->(A)RGB888 blending with surface alpha */
330 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
331 {
332         SDL_PixelFormat* df = info->dst;
333         unsigned alpha = info->src->alpha;
334
335         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
336                         /* only call a128 version when R,G,B occupy lower bits */
337                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
338         } else {
339                 int width = info->d_width;
340                 int height = info->d_height;
341                 Uint32 *srcp = (Uint32 *)info->s_pixels;
342                 int srcskip = info->s_skip >> 2;
343                 Uint32 *dstp = (Uint32 *)info->d_pixels;
344                 int dstskip = info->d_skip >> 2;
345
346                 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
347                 /* form the alpha mult */
348                 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
349                 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
350                 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
351                 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
352                 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
353                 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
354                 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
355                         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
356                 movd_m2r(df->Amask, mm7); /* dst alpha mask */
357                 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
358                 
359                 while(height--) {
360                         DUFFS_LOOP_DOUBLE2({
361                                 /* One Pixel Blend */
362                                 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
363                                 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
364                                 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
365                                 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
366
367                                 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
368                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
369                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
370                                 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
371
372                                 packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
373                                 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
374                                 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
375                                 ++srcp;
376                                 ++dstp;
377                         },{
378                                 /* Two Pixels Blend */
379                                 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
380                                 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
381                                 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
382                                 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
383
384                                 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
385                                 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
386                                 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
387                                 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
388
389                                 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
390                                 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
391                                 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
392                                 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
393
394                                 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
395                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
396                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
397                                 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
398
399                                 packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
400                                 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
401                                 
402                                 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
403
404                                 srcp += 2;
405                                 dstp += 2;
406                         }, width);
407                         srcp += srcskip;
408                         dstp += dstskip;
409                 }
410                 emms();
411         }
412 }
413
414 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
415 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
416 {
417         int width = info->d_width;
418         int height = info->d_height;
419         Uint32 *srcp = (Uint32 *)info->s_pixels;
420         int srcskip = info->s_skip >> 2;
421         Uint32 *dstp = (Uint32 *)info->d_pixels;
422         int dstskip = info->d_skip >> 2;
423         SDL_PixelFormat* sf = info->src;
424         Uint32 amask = sf->Amask;
425
426         pxor_r2r(mm6, mm6); /* 0 -> mm6 */
427         /* form multiplication mask */
428         movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
429         punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
430         pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
431         movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
432         pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
433         /* form channel masks */
434         movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
435         packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
436         packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
437         pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
438         /* get alpha channel shift */
439         __asm__ __volatile__ (
440                 "movd %0, %%mm5"
441                 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
442
443         while(height--) {
444             DUFFS_LOOP4({
445                 Uint32 alpha = *srcp & amask;
446                 /* FIXME: Here we special-case opaque alpha since the
447                         compositioning used (>>8 instead of /255) doesn't handle
448                         it correctly. Also special-case alpha=0 for speed?
449                         Benchmark this! */
450                 if(alpha == 0) {
451                         /* do nothing */
452                 } else if(alpha == amask) {
453                         /* opaque alpha -- copy RGB, keep dst alpha */
454                         /* using MMX here to free up regular registers for other things */
455                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
456                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
457                         pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
458                         pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
459                         por_r2r(mm1, mm2); /* src | dst -> mm2 */
460                         movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
461                 } else {
462                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
463                         punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
464
465                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
466                         punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
467
468                         __asm__ __volatile__ (
469                                 "movd %0, %%mm4"
470                                 : : "r" (alpha) ); /* 0000A000 -> mm4 */
471                         psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
472                         punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
473                         punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
474                         pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
475
476                         /* blend */                 
477                         psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
478                         pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
479                         psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
480                         paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
481                         
482                         packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
483                         movd_r2m(mm2, *dstp);/* mm2 -> dst */
484                 }
485                 ++srcp;
486                 ++dstp;
487             }, width);
488             srcp += srcskip;
489             dstp += dstskip;
490         }
491         emms();
492 }
493 /* End GCC_ASMBLIT */
494
495 #elif MSVC_ASMBLIT
496 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
497 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
498 {
499         int width = info->d_width;
500         int height = info->d_height;
501         Uint32 *srcp = (Uint32 *)info->s_pixels;
502         int srcskip = info->s_skip >> 2;
503         Uint32 *dstp = (Uint32 *)info->d_pixels;
504         int dstskip = info->d_skip >> 2;
505         Uint32 dalpha = info->dst->Amask;
506
507         __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
508         
509         hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
510         lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
511         dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
512
513         while (height--) {
514                 int n = width;
515                 if ( n & 1 ) {
516                         Uint32 s = *srcp++;
517                         Uint32 d = *dstp;
518                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
519                                    + (s & d & 0x00010101)) | dalpha;
520                         n--;
521                 }
522                 
523                 for (n >>= 1; n > 0; --n) {
524                         dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
525                         dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
526
527                         src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
528                         src2 = src1; /* 2 x src -> src2(ARGBARGB) */
529
530                         dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
531                         src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
532                         src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
533                         src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
534
535                         dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
536                         dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
537                         dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
538                         dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
539                         
540                         *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
541                         dstp += 2;
542                         srcp += 2;
543                 }
544                 
545                 srcp += srcskip;
546                 dstp += dstskip;
547         }
548         _mm_empty();
549 }
550
551 /* fast RGB888->(A)RGB888 blending with surface alpha */
552 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
553 {
554         SDL_PixelFormat* df = info->dst;
555         Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
556         unsigned alpha = info->src->alpha;
557
558         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
559                         /* only call a128 version when R,G,B occupy lower bits */
560                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
561         } else {
562                 int width = info->d_width;
563                 int height = info->d_height;
564                 Uint32 *srcp = (Uint32 *)info->s_pixels;
565                 int srcskip = info->s_skip >> 2;
566                 Uint32 *dstp = (Uint32 *)info->d_pixels;
567                 int dstskip = info->d_skip >> 2;
568                 Uint32 dalpha = df->Amask;
569                 Uint32 amult;
570
571                 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
572                 
573                 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
574                 /* form the alpha mult */
575                 amult = alpha | (alpha << 8);
576                 amult = amult | (amult << 16);
577                 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
578                 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
579                 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
580                         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
581                 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
582                 
583                 while (height--) {
584                         int n = width;
585                         if (n & 1) {
586                                 /* One Pixel Blend */
587                                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
588                                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
589
590                                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
591                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
592
593                                 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
594                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
595                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
596                                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
597                                 
598                                 dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
599                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
600                                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
601
602                                 ++srcp;
603                                 ++dstp;
604                                 
605                                 n--;
606                         }
607
608                         for (n >>= 1; n > 0; --n) {
609                                 /* Two Pixels Blend */
610                                 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
611                                 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
612                                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
613                                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
614
615                                 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
616                                 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
617                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
618                                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
619
620                                 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
621                                 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
622                                 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
623                                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
624
625                                 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
626                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
627                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
628                                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
629                                 
630                                 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
631                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
632
633                                 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
634
635                                 srcp += 2;
636                                 dstp += 2;
637                         }
638                         srcp += srcskip;
639                         dstp += dstskip;
640                 }
641                 _mm_empty();
642         }
643 }
644
645 /* fast ARGB888->(A)RGB888 blending with pixel alpha; works one pixel at a time with MMX intrinsics and leaves the destination bits outside the color channels untouched */
646 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
647 {
648         int width = info->d_width;
649         int height = info->d_height;
650         Uint32 *srcp = (Uint32 *)info->s_pixels;
651         int srcskip = info->s_skip >> 2; /* >> 2 because the skip is added to a Uint32 pointer below */
652         Uint32 *dstp = (Uint32 *)info->d_pixels;
653         int dstskip = info->d_skip >> 2;
654         SDL_PixelFormat* sf = info->src;
655         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; /* every color bit of the source format, alpha excluded */
656         Uint32 amask = sf->Amask;
657         Uint32 ashift = sf->Ashift;
658         Uint64 multmask;
659
660         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
661
662         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
663         multmask = ~(0xFFFFi64 << (ashift * 2)); /* all ones except the 16-bit lane alpha lands in once bytes are widened to words (bit offset doubles) */
664         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
665
666         while(height--) {
667                 DUFFS_LOOP4({
668                 Uint32 alpha = *srcp & amask; /* still in its original bit position, so it can be compared with amask directly */
669                 if (alpha == 0) {
670                         /* do nothing */
671                 } else if (alpha == amask) {
672                         /* opaque alpha -- copy RGB, keep dst alpha */
673                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
674                 } else {
675                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
676                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
677
678                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
679                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
680
681                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
682                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
683                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
684                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
685                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
686
687                         /* blend: dst + ((src - dst) * alpha >> 8), computed in 16-bit lanes */
688                         src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
689                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
690                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
691                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
692                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
693                         
694                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
695                 }
696                 ++srcp;
697                 ++dstp;
698             }, width);
699             srcp += srcskip; /* row finished: apply the per-row skip to both pointers */
700             dstp += dstskip;
701         }
702         _mm_empty(); /* clear the MMX state before returning */
703 }
704 /* End MSVC_ASMBLIT */
705
706 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
707
708 #if SDL_ALTIVEC_BLITTERS
709 #if __MWERKS__
710 #pragma altivec_model on
711 #endif
712 #if HAVE_ALTIVEC_H
713 #include <altivec.h>
714 #endif
715 #include <assert.h>
716
717 #if (defined(__MACOSX__) && (__GNUC__ < 4)) /* this configuration gets parenthesized vector literals; every other one gets the brace form */
718     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
719         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
720     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
721         (vector unsigned short) ( a,b,c,d,e,f,g,h )
722 #else
723     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
724         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
725     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
726         (vector unsigned short) { a,b,c,d,e,f,g,h }
727 #endif
728
729 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) /* low four address bits, nonzero when x is not 16-byte aligned; NOTE(review): x is not parenthesized, so pass plain identifiers only */
730 #define VECPRINT(msg, v) do { /* debugging aid: prints v as four 32-bit hex words */ \
731     vector unsigned int tmpvec = (vector unsigned int)(v); \
732     unsigned int *vp = (unsigned int *)&tmpvec; \
733     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
734 } while (0)
735
736 /* the permutation vector that takes the high bytes out of all the appropriate shorts 
737     (vector unsigned char)(
738         0x00, 0x10, 0x02, 0x12,
739         0x04, 0x14, 0x06, 0x16,
740         0x08, 0x18, 0x0A, 0x1A,
741         0x0C, 0x1C, 0x0E, 0x1E );
742 */
743 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) /* byte indices 0x00..0x0F plus 0x000F per short gives the pattern listed above */
744 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) /* 24 in every 32-bit element, built as 12 + 12 */
745 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) /* all-ones words shifted left by 24: 0xFF000000 in every 32-bit element */
746 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) /* permute used to realign source loads; for an aligned src every index is 16..31, i.e. taken from the second vec_perm operand */ \
747     ? vec_lvsl(0, src) \
748     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
749
750    
751 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { /* vd = per-byte blend of vs over vd: vs*valpha + vd*(255-valpha), scaled back to bytes; vd is overwritten */ \
752     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
753     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
754     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
755     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
756     /* valpha2 is 255-alpha */ \
757     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
758     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
759     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
760     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
761     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
762     /* add source and dest */ \
763     vtemp1 = vec_add(vtemp1, vtemp3); \
764     vtemp2 = vec_add(vtemp2, vtemp4); \
765     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
766     vtemp1 = vec_add(vtemp1, v1_16); \
767     vtemp3 = vec_sr(vtemp1, v8_16); \
768     vtemp1 = vec_add(vtemp1, vtemp3); \
769     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
770     vtemp2 = vec_add(vtemp2, v1_16); \
771     vtemp4 = vec_sr(vtemp2, v8_16); \
772     vtemp2 = vec_add(vtemp2, vtemp4); \
773     /* (>>8) and get ARGBARGBARGBARGB: the permute keeps only the high byte of every short */ \
774     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
775 } while (0)
776  
777 /* Calculate the permute vector used for 32->32 swizzling; a NULL srcfmt or dstfmt stands for the default ARGB layout defined below */
778 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
779                                   const SDL_PixelFormat *dstfmt)
780 {
781     /*
782      * We have to assume that the bits that aren't used by other
783      *  colors is alpha, and it's one complete byte, since some formats
784      *  leave alpha with a zero mask, but we should still swizzle the bits.
785      */
786     /* ARGB */
787     const static struct SDL_PixelFormat default_pixel_format = {
788         NULL, 0, 0,
789         0, 0, 0, 0,
790         16, 8, 0, 24,
791         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
792         0, 0};
793     if (!srcfmt) {
794         srcfmt = &default_pixel_format;
795     }
796     if (!dstfmt) {
797         dstfmt = &default_pixel_format;
798     }
799     const vector unsigned char plus = VECUINT8_LITERAL /* per-pixel base offsets 0, 4, 8, 12 so one 4-byte pattern serves all four pixels */
800                                             ( 0x00, 0x00, 0x00, 0x00,
801                                               0x04, 0x04, 0x04, 0x04,
802                                               0x08, 0x08, 0x08, 0x08,
803                                               0x0C, 0x0C, 0x0C, 0x0C );
804     vector unsigned char vswiz;
805     vector unsigned int srcvec;
806 #define RESHIFT(X) (3 - ((X) >> 3)) /* channel bit shift -> byte position counted from the most significant byte */
807     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); /* source byte index placed at the destination channel's byte */
808     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
809     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
810     Uint32 amask;
811     /* Use zero for alpha if either surface doesn't have alpha; index 0x10 selects from the second vec_perm operand, which the caller supplies */
812     if (dstfmt->Amask) {
813         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
814     } else {
815         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); /* 0x10 in every byte the destination's color masks do not cover */
816     }
817 #undef RESHIFT  
818     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask); /* only element 0 is written; the splat below copies it to all four */
819     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
820     return(vswiz);
821 }
822
823 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
824 {
825     int height = info->d_height;
826     Uint8 *src = (Uint8 *)info->s_pixels;
827     int srcskip = info->s_skip;
828     Uint8 *dst = (Uint8 *)info->d_pixels;
829     int dstskip = info->d_skip;
830     SDL_PixelFormat *srcfmt = info->src;
831
832     vector unsigned char v0 = vec_splat_u8(0);
833     vector unsigned short v8_16 = vec_splat_u16(8);
834     vector unsigned short v1_16 = vec_splat_u16(1);
835     vector unsigned short v2_16 = vec_splat_u16(2);
836     vector unsigned short v3_16 = vec_splat_u16(3);
837     vector unsigned int v8_32 = vec_splat_u32(8);
838     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
839     vector unsigned short v3f = VECUINT16_LITERAL(
840         0x003f, 0x003f, 0x003f, 0x003f,
841         0x003f, 0x003f, 0x003f, 0x003f);
842     vector unsigned short vfc = VECUINT16_LITERAL(
843         0x00fc, 0x00fc, 0x00fc, 0x00fc,
844         0x00fc, 0x00fc, 0x00fc, 0x00fc);
845
846     /* 
847         0x10 - 0x1f is the alpha
848         0x00 - 0x0e evens are the red
849         0x01 - 0x0f odds are zero
850     */
851     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
852         0x10, 0x00, 0x01, 0x01,
853         0x10, 0x02, 0x01, 0x01,
854         0x10, 0x04, 0x01, 0x01,
855         0x10, 0x06, 0x01, 0x01
856     );
857     vector unsigned char vredalpha2 = (vector unsigned char)(
858         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
859     );
860     /*
861         0x00 - 0x0f is ARxx ARxx ARxx ARxx
862         0x11 - 0x0f odds are blue
863     */
864     vector unsigned char vblue1 = VECUINT8_LITERAL(
865         0x00, 0x01, 0x02, 0x11,
866         0x04, 0x05, 0x06, 0x13,
867         0x08, 0x09, 0x0a, 0x15,
868         0x0c, 0x0d, 0x0e, 0x17
869     );
870     vector unsigned char vblue2 = (vector unsigned char)(
871         vec_add((vector unsigned int)vblue1, v8_32)
872     );
873     /*
874         0x00 - 0x0f is ARxB ARxB ARxB ARxB
875         0x10 - 0x0e evens are green
876     */
877     vector unsigned char vgreen1 = VECUINT8_LITERAL(
878         0x00, 0x01, 0x10, 0x03,
879         0x04, 0x05, 0x12, 0x07,
880         0x08, 0x09, 0x14, 0x0b,
881         0x0c, 0x0d, 0x16, 0x0f
882     );
883     vector unsigned char vgreen2 = (vector unsigned char)(
884         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
885     );
886     vector unsigned char vgmerge = VECUINT8_LITERAL(
887         0x00, 0x02, 0x00, 0x06,
888         0x00, 0x0a, 0x00, 0x0e,
889         0x00, 0x12, 0x00, 0x16,
890         0x00, 0x1a, 0x00, 0x1e);
891     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
892     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
893     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
894
895     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
896     vf800 = vec_sl(vf800, vec_splat_u16(8));
897
898     while(height--) {
899         int extrawidth;
900         vector unsigned char valigner;
901         vector unsigned char vsrc;
902         vector unsigned char voverflow;
903         int width = info->d_width;
904
905 #define ONE_PIXEL_BLEND(condition, widthvar) \
906         while (condition) { \
907             Uint32 Pixel; \
908             unsigned sR, sG, sB, dR, dG, dB, sA; \
909             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
910             if(sA) { \
911                 unsigned short dstpixel = *((unsigned short *)dst); \
912                 dR = (dstpixel >> 8) & 0xf8; \
913                 dG = (dstpixel >> 3) & 0xfc; \
914                 dB = (dstpixel << 3) & 0xf8; \
915                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
916                 *((unsigned short *)dst) = ( \
917                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
918                 ); \
919             } \
920             src += 4; \
921             dst += 2; \
922             widthvar--; \
923         }
924         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
925         extrawidth = (width % 8);
926         valigner = VEC_ALIGNER(src);
927         vsrc = (vector unsigned char)vec_ld(0, src);
928         width -= extrawidth;
929         while (width) {
930             vector unsigned char valpha;
931             vector unsigned char vsrc1, vsrc2;
932             vector unsigned char vdst1, vdst2;
933             vector unsigned short vR, vG, vB;
934             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
935
936             /* Load 8 pixels from src as ARGB */
937             voverflow = (vector unsigned char)vec_ld(15, src);
938             vsrc = vec_perm(vsrc, voverflow, valigner);
939             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
940             src += 16;
941             vsrc = (vector unsigned char)vec_ld(15, src);
942             voverflow = vec_perm(voverflow, vsrc, valigner);
943             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
944             src += 16;
945
946             /* Load 8 pixels from dst as XRGB */
947             voverflow = vec_ld(0, dst);
948             vR = vec_and((vector unsigned short)voverflow, vf800);
949             vB = vec_sl((vector unsigned short)voverflow, v3_16);
950             vG = vec_sl(vB, v2_16);
951             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
952             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
953             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
954             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
955             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
956             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
957
958             /* Alpha blend 8 pixels as ARGB */
959             valpha = vec_perm(vsrc1, v0, valphaPermute);
960             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
961             valpha = vec_perm(vsrc2, v0, valphaPermute);
962             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
963
964             /* Convert 8 pixels to 565 */
965             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
966             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
967             vgpixel = vec_and(vgpixel, vfc);
968             vgpixel = vec_sl(vgpixel, v3_16);
969             vrpixel = vec_sl(vpixel, v1_16);
970             vrpixel = vec_and(vrpixel, vf800);
971             vbpixel = vec_and(vpixel, v3f);
972             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
973             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
974             
975             /* Store 8 pixels */
976             vec_st(vdst1, 0, dst);
977
978             width -= 8;
979             dst += 16;
980         }
981         ONE_PIXEL_BLEND((extrawidth), extrawidth);
982 #undef ONE_PIXEL_BLEND
983         src += srcskip;
984         dst += dstskip;
985     }
986 }
987
988 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
989 {
990     unsigned alpha = info->src->alpha;
991     int height = info->d_height;
992     Uint32 *srcp = (Uint32 *)info->s_pixels;
993     int srcskip = info->s_skip >> 2;
994     Uint32 *dstp = (Uint32 *)info->d_pixels;
995     int dstskip = info->d_skip >> 2;
996     SDL_PixelFormat *srcfmt = info->src;
997     SDL_PixelFormat *dstfmt = info->dst;
998     unsigned sA = srcfmt->alpha;
999     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1000     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1001     Uint32 ckey = info->src->colorkey;
1002     vector unsigned char mergePermute;
1003     vector unsigned char vsrcPermute;
1004     vector unsigned char vdstPermute;
1005     vector unsigned char vsdstPermute;
1006     vector unsigned char valpha;
1007     vector unsigned char valphamask;
1008     vector unsigned char vbits;
1009     vector unsigned char v0;
1010     vector unsigned short v1;
1011     vector unsigned short v8;
1012     vector unsigned int vckey;
1013     vector unsigned int vrgbmask;
1014
1015     mergePermute = VEC_MERGE_PERMUTE();
1016     v0 = vec_splat_u8(0);
1017     v1 = vec_splat_u16(1);
1018     v8 = vec_splat_u16(8);
1019
1020     /* set the alpha to 255 on the destination surf */
1021     valphamask = VEC_ALPHA_MASK();
1022
1023     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1024     vdstPermute = calc_swizzle32(NULL, dstfmt);
1025     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1026
1027     /* set a vector full of alpha and 255-alpha */
1028     ((unsigned char *)&valpha)[0] = alpha;
1029     valpha = vec_splat(valpha, 0);
1030     vbits = (vector unsigned char)vec_splat_s8(-1);
1031
1032     ckey &= rgbmask;
1033     ((unsigned int *)(char*)&vckey)[0] = ckey;
1034     vckey = vec_splat(vckey, 0);
1035     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1036     vrgbmask = vec_splat(vrgbmask, 0);
1037
1038     while(height--) {
1039         int width = info->d_width;
1040 #define ONE_PIXEL_BLEND(condition, widthvar) \
1041         while (condition) { \
1042             Uint32 Pixel; \
1043             unsigned sR, sG, sB, dR, dG, dB; \
1044             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1045             if(sA && Pixel != ckey) { \
1046                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1047                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1048                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1049                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1050             } \
1051             dstp++; \
1052             srcp++; \
1053             widthvar--; \
1054         }
1055         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1056         if (width > 0) {
1057             int extrawidth = (width % 4);
1058             vector unsigned char valigner = VEC_ALIGNER(srcp);
1059             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1060             width -= extrawidth;
1061             while (width) {
1062                 vector unsigned char vsel;
1063                 vector unsigned char voverflow;
1064                 vector unsigned char vd;
1065                 vector unsigned char vd_orig;
1066
1067                 /* s = *srcp */
1068                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1069                 vs = vec_perm(vs, voverflow, valigner);
1070                 
1071                 /* vsel is set for items that match the key */
1072                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1073                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1074
1075                 /* permute to source format */
1076                 vs = vec_perm(vs, valpha, vsrcPermute);
1077
1078                 /* d = *dstp */
1079                 vd = (vector unsigned char)vec_ld(0, dstp);
1080                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1081
1082                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1083
1084                 /* set the alpha channel to full on */
1085                 vd = vec_or(vd, valphamask);
1086
1087                 /* mask out color key */
1088                 vd = vec_sel(vd, vd_orig, vsel);
1089                 
1090                 /* permute to dest format */
1091                 vd = vec_perm(vd, vbits, vdstPermute);
1092
1093                 /* *dstp = res */
1094                 vec_st((vector unsigned int)vd, 0, dstp);
1095                 
1096                 srcp += 4;
1097                 dstp += 4;
1098                 width -= 4;
1099                 vs = voverflow;
1100             }
1101             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1102         }
1103 #undef ONE_PIXEL_BLEND
1104  
1105         srcp += srcskip;
1106         dstp += dstskip;
1107     }
1108 }
1109
1110 /* 32-bit to 32-bit blit using the source's per-pixel alpha, for arbitrary channel layouts (both formats are swizzled through ARGB); the destination's own alpha value is kept */
1111 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1112 {
1113     int width = info->d_width;
1114     int height = info->d_height;
1115     Uint32 *srcp = (Uint32 *)info->s_pixels;
1116     int srcskip = info->s_skip >> 2; /* >> 2 because the skip is added to a Uint32 pointer */
1117     Uint32 *dstp = (Uint32 *)info->d_pixels;
1118     int dstskip = info->d_skip >> 2;
1119     SDL_PixelFormat *srcfmt = info->src;
1120     SDL_PixelFormat *dstfmt = info->dst;
1121     vector unsigned char mergePermute;
1122     vector unsigned char valphaPermute;
1123     vector unsigned char vsrcPermute;
1124     vector unsigned char vdstPermute;
1125     vector unsigned char vsdstPermute;
1126     vector unsigned char valphamask;
1127     vector unsigned char vpixelmask;
1128     vector unsigned char v0;
1129     vector unsigned short v1;
1130     vector unsigned short v8;
1131
1132     v0 = vec_splat_u8(0);
1133     v1 = vec_splat_u16(1);
1134     v8 = vec_splat_u16(8);
1135     mergePermute = VEC_MERGE_PERMUTE();
1136     valphamask = VEC_ALPHA_MASK();
1137     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); /* indices 0,0,0,0,4,4,4,4,...: spread each pixel's first byte (alpha) over its four bytes */
1138     vpixelmask = vec_nor(valphamask, v0); /* complement of the alpha mask: the three color bytes */
1139     vsrcPermute = calc_swizzle32(srcfmt, NULL); /* source format -> ARGB */
1140     vdstPermute = calc_swizzle32(NULL, dstfmt); /* ARGB -> destination format */
1141     vsdstPermute = calc_swizzle32(dstfmt, NULL); /* destination format -> ARGB */
1142
1143         while ( height-- ) {
1144         width = info->d_width;
1145 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { /* scalar blend of one pixel per pass; pixels with zero source alpha are skipped */ \
1146             Uint32 Pixel; \
1147             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1148             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1149             if(sA) { \
1150               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1151               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1152               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1153             } \
1154             ++srcp; \
1155             ++dstp; \
1156             widthvar--; \
1157         }
1158         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* head: single pixels until dstp is 16-byte aligned or the row is used up */
1159         if (width > 0) {
1160             /* vsrcPermute */
1161             /* vdstPermute */
1162             int extrawidth = (width % 4); /* tail pixels that do not fill a group of four */
1163             vector unsigned char valigner = VEC_ALIGNER(srcp);
1164             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1165             width -= extrawidth;
1166             while (width) {
1167                 vector unsigned char voverflow;
1168                 vector unsigned char vd;
1169                 vector unsigned char valpha;
1170                 vector unsigned char vdstalpha;
1171                 /* s = *srcp */
1172                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1173                 vs = vec_perm(vs, voverflow, valigner);
1174                 vs = vec_perm(vs, v0, vsrcPermute);
1175
1176                 valpha = vec_perm(vs, v0, valphaPermute);
1177                 
1178                 /* d = *dstp */
1179                 vd = (vector unsigned char)vec_ld(0, dstp);
1180                 vd = vec_perm(vd, v0, vsdstPermute);
1181                 vdstalpha = vec_and(vd, valphamask); /* remember the destination alpha; it is put back after the blend */
1182
1183                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1184
1185                 /* set the alpha to the dest alpha */
1186                 vd = vec_and(vd, vpixelmask);
1187                 vd = vec_or(vd, vdstalpha);
1188                 vd = vec_perm(vd, v0, vdstPermute);
1189
1190                 /* *dstp = res */
1191                 vec_st((vector unsigned int)vd, 0, dstp);
1192                 
1193                 srcp += 4;
1194                 dstp += 4;
1195                 width -= 4;
1196                 vs = voverflow; /* the block just loaded becomes the leading half of the next source read */
1197
1198             }
1199             ONE_PIXEL_BLEND((extrawidth), extrawidth); /* tail: the pixels left after the groups of four */
1200         }
1201             srcp += srcskip;
1202             dstp += dstskip;
1203 #undef ONE_PIXEL_BLEND
1204         }
1205 }
1206
1207 /* fast ARGB888->(A)RGB888 blending with pixel alpha; no swizzling, the alpha byte is taken from the top 8 bits of each source pixel and the destination's top byte is preserved */
1208 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1209 {
1210         int width = info->d_width;
1211         int height = info->d_height;
1212         Uint32 *srcp = (Uint32 *)info->s_pixels;
1213         int srcskip = info->s_skip >> 2; /* >> 2 because the skip is added to a Uint32 pointer */
1214         Uint32 *dstp = (Uint32 *)info->d_pixels;
1215         int dstskip = info->d_skip >> 2;
1216     vector unsigned char mergePermute;
1217     vector unsigned char valphaPermute;
1218     vector unsigned char valphamask;
1219     vector unsigned char vpixelmask;
1220     vector unsigned char v0;
1221     vector unsigned short v1;
1222     vector unsigned short v8;
1223     v0 = vec_splat_u8(0);
1224     v1 = vec_splat_u16(1);
1225     v8 = vec_splat_u16(8);
1226     mergePermute = VEC_MERGE_PERMUTE();
1227     valphamask = VEC_ALPHA_MASK();
1228     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); /* indices 0,0,0,0,4,4,4,4,...: spread each pixel's first byte (alpha) over its four bytes */
1229     
1230  
1231     vpixelmask = vec_nor(valphamask, v0); /* complement of the alpha mask: the three color bytes */
1232         while(height--) {
1233         width = info->d_width;
1234 #define ONE_PIXEL_BLEND(condition, widthvar) /* scalar path: skip alpha 0, copy RGB for opaque alpha, otherwise blend the 0xff00ff pair and the 0xff00 channel separately */ \
1235         while ((condition)) { \
1236             Uint32 dalpha; \
1237             Uint32 d; \
1238             Uint32 s1; \
1239             Uint32 d1; \
1240             Uint32 s = *srcp; \
1241             Uint32 alpha = s >> 24; \
1242             if(alpha) { \
1243               if(alpha == SDL_ALPHA_OPAQUE) { \
1244                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1245               } else { \
1246                 d = *dstp; \
1247                 dalpha = d & 0xff000000; \
1248                 s1 = s & 0xff00ff; \
1249                 d1 = d & 0xff00ff; \
1250                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1251                 s &= 0xff00; \
1252                 d &= 0xff00; \
1253                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1254                 *dstp = d1 | d | dalpha; \
1255               } \
1256             } \
1257             ++srcp; \
1258             ++dstp; \
1259             widthvar--; \
1260             }
1261         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* head: single pixels until dstp is 16-byte aligned or the row is used up */
1262         if (width > 0) {
1263             int extrawidth = (width % 4); /* tail pixels that do not fill a group of four */
1264             vector unsigned char valigner = VEC_ALIGNER(srcp);
1265             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1266             width -= extrawidth;
1267             while (width) {
1268                 vector unsigned char voverflow;
1269                 vector unsigned char vd;
1270                 vector unsigned char valpha;
1271                 vector unsigned char vdstalpha;
1272                 /* s = *srcp */
1273                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1274                 vs = vec_perm(vs, voverflow, valigner);
1275
1276                 valpha = vec_perm(vs, v0, valphaPermute);
1277                 
1278                 /* d = *dstp */
1279                 vd = (vector unsigned char)vec_ld(0, dstp);
1280                 vdstalpha = vec_and(vd, valphamask); /* remember the destination alpha; it is put back after the blend */
1281
1282                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1283
1284                 /* set the alpha to the dest alpha */
1285                 vd = vec_and(vd, vpixelmask);
1286                 vd = vec_or(vd, vdstalpha);
1287
1288                 /* *dstp = res */
1289                 vec_st((vector unsigned int)vd, 0, dstp);
1290                 
1291                 srcp += 4;
1292                 dstp += 4;
1293                 width -= 4;
1294                 vs = voverflow; /* the block just loaded becomes the leading half of the next source read */
1295             }
1296             ONE_PIXEL_BLEND((extrawidth), extrawidth); /* tail: the pixels left after the groups of four */
1297         }
1298             srcp += srcskip;
1299             dstp += dstskip;
1300         }
1301 #undef ONE_PIXEL_BLEND
1302 }
1303 /* 32-bit to 32-bit blit with one per-surface alpha value, for arbitrary channel layouts (both formats are swizzled through ARGB); the written alpha is forced to fully set */
1304 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1305 {
1306     /* XXX : 6 */
1307         unsigned alpha = info->src->alpha;
1308     int height = info->d_height;
1309     Uint32 *srcp = (Uint32 *)info->s_pixels;
1310     int srcskip = info->s_skip >> 2; /* >> 2 because the skip is added to a Uint32 pointer */
1311     Uint32 *dstp = (Uint32 *)info->d_pixels;
1312     int dstskip = info->d_skip >> 2;
1313     SDL_PixelFormat *srcfmt = info->src;
1314     SDL_PixelFormat *dstfmt = info->dst;
1315         unsigned sA = srcfmt->alpha; /* same value as alpha above; this name is the one the scalar macro uses */
1316         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1317     vector unsigned char mergePermute;
1318     vector unsigned char vsrcPermute;
1319     vector unsigned char vdstPermute;
1320     vector unsigned char vsdstPermute;
1321     vector unsigned char valpha;
1322     vector unsigned char valphamask;
1323     vector unsigned char vbits;
1324     vector unsigned short v1;
1325     vector unsigned short v8;
1326
1327     mergePermute = VEC_MERGE_PERMUTE();
1328     v1 = vec_splat_u16(1);
1329     v8 = vec_splat_u16(8);
1330
1331     /* set the alpha to 255 on the destination surf */
1332     valphamask = VEC_ALPHA_MASK();
1333
1334     vsrcPermute = calc_swizzle32(srcfmt, NULL); /* source format -> ARGB */
1335     vdstPermute = calc_swizzle32(NULL, dstfmt); /* ARGB -> destination format */
1336     vsdstPermute = calc_swizzle32(dstfmt, NULL); /* destination format -> ARGB */
1337
1338     /* set a vector full of alpha (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
1339     ((unsigned char *)&valpha)[0] = alpha;
1340     valpha = vec_splat(valpha, 0);
1341     vbits = (vector unsigned char)vec_splat_s8(-1);
1342
1343     while(height--) {
1344         int width = info->d_width;
1345 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { /* scalar blend of one pixel per pass with the surface alpha sA */ \
1346             Uint32 Pixel; \
1347             unsigned sR, sG, sB, dR, dG, dB; \
1348             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1349             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1350             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1351             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1352             ++srcp; \
1353             ++dstp; \
1354             widthvar--; \
1355         }
1356         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* head: single pixels until dstp is 16-byte aligned or the row is used up */
1357         if (width > 0) {
1358             int extrawidth = (width % 4); /* tail pixels that do not fill a group of four */
1359             vector unsigned char valigner = VEC_ALIGNER(srcp);
1360             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1361             width -= extrawidth;
1362             while (width) {
1363                 vector unsigned char voverflow;
1364                 vector unsigned char vd;
1365
1366                 /* s = *srcp */
1367                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1368                 vs = vec_perm(vs, voverflow, valigner);
1369                 vs = vec_perm(vs, valpha, vsrcPermute);
1370                 
1371                 /* d = *dstp */
1372                 vd = (vector unsigned char)vec_ld(0, dstp);
1373                 vd = vec_perm(vd, vd, vsdstPermute);
1374
1375                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1376
1377                 /* set the alpha channel to full on */
1378                 vd = vec_or(vd, valphamask);
1379                 vd = vec_perm(vd, vbits, vdstPermute);
1380
1381                 /* *dstp = res */
1382                 vec_st((vector unsigned int)vd, 0, dstp);
1383                 
1384                 srcp += 4;
1385                 dstp += 4;
1386                 width -= 4;
1387                 vs = voverflow; /* the block just loaded becomes the leading half of the next source read */
1388             }
1389             ONE_PIXEL_BLEND((extrawidth), extrawidth); /* tail: the pixels left after the groups of four */
1390         }
1391 #undef ONE_PIXEL_BLEND
1392  
1393         srcp += srcskip;
1394         dstp += dstskip;
1395     }
1396
1397 }
1398
1399
1400 /* fast RGB888->(A)RGB888 blending with one per-surface alpha value; no swizzling, and the top byte of every written pixel is set to 0xff */
1401 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1402 {
1403         unsigned alpha = info->src->alpha;
1404     int height = info->d_height;
1405     Uint32 *srcp = (Uint32 *)info->s_pixels;
1406     int srcskip = info->s_skip >> 2; /* >> 2 because the skip is added to a Uint32 pointer */
1407     Uint32 *dstp = (Uint32 *)info->d_pixels;
1408     int dstskip = info->d_skip >> 2;
1409     vector unsigned char mergePermute;
1410     vector unsigned char valpha;
1411     vector unsigned char valphamask;
1412     vector unsigned short v1;
1413     vector unsigned short v8;
1414
1415     mergePermute = VEC_MERGE_PERMUTE();
1416     v1 = vec_splat_u16(1);
1417     v8 = vec_splat_u16(8);
1418
1419     /* set the alpha to 255 on the destination surf */
1420     valphamask = VEC_ALPHA_MASK();
1421
1422     /* set a vector full of alpha (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
1423     ((unsigned char *)&valpha)[0] = alpha;
1424     valpha = vec_splat(valpha, 0);
1425
1426     while(height--) {
1427         int width = info->d_width;
1428 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { /* scalar path: blend the 0xff00ff pair with one multiply and the 0xff00 channel with another */ \
1429             Uint32 s = *srcp; \
1430             Uint32 d = *dstp; \
1431             Uint32 s1 = s & 0xff00ff; \
1432             Uint32 d1 = d & 0xff00ff; \
1433             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1434                  & 0xff00ff; \
1435             s &= 0xff00; \
1436             d &= 0xff00; \
1437             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1438             *dstp = d1 | d | 0xff000000; \
1439             ++srcp; \
1440             ++dstp; \
1441             widthvar--; \
1442         }
1443         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* head: single pixels until dstp is 16-byte aligned or the row is used up */
1444         if (width > 0) {
1445             int extrawidth = (width % 4); /* tail pixels that do not fill a group of four */
1446             vector unsigned char valigner = VEC_ALIGNER(srcp);
1447             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1448             width -= extrawidth;
1449             while (width) {
1450                 vector unsigned char voverflow;
1451                 vector unsigned char vd;
1452
1453                 /* s = *srcp */
1454                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1455                 vs = vec_perm(vs, voverflow, valigner);
1456                 
1457                 /* d = *dstp */
1458                 vd = (vector unsigned char)vec_ld(0, dstp);
1459
1460                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1461
1462                 /* set the alpha channel to full on */
1463                 vd = vec_or(vd, valphamask);
1464
1465                 /* *dstp = res */
1466                 vec_st((vector unsigned int)vd, 0, dstp);
1467                 
1468                 srcp += 4;
1469                 dstp += 4;
1470                 width -= 4;
1471                 vs = voverflow; /* the block just loaded becomes the leading half of the next source read */
1472             }
1473             ONE_PIXEL_BLEND((extrawidth), extrawidth); /* tail: the pixels left after the groups of four */
1474         }
1475 #undef ONE_PIXEL_BLEND
1476  
1477         srcp += srcskip;
1478         dstp += dstskip;
1479     }
1480 }
1481 #if __MWERKS__
1482 #pragma altivec_model off
1483 #endif
1484 #endif /* SDL_ALTIVEC_BLITTERS */
1485
1486 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1487 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1488 {
1489         int width = info->d_width;
1490         int height = info->d_height;
1491         Uint32 *srcp = (Uint32 *)info->s_pixels;
1492         int srcskip = info->s_skip >> 2;
1493         Uint32 *dstp = (Uint32 *)info->d_pixels;
1494         int dstskip = info->d_skip >> 2;
1495
1496         while(height--) {
1497             DUFFS_LOOP4({
1498                     Uint32 s = *srcp++;
1499                     Uint32 d = *dstp;
1500                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1501                                + (s & d & 0x00010101)) | 0xff000000;
1502             }, width);
1503             srcp += srcskip;
1504             dstp += dstskip;
1505         }
1506 }
1507
1508 /* fast RGB888->(A)RGB888 blending with surface alpha */
1509 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1510 {
1511         unsigned alpha = info->src->alpha;
1512         if(alpha == 128) {
1513                 BlitRGBtoRGBSurfaceAlpha128(info);
1514         } else {
1515                 int width = info->d_width;
1516                 int height = info->d_height;
1517                 Uint32 *srcp = (Uint32 *)info->s_pixels;
1518                 int srcskip = info->s_skip >> 2;
1519                 Uint32 *dstp = (Uint32 *)info->d_pixels;
1520                 int dstskip = info->d_skip >> 2;
1521                 Uint32 s;
1522                 Uint32 d;
1523                 Uint32 s1;
1524                 Uint32 d1;
1525
1526                 while(height--) {
1527                         DUFFS_LOOP_DOUBLE2({
1528                                 /* One Pixel Blend */
1529                                 s = *srcp;
1530                                 d = *dstp;
1531                                 s1 = s & 0xff00ff;
1532                                 d1 = d & 0xff00ff;
1533                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1534                                      & 0xff00ff;
1535                                 s &= 0xff00;
1536                                 d &= 0xff00;
1537                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1538                                 *dstp = d1 | d | 0xff000000;
1539                                 ++srcp;
1540                                 ++dstp;
1541                         },{
1542                                 /* Two Pixels Blend */
1543                                 s = *srcp;
1544                                 d = *dstp;
1545                                 s1 = s & 0xff00ff;
1546                                 d1 = d & 0xff00ff;
1547                                 d1 += (s1 - d1) * alpha >> 8;
1548                                 d1 &= 0xff00ff;
1549                                      
1550                                 s = ((s & 0xff00) >> 8) | 
1551                                         ((srcp[1] & 0xff00) << 8);
1552                                 d = ((d & 0xff00) >> 8) |
1553                                         ((dstp[1] & 0xff00) << 8);
1554                                 d += (s - d) * alpha >> 8;
1555                                 d &= 0x00ff00ff;
1556                                 
1557                                 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1558                                 ++srcp;
1559                                 
1560                                 s1 = *srcp;
1561                                 d1 = *dstp;
1562                                 s1 &= 0xff00ff;
1563                                 d1 &= 0xff00ff;
1564                                 d1 += (s1 - d1) * alpha >> 8;
1565                                 d1 &= 0xff00ff;
1566                                 
1567                                 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1568                                 ++srcp;
1569                                 ++dstp;
1570                         }, width);
1571                         srcp += srcskip;
1572                         dstp += dstskip;
1573                 }
1574         }
1575 }
1576
1577 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1578 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1579 {
1580         int width = info->d_width;
1581         int height = info->d_height;
1582         Uint32 *srcp = (Uint32 *)info->s_pixels;
1583         int srcskip = info->s_skip >> 2;
1584         Uint32 *dstp = (Uint32 *)info->d_pixels;
1585         int dstskip = info->d_skip >> 2;
1586
1587         while(height--) {
1588             DUFFS_LOOP4({
1589                 Uint32 dalpha;
1590                 Uint32 d;
1591                 Uint32 s1;
1592                 Uint32 d1;
1593                 Uint32 s = *srcp;
1594                 Uint32 alpha = s >> 24;
1595                 /* FIXME: Here we special-case opaque alpha since the
1596                    compositioning used (>>8 instead of /255) doesn't handle
1597                    it correctly. Also special-case alpha=0 for speed?
1598                    Benchmark this! */
1599                 if(alpha) {   
1600                   if(alpha == SDL_ALPHA_OPAQUE) {
1601                     *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1602                   } else {
1603                     /*
1604                      * take out the middle component (green), and process
1605                      * the other two in parallel. One multiply less.
1606                      */
1607                     d = *dstp;
1608                     dalpha = d & 0xff000000;
1609                     s1 = s & 0xff00ff;
1610                     d1 = d & 0xff00ff;
1611                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1612                     s &= 0xff00;
1613                     d &= 0xff00;
1614                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1615                     *dstp = d1 | d | dalpha;
1616                   }
1617                 }
1618                 ++srcp;
1619                 ++dstp;
1620             }, width);
1621             srcp += srcskip;
1622             dstp += dstskip;
1623         }
1624 }
1625
1626 #if GCC_ASMBLIT
1627 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1628 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1629 {
1630         int width = info->d_width;
1631         int height = info->d_height;
1632         Uint32 *srcp = (Uint32 *)info->s_pixels;
1633         int srcskip = info->s_skip >> 2;
1634         Uint32 *dstp = (Uint32 *)info->d_pixels;
1635         int dstskip = info->d_skip >> 2;
1636         SDL_PixelFormat* sf = info->src;
1637         Uint32 amask = sf->Amask;
1638
1639         __asm__ (
1640         /* make mm6 all zeros. */
1641         "pxor       %%mm6, %%mm6\n"
1642         
1643         /* Make a mask to preserve the alpha. */
1644         "movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1645         "punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1646         "pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1647         "movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1648         "pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1649
1650         /* form channel masks */
1651         "movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1652         "packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1653         "packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1654         "pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1655         
1656         /* get alpha channel shift */
1657         "movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1658
1659           : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1660
1661         while(height--) {
1662
1663             DUFFS_LOOP4({
1664                 Uint32 alpha;
1665
1666                 __asm__ (
1667                 "prefetch 64(%0)\n"
1668                 "prefetch 64(%1)\n"
1669                         : : "r" (srcp), "r" (dstp) );
1670
1671                 alpha = *srcp & amask;
1672                 /* FIXME: Here we special-case opaque alpha since the
1673                    compositioning used (>>8 instead of /255) doesn't handle
1674                    it correctly. Also special-case alpha=0 for speed?
1675                    Benchmark this! */
1676                 if(alpha == 0) {
1677                     /* do nothing */
1678                 }
1679                 else if(alpha == amask) {
1680                         /* opaque alpha -- copy RGB, keep dst alpha */
1681                     /* using MMX here to free up regular registers for other things */
1682                             __asm__ (
1683                     "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1684                     "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1685                     "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1686                     "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1687                     "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1688                     "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1689
1690                      : : "r" (srcp), "r" (dstp) );
1691                 } 
1692
1693                 else {
1694                             __asm__ (
1695                     /* load in the source, and dst. */
1696                     "movd      (%0), %%mm0\n"               /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1697                     "movd      (%1), %%mm1\n"               /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1698
1699                     /* Move the src alpha into mm2 */
1700
1701                     /* if supporting pshufw */
1702                     /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1703                     /*"psrlw     $8, %%mm2\n" */
1704                     
1705                     /* else: */
1706                     "movd       %2,    %%mm2\n"
1707                     "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1708                     "punpcklwd  %%mm2, %%mm2\n"             /* mm2 = 0 0 0 0 |  0 As  0  As */
1709                     "punpckldq  %%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1710                     "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1711
1712                     /* move the colors into words. */
1713                     "punpcklbw %%mm6, %%mm0\n"              /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1714                     "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1715
1716                     /* src - dst */
1717                     "psubw    %%mm1, %%mm0\n"               /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1718
1719                     /* A * (src-dst) */
1720                     "pmullw    %%mm2, %%mm0\n"              /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1721                     "psrlw     $8,    %%mm0\n"              /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1722                     "paddb     %%mm1, %%mm0\n"              /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1723
1724                     "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1725                     
1726                     "movd      %%mm0, (%1)\n"               /* result in mm0 */
1727
1728                      : : "r" (srcp), "r" (dstp), "r" (alpha) );
1729
1730                 }
1731                 ++srcp;
1732                 ++dstp;
1733             }, width);
1734             srcp += srcskip;
1735             dstp += dstskip;
1736         }
1737
1738         __asm__ (
1739         "emms\n"
1740                 :   );
1741 }
1742 /* End GCC_ASMBLIT*/
1743
1744 #elif MSVC_ASMBLIT
1745 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1746 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1747 {
1748         int width = info->d_width;
1749         int height = info->d_height;
1750         Uint32 *srcp = (Uint32 *)info->s_pixels;
1751         int srcskip = info->s_skip >> 2;
1752         Uint32 *dstp = (Uint32 *)info->d_pixels;
1753         int dstskip = info->d_skip >> 2;
1754         SDL_PixelFormat* sf = info->src;
1755         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1756         Uint32 amask = sf->Amask;
1757         Uint32 ashift = sf->Ashift;
1758         Uint64 multmask;
1759         
1760         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1761
1762         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1763         multmask = ~(0xFFFFi64 << (ashift * 2));
1764         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1765
1766         while(height--) {
1767             DUFFS_LOOP4({
1768                 Uint32 alpha;
1769
1770                 _m_prefetch(srcp + 16);
1771                 _m_prefetch(dstp + 16);
1772
1773                 alpha = *srcp & amask;
1774                 if (alpha == 0) {
1775                         /* do nothing */
1776                 } else if (alpha == amask) {
1777                         /* copy RGB, keep dst alpha */
1778                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1779                 } else {
1780                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1781                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1782
1783                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1784                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1785
1786                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1787                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1788                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1789                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1790                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1791
1792                         /* blend */                 
1793                         src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1794                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1795                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1796                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1797                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1798                         
1799                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1800                 }
1801                 ++srcp;
1802                 ++dstp;
1803             }, width);
1804             srcp += srcskip;
1805             dstp += dstskip;
1806         }
1807         _mm_empty();
1808 }
1809 /* End MSVC_ASMBLIT */
1810
1811 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1812
1813 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1814
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * mask selects every bit except the lowest bit of each colour channel
 * (the callers in this file pass 0xf7de for 565 and 0xfbde for 555).
 * Masking before the add keeps carries from crossing into the next
 * channel; the (s & d & ~mask) term adds back the low bits that were set
 * in both inputs.  Arguments are evaluated more than once.
 */
#define BLEND16_50(d, s, mask)                                          \
        (((((s) & (mask)) + ((d) & (mask))) >> 1) +                     \
         ((s) & (d) & (~(mask) & 0xffff)))

/*
 * The 16 bit mask replicated into both halves of a 32 bit word.  The cast
 * comes before the shift: a Uint16 promotes to (signed) int, and shifting
 * a value such as 0xf7de left by 16 would overflow it.
 */
#define BLEND2x16_MASK(mask)                                            \
        ((Uint32)(mask) | ((Uint32)(mask) << 16))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 * Same scheme as BLEND16_50, but each input is halved before the add so
 * the sum cannot overflow 32 bits.  Arguments are evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)                                        \
        ((((s) & BLEND2x16_MASK(mask)) >> 1) +                          \
         (((d) & BLEND2x16_MASK(mask)) >> 1) +                          \
         ((s) & (d) & ~BLEND2x16_MASK(mask)))
1823
1824 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1825 {
1826         int width = info->d_width;
1827         int height = info->d_height;
1828         Uint16 *srcp = (Uint16 *)info->s_pixels;
1829         int srcskip = info->s_skip >> 1;
1830         Uint16 *dstp = (Uint16 *)info->d_pixels;
1831         int dstskip = info->d_skip >> 1;
1832
1833         while(height--) {
1834                 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1835                         /*
1836                          * Source and destination not aligned, pipeline it.
1837                          * This is mostly a win for big blits but no loss for
1838                          * small ones
1839                          */
1840                         Uint32 prev_sw;
1841                         int w = width;
1842
1843                         /* handle odd destination */
1844                         if((uintptr_t)dstp & 2) {
1845                                 Uint16 d = *dstp, s = *srcp;
1846                                 *dstp = BLEND16_50(d, s, mask);
1847                                 dstp++;
1848                                 srcp++;
1849                                 w--;
1850                         }
1851                         srcp++; /* srcp is now 32-bit aligned */
1852
1853                         /* bootstrap pipeline with first halfword */
1854                         prev_sw = ((Uint32 *)srcp)[-1];
1855
1856                         while(w > 1) {
1857                                 Uint32 sw, dw, s;
1858                                 sw = *(Uint32 *)srcp;
1859                                 dw = *(Uint32 *)dstp;
1860 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1861                                 s = (prev_sw << 16) + (sw >> 16);
1862 #else
1863                                 s = (prev_sw >> 16) + (sw << 16);
1864 #endif
1865                                 prev_sw = sw;
1866                                 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1867                                 dstp += 2;
1868                                 srcp += 2;
1869                                 w -= 2;
1870                         }
1871
1872                         /* final pixel if any */
1873                         if(w) {
1874                                 Uint16 d = *dstp, s;
1875 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1876                                 s = (Uint16)prev_sw;
1877 #else
1878                                 s = (Uint16)(prev_sw >> 16);
1879 #endif
1880                                 *dstp = BLEND16_50(d, s, mask);
1881                                 srcp++;
1882                                 dstp++;
1883                         }
1884                         srcp += srcskip - 1;
1885                         dstp += dstskip;
1886                 } else {
1887                         /* source and destination are aligned */
1888                         int w = width;
1889
1890                         /* first odd pixel? */
1891                         if((uintptr_t)srcp & 2) {
1892                                 Uint16 d = *dstp, s = *srcp;
1893                                 *dstp = BLEND16_50(d, s, mask);
1894                                 srcp++;
1895                                 dstp++;
1896                                 w--;
1897                         }
1898                         /* srcp and dstp are now 32-bit aligned */
1899
1900                         while(w > 1) {
1901                                 Uint32 sw = *(Uint32 *)srcp;
1902                                 Uint32 dw = *(Uint32 *)dstp;
1903                                 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1904                                 srcp += 2;
1905                                 dstp += 2;
1906                                 w -= 2;
1907                         }
1908
1909                         /* last odd pixel? */
1910                         if(w) {
1911                                 Uint16 d = *dstp, s = *srcp;
1912                                 *dstp = BLEND16_50(d, s, mask);
1913                                 srcp++;
1914                                 dstp++;
1915                         }
1916                         srcp += srcskip;
1917                         dstp += dstskip;
1918                 }
1919         }
1920 }
1921
1922 #if GCC_ASMBLIT
1923 /* fast RGB565->RGB565 blending with surface alpha */
1924 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1925 {
1926         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1927         if(alpha == 128) {
1928                 Blit16to16SurfaceAlpha128(info, 0xf7de);
1929         } else {
1930                 int width = info->d_width;
1931                 int height = info->d_height;
1932                 Uint16 *srcp = (Uint16 *)info->s_pixels;
1933                 int srcskip = info->s_skip >> 1;
1934                 Uint16 *dstp = (Uint16 *)info->d_pixels;
1935                 int dstskip = info->d_skip >> 1;
1936                 Uint32 s, d;
1937                 Uint64 load;
1938           
1939                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
1940                 load = alpha;
1941                 alpha >>= 3;            /* downscale alpha to 5 bits */
1942
1943                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1944                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1945                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1946                 /* position alpha to allow for mullo and mulhi on diff channels
1947                    to reduce the number of operations */
1948                 psllq_i2r(3, mm0);
1949           
1950                 /* Setup the 565 color channel masks */
1951                 load = 0x07E007E007E007E0ULL;
1952                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1953                 load = 0x001F001F001F001FULL;
1954                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1955                 while(height--) {
1956                         DUFFS_LOOP_QUATRO2(
1957                         {
1958                                 s = *srcp++;
1959                                 d = *dstp;
1960                                 /*
1961                                  * shift out the middle component (green) to
1962                                  * the high 16 bits, and process all three RGB
1963                                  * components at the same time.
1964                                  */
1965                                 s = (s | s << 16) & 0x07e0f81f;
1966                                 d = (d | d << 16) & 0x07e0f81f;
1967                                 d += (s - d) * alpha >> 5;
1968                                 d &= 0x07e0f81f;
1969                                 *dstp++ = d | d >> 16;
1970                         },{
1971                                 s = *srcp++;
1972                                 d = *dstp;
1973                                 /*
1974                                  * shift out the middle component (green) to
1975                                  * the high 16 bits, and process all three RGB
1976                                  * components at the same time.
1977                                  */
1978                                 s = (s | s << 16) & 0x07e0f81f;
1979                                 d = (d | d << 16) & 0x07e0f81f;
1980                                 d += (s - d) * alpha >> 5;
1981                                 d &= 0x07e0f81f;
1982                                 *dstp++ = d | d >> 16;
1983                                 s = *srcp++;
1984                                 d = *dstp;
1985                                 /*
1986                                  * shift out the middle component (green) to
1987                                  * the high 16 bits, and process all three RGB
1988                                  * components at the same time.
1989                                  */
1990                                 s = (s | s << 16) & 0x07e0f81f;
1991                                 d = (d | d << 16) & 0x07e0f81f;
1992                                 d += (s - d) * alpha >> 5;
1993                                 d &= 0x07e0f81f;
1994                                 *dstp++ = d | d >> 16;
1995                         },{
1996                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1997                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1998
1999                                 /* red -- does not need a mask since the right shift clears
2000                                    the uninteresting bits */
2001                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2002                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2003                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2004                                 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2005
2006                                 /* blend */
2007                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2008                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2009                                 /* alpha used is actually 11 bits
2010                                    11 + 5 = 16 bits, so the sign bits are lost */
2011                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2012                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2013                                 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2014
2015                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2016
2017                                 /* green -- process the bits in place */
2018                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2019                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2020                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2021                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2022
2023                                 /* blend */
2024                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2025                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2026                                 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2027                                    bits are gone and the sign bits present */
2028                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2029                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2030
2031                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2032
2033                                 /* blue */
2034                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2035                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2036                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2037                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2038
2039                                 /* blend */
2040                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2041                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2042                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2043                                    the interesting bits will need to be MASKed */
2044                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2045                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2046                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2047
2048                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2049
2050                                 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2051
2052                                 srcp += 4;
2053                                 dstp += 4;
2054                         }, width);                      
2055                         srcp += srcskip;
2056                         dstp += dstskip;
2057                 }
2058                 emms();
2059         }
2060 }
2061
2062 /* fast RGB555->RGB555 blending with surface alpha */
2063 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2064 {
2065         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2066         if(alpha == 128) {
2067                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2068         } else {
2069                 int width = info->d_width;
2070                 int height = info->d_height;
2071                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2072                 int srcskip = info->s_skip >> 1;
2073                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2074                 int dstskip = info->d_skip >> 1;
2075                 Uint32 s, d;
2076                 Uint64 load;
2077           
2078                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2079                 load = alpha;
2080                 alpha >>= 3;            /* downscale alpha to 5 bits */
2081
2082                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2083                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2084                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2085                 /* position alpha to allow for mullo and mulhi on diff channels
2086                    to reduce the number of operations */
2087                 psllq_i2r(3, mm0);
2088
2089                 /* Setup the 555 color channel masks */
2090                 load = 0x03E003E003E003E0ULL;
2091                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2092                 load = 0x001F001F001F001FULL;
2093                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2094                 while(height--) {
2095                         DUFFS_LOOP_QUATRO2(
2096                         {
2097                                 s = *srcp++;
2098                                 d = *dstp;
2099                                 /*
2100                                  * shift out the middle component (green) to
2101                                  * the high 16 bits, and process all three RGB
2102                                  * components at the same time.
2103                                  */
2104                                 s = (s | s << 16) & 0x03e07c1f;
2105                                 d = (d | d << 16) & 0x03e07c1f;
2106                                 d += (s - d) * alpha >> 5;
2107                                 d &= 0x03e07c1f;
2108                                 *dstp++ = d | d >> 16;
2109                         },{
2110                                 s = *srcp++;
2111                                 d = *dstp;
2112                                 /*
2113                                  * shift out the middle component (green) to
2114                                  * the high 16 bits, and process all three RGB
2115                                  * components at the same time.
2116                                  */
2117                                 s = (s | s << 16) & 0x03e07c1f;
2118                                 d = (d | d << 16) & 0x03e07c1f;
2119                                 d += (s - d) * alpha >> 5;
2120                                 d &= 0x03e07c1f;
2121                                 *dstp++ = d | d >> 16;
2122                                 s = *srcp++;
2123                                 d = *dstp;
2124                                 /*
2125                                  * shift out the middle component (green) to
2126                                  * the high 16 bits, and process all three RGB
2127                                  * components at the same time.
2128                                  */
2129                                 s = (s | s << 16) & 0x03e07c1f;
2130                                 d = (d | d << 16) & 0x03e07c1f;
2131                                 d += (s - d) * alpha >> 5;
2132                                 d &= 0x03e07c1f;
2133                                 *dstp++ = d | d >> 16;
2134                         },{
2135                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2136                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2137
2138                                 /* red -- process the bits in place */
2139                                 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2140                                         /* by reusing the GREEN mask we free up another mmx
2141                                            register to accumulate the result */
2142
2143                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2144                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2145                                 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2146                                 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2147
2148                                 /* blend */
2149                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2150                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2151                                 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2152                                    cleared by a MASK below */
2153                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2154                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2155                                 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2156
2157                                 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2158
2159                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2160
2161                                 /* green -- process the bits in place */
2162                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2163                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2164                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2165                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2166
2167                                 /* blend */
2168                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2169                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2170                                 /* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2171                                    bits are gone and the sign bits present */
2172                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2173                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2174
2175                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2176
2177                                 /* blue */
2178                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2179                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2180                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2181                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2182
2183                                 /* blend */
2184                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2185                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2186                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2187                                    the interesting bits will need to be MASKed */
2188                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2189                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2190                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2191
2192                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2193
2194                                 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2195
2196                                 srcp += 4;
2197                                 dstp += 4;
2198                         }, width);                      
2199                         srcp += srcskip;
2200                         dstp += dstskip;
2201                 }
2202                 emms();
2203         }
2204 }
2205 /* End GCC_ASMBLIT */
2206
2207 #elif MSVC_ASMBLIT
2208 /* fast RGB565->RGB565 blending with surface alpha */
2209 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2210 {
2211         unsigned alpha = info->src->alpha;
2212         if(alpha == 128) {
2213                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2214         } else {
2215                 int width = info->d_width;
2216                 int height = info->d_height;
2217                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2218                 int srcskip = info->s_skip >> 1;
2219                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2220                 int dstskip = info->d_skip >> 1;
2221                 Uint32 s, d;
2222           
2223                 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2224
2225                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2226                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2227                 alpha >>= 3;            /* downscale alpha to 5 bits */
2228
2229                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2230                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2231                 /* position alpha to allow for mullo and mulhi on diff channels
2232                    to reduce the number of operations */
2233                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2234           
2235                 /* Setup the 565 color channel masks */
2236                 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2237                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2238                 
2239                 while(height--) {
2240                         DUFFS_LOOP_QUATRO2(
2241                         {
2242                                 s = *srcp++;
2243                                 d = *dstp;
2244                                 /*
2245                                  * shift out the middle component (green) to
2246                                  * the high 16 bits, and process all three RGB
2247                                  * components at the same time.
2248                                  */
2249                                 s = (s | s << 16) & 0x07e0f81f;
2250                                 d = (d | d << 16) & 0x07e0f81f;
2251                                 d += (s - d) * alpha >> 5;
2252                                 d &= 0x07e0f81f;
2253                                 *dstp++ = (Uint16)(d | d >> 16);
2254                         },{
2255                                 s = *srcp++;
2256                                 d = *dstp;
2257                                 /*
2258                                  * shift out the middle component (green) to
2259                                  * the high 16 bits, and process all three RGB
2260                                  * components at the same time.
2261                                  */
2262                                 s = (s | s << 16) & 0x07e0f81f;
2263                                 d = (d | d << 16) & 0x07e0f81f;
2264                                 d += (s - d) * alpha >> 5;
2265                                 d &= 0x07e0f81f;
2266                                 *dstp++ = (Uint16)(d | d >> 16);
2267                                 s = *srcp++;
2268                                 d = *dstp;
2269                                 /*
2270                                  * shift out the middle component (green) to
2271                                  * the high 16 bits, and process all three RGB
2272                                  * components at the same time.
2273                                  */
2274                                 s = (s | s << 16) & 0x07e0f81f;
2275                                 d = (d | d << 16) & 0x07e0f81f;
2276                                 d += (s - d) * alpha >> 5;
2277                                 d &= 0x07e0f81f;
2278                                 *dstp++ = (Uint16)(d | d >> 16);
2279                         },{
2280                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2281                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2282
2283                                 /* red */
2284                                 src2 = src1;
2285                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2286
2287                                 dst2 = dst1;
2288                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2289
2290                                 /* blend */
2291                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2292                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2293                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2294                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2295                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2296
2297                                 mm_res = dst2; /* RED -> mm_res */
2298
2299                                 /* green -- process the bits in place */
2300                                 src2 = src1;
2301                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2302
2303                                 dst2 = dst1;
2304                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2305
2306                                 /* blend */
2307                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2308                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2309                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2310                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2311
2312                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2313
2314                                 /* blue */
2315                                 src2 = src1;
2316                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2317
2318                                 dst2 = dst1;
2319                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2320
2321                                 /* blend */
2322                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2323                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2324                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2325                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2326                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2327
2328                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2329
2330                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2331
2332                                 srcp += 4;
2333                                 dstp += 4;
2334                         }, width);                      
2335                         srcp += srcskip;
2336                         dstp += dstskip;
2337                 }
2338                 _mm_empty();
2339         }
2340 }
2341
2342 /* fast RGB555->RGB555 blending with surface alpha */
2343 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2344 {
2345         unsigned alpha = info->src->alpha;
2346         if(alpha == 128) {
2347                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2348         } else {
2349                 int width = info->d_width;
2350                 int height = info->d_height;
2351                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2352                 int srcskip = info->s_skip >> 1;
2353                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2354                 int dstskip = info->d_skip >> 1;
2355                 Uint32 s, d;
2356           
2357                 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2358
2359                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2360                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2361                 alpha >>= 3;            /* downscale alpha to 5 bits */
2362
2363                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2364                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2365                 /* position alpha to allow for mullo and mulhi on diff channels
2366                    to reduce the number of operations */
2367                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2368           
2369                 /* Setup the 555 color channel masks */
2370                 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2371                 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2372                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2373
2374                 while(height--) {
2375                         DUFFS_LOOP_QUATRO2(
2376                         {
2377                                 s = *srcp++;
2378                                 d = *dstp;
2379                                 /*
2380                                  * shift out the middle component (green) to
2381                                  * the high 16 bits, and process all three RGB
2382                                  * components at the same time.
2383                                  */
2384                                 s = (s | s << 16) & 0x03e07c1f;
2385                                 d = (d | d << 16) & 0x03e07c1f;
2386                                 d += (s - d) * alpha >> 5;
2387                                 d &= 0x03e07c1f;
2388                                 *dstp++ = (Uint16)(d | d >> 16);
2389                         },{
2390                                 s = *srcp++;
2391                                 d = *dstp;
2392                                 /*
2393                                  * shift out the middle component (green) to
2394                                  * the high 16 bits, and process all three RGB
2395                                  * components at the same time.
2396                                  */
2397                                 s = (s | s << 16) & 0x03e07c1f;
2398                                 d = (d | d << 16) & 0x03e07c1f;
2399                                 d += (s - d) * alpha >> 5;
2400                                 d &= 0x03e07c1f;
2401                                 *dstp++ = (Uint16)(d | d >> 16);
2402                                 s = *srcp++;
2403                                 d = *dstp;
2404                                 /*
2405                                  * shift out the middle component (green) to
2406                                  * the high 16 bits, and process all three RGB
2407                                  * components at the same time.
2408                                  */
2409                                 s = (s | s << 16) & 0x03e07c1f;
2410                                 d = (d | d << 16) & 0x03e07c1f;
2411                                 d += (s - d) * alpha >> 5;
2412                                 d &= 0x03e07c1f;
2413                                 *dstp++ = (Uint16)(d | d >> 16);
2414                         },{
2415                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2416                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2417
2418                                 /* red -- process the bits in place */
2419                                 src2 = src1;
2420                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2421
2422                                 dst2 = dst1;
2423                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2424
2425                                 /* blend */
2426                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2427                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2428                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2429                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2430                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2431
2432                                 mm_res = dst2; /* RED -> mm_res */
2433                                 
2434                                 /* green -- process the bits in place */
2435                                 src2 = src1;
2436                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2437
2438                                 dst2 = dst1;
2439                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2440
2441                                 /* blend */
2442                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2443                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2444                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2445                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2446
2447                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2448
2449                                 /* blue */
2450                                 src2 = src1; /* src -> src2 */
2451                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2452
2453                                 dst2 = dst1; /* dst -> dst2 */
2454                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2455
2456                                 /* blend */
2457                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2458                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2459                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2460                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2461                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2462
2463                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2464
2465                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2466
2467                                 srcp += 4;
2468                                 dstp += 4;
2469                         }, width);                      
2470                         srcp += srcskip;
2471                         dstp += dstskip;
2472                 }
2473                 _mm_empty();
2474         }
2475 }
2476 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2477
2478 /* fast RGB565->RGB565 blending with surface alpha */
2479 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2480 {
2481         unsigned alpha = info->src->alpha;
2482         if(alpha == 128) {
2483                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2484         } else {
2485                 int width = info->d_width;
2486                 int height = info->d_height;
2487                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2488                 int srcskip = info->s_skip >> 1;
2489                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2490                 int dstskip = info->d_skip >> 1;
2491                 alpha >>= 3;    /* downscale alpha to 5 bits */
2492
2493                 while(height--) {
2494                         DUFFS_LOOP4({
2495                                 Uint32 s = *srcp++;
2496                                 Uint32 d = *dstp;
2497                                 /*
2498                                  * shift out the middle component (green) to
2499                                  * the high 16 bits, and process all three RGB
2500                                  * components at the same time.
2501                                  */
2502                                 s = (s | s << 16) & 0x07e0f81f;
2503                                 d = (d | d << 16) & 0x07e0f81f;
2504                                 d += (s - d) * alpha >> 5;
2505                                 d &= 0x07e0f81f;
2506                                 *dstp++ = (Uint16)(d | d >> 16);
2507                         }, width);
2508                         srcp += srcskip;
2509                         dstp += dstskip;
2510                 }
2511         }
2512 }
2513
2514 /* fast RGB555->RGB555 blending with surface alpha */
2515 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2516 {
2517         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2518         if(alpha == 128) {
2519                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2520         } else {
2521                 int width = info->d_width;
2522                 int height = info->d_height;
2523                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2524                 int srcskip = info->s_skip >> 1;
2525                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2526                 int dstskip = info->d_skip >> 1;
2527                 alpha >>= 3;            /* downscale alpha to 5 bits */
2528
2529                 while(height--) {
2530                         DUFFS_LOOP4({
2531                                 Uint32 s = *srcp++;
2532                                 Uint32 d = *dstp;
2533                                 /*
2534                                  * shift out the middle component (green) to
2535                                  * the high 16 bits, and process all three RGB
2536                                  * components at the same time.
2537                                  */
2538                                 s = (s | s << 16) & 0x03e07c1f;
2539                                 d = (d | d << 16) & 0x03e07c1f;
2540                                 d += (s - d) * alpha >> 5;
2541                                 d &= 0x03e07c1f;
2542                                 *dstp++ = (Uint16)(d | d >> 16);
2543                         }, width);
2544                         srcp += srcskip;
2545                         dstp += dstskip;
2546                 }
2547         }
2548 }
2549
2550 /* fast ARGB8888->RGB565 blending with pixel alpha */
2551 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2552 {
2553         int width = info->d_width;
2554         int height = info->d_height;
2555         Uint32 *srcp = (Uint32 *)info->s_pixels;
2556         int srcskip = info->s_skip >> 2;
2557         Uint16 *dstp = (Uint16 *)info->d_pixels;
2558         int dstskip = info->d_skip >> 1;
2559
2560         while(height--) {
2561             DUFFS_LOOP4({
2562                 Uint32 s = *srcp;
2563                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2564                 /* FIXME: Here we special-case opaque alpha since the
2565                    compositioning used (>>8 instead of /255) doesn't handle
2566                    it correctly. Also special-case alpha=0 for speed?
2567                    Benchmark this! */
2568                 if(alpha) {   
2569                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2570                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2571                   } else {
2572                     Uint32 d = *dstp;
2573                     /*
2574                      * convert source and destination to G0RAB65565
2575                      * and blend all components at the same time
2576                      */
2577                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2578                       + (s >> 3 & 0x1f);
2579                     d = (d | d << 16) & 0x07e0f81f;
2580                     d += (s - d) * alpha >> 5;
2581                     d &= 0x07e0f81f;
2582                     *dstp = (Uint16)(d | d >> 16);
2583                   }
2584                 }
2585                 srcp++;
2586                 dstp++;
2587             }, width);
2588             srcp += srcskip;
2589             dstp += dstskip;
2590         }
2591 }
2592
2593 /* fast ARGB8888->RGB555 blending with pixel alpha */
2594 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2595 {
2596         int width = info->d_width;
2597         int height = info->d_height;
2598         Uint32 *srcp = (Uint32 *)info->s_pixels;
2599         int srcskip = info->s_skip >> 2;
2600         Uint16 *dstp = (Uint16 *)info->d_pixels;
2601         int dstskip = info->d_skip >> 1;
2602
2603         while(height--) {
2604             DUFFS_LOOP4({
2605                 unsigned alpha;
2606                 Uint32 s = *srcp;
2607                 alpha = s >> 27; /* downscale alpha to 5 bits */
2608                 /* FIXME: Here we special-case opaque alpha since the
2609                    compositioning used (>>8 instead of /255) doesn't handle
2610                    it correctly. Also special-case alpha=0 for speed?
2611                    Benchmark this! */
2612                 if(alpha) {   
2613                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2614                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2615                   } else {
2616                     Uint32 d = *dstp;
2617                     /*
2618                      * convert source and destination to G0RAB65565
2619                      * and blend all components at the same time
2620                      */
2621                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2622                       + (s >> 3 & 0x1f);
2623                     d = (d | d << 16) & 0x03e07c1f;
2624                     d += (s - d) * alpha >> 5;
2625                     d &= 0x03e07c1f;
2626                     *dstp = (Uint16)(d | d >> 16);
2627                   }
2628                 }
2629                 srcp++;
2630                 dstp++;
2631             }, width);
2632             srcp += srcskip;
2633             dstp += dstskip;
2634         }
2635 }
2636
2637 /* General (slow) N->N blending with per-surface alpha */
2638 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2639 {
2640         int width = info->d_width;
2641         int height = info->d_height;
2642         Uint8 *src = info->s_pixels;
2643         int srcskip = info->s_skip;
2644         Uint8 *dst = info->d_pixels;
2645         int dstskip = info->d_skip;
2646         SDL_PixelFormat *srcfmt = info->src;
2647         SDL_PixelFormat *dstfmt = info->dst;
2648         int srcbpp = srcfmt->BytesPerPixel;
2649         int dstbpp = dstfmt->BytesPerPixel;
2650         unsigned sA = srcfmt->alpha;
2651         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2652
2653         if(sA) {
2654           while ( height-- ) {
2655             DUFFS_LOOP4(
2656             {
2657                 Uint32 Pixel;
2658                 unsigned sR;
2659                 unsigned sG;
2660                 unsigned sB;
2661                 unsigned dR;
2662                 unsigned dG;
2663                 unsigned dB;
2664                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2665                 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2666                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2667                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2668                 src += srcbpp;
2669                 dst += dstbpp;
2670             },
2671             width);
2672             src += srcskip;
2673             dst += dstskip;
2674           }
2675         }
2676 }
2677
2678 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2679 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2680 {
2681         int width = info->d_width;
2682         int height = info->d_height;
2683         Uint8 *src = info->s_pixels;
2684         int srcskip = info->s_skip;
2685         Uint8 *dst = info->d_pixels;
2686         int dstskip = info->d_skip;
2687         SDL_PixelFormat *srcfmt = info->src;
2688         SDL_PixelFormat *dstfmt = info->dst;
2689         Uint32 ckey = srcfmt->colorkey;
2690         int srcbpp = srcfmt->BytesPerPixel;
2691         int dstbpp = dstfmt->BytesPerPixel;
2692         unsigned sA = srcfmt->alpha;
2693         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2694
2695         if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2696             Uint16 *src16 = (Uint16 *)src;
2697             Uint16 *dst16 = (Uint16 *)dst;
2698             sA >>= 3;   /* downscale alpha to 5 bits */
2699             while ( height-- ) {
2700                 DUFFS_LOOP4(
2701                 {
2702                     Uint32 s;
2703                     Uint32 d;
2704                     s = *src16;
2705                     if(sA && s != ckey) {
2706                         d = *dst16;
2707                         s = (s | s << 16) & 0x07e0f81f;
2708                         d = (d | d << 16) & 0x07e0f81f;
2709                         d += (s - d) * sA >> 5;
2710                         d &= 0x07e0f81f;
2711                         *dst16 = (Uint16)(d | d >> 16);
2712                     }
2713                     src16++;
2714                     dst16++;
2715                 },
2716                 width);
2717                 src16 += srcskip / 2;
2718                 dst16 += dstskip / 2;
2719             }
2720             return;
2721         }
2722
2723         while ( height-- ) {
2724             DUFFS_LOOP4(
2725             {
2726                 Uint32 Pixel;
2727                 unsigned sR;
2728                 unsigned sG;
2729                 unsigned sB;
2730                 unsigned dR;
2731                 unsigned dG;
2732                 unsigned dB;
2733                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2734                 if(sA && Pixel != ckey) {
2735                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2736                     DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2737                     ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2738                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2739                 }
2740                 src += srcbpp;
2741                 dst += dstbpp;
2742             },
2743             width);
2744             src += srcskip;
2745             dst += dstskip;
2746         }
2747 }
2748
2749 /* General (slow) N->N blending with pixel alpha */
2750 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2751 {
2752         int width = info->d_width;
2753         int height = info->d_height;
2754         Uint8 *src = info->s_pixels;
2755         int srcskip = info->s_skip;
2756         Uint8 *dst = info->d_pixels;
2757         int dstskip = info->d_skip;
2758         SDL_PixelFormat *srcfmt = info->src;
2759         SDL_PixelFormat *dstfmt = info->dst;
2760
2761         int  srcbpp;
2762         int  dstbpp;
2763
2764         /* Set up some basic variables */
2765         srcbpp = srcfmt->BytesPerPixel;
2766         dstbpp = dstfmt->BytesPerPixel;
2767
2768         /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2769            quite right. for <8bpp source alpha, it gets them very wrong
2770            (check all macros!)
2771            It is unclear whether there is a good general solution that doesn't
2772            need a branch (or a divide). */
2773         while ( height-- ) {
2774             DUFFS_LOOP4(
2775             {
2776                 Uint32 Pixel;
2777                 unsigned sR;
2778                 unsigned sG;
2779                 unsigned sB;
2780                 unsigned dR;
2781                 unsigned dG;
2782                 unsigned dB;
2783                 unsigned sA;
2784                 unsigned dA;
2785                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2786                 if(sA) {
2787                   DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2788                   ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2789                   ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2790                 }
2791                 src += srcbpp;
2792                 dst += dstbpp;
2793             },
2794             width);
2795             src += srcskip;
2796             dst += dstskip;
2797         }
2798 }
2799
2800
2801 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2802 {
2803     SDL_PixelFormat *sf = surface->format;
2804     SDL_PixelFormat *df = surface->map->dst->format;
2805
2806     if(sf->Amask == 0) {
2807         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2808             if(df->BytesPerPixel == 1)
2809                 return BlitNto1SurfaceAlphaKey;
2810             else
2811 #if SDL_ALTIVEC_BLITTERS
2812         if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2813             !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2814             return Blit32to32SurfaceAlphaKeyAltivec;
2815         else
2816 #endif
2817             return BlitNtoNSurfaceAlphaKey;
2818         } else {
2819             /* Per-surface alpha blits */
2820             switch(df->BytesPerPixel) {
2821             case 1:
2822                 return BlitNto1SurfaceAlpha;
2823
2824             case 2:
2825                 if(surface->map->identity) {
2826                     if(df->Gmask == 0x7e0)
2827                     {
2828 #if MMX_ASMBLIT
2829                 if(SDL_HasMMX())
2830                         return Blit565to565SurfaceAlphaMMX;
2831                 else
2832 #endif
2833                         return Blit565to565SurfaceAlpha;
2834                     }
2835                     else if(df->Gmask == 0x3e0)
2836                     {
2837 #if MMX_ASMBLIT
2838                 if(SDL_HasMMX())
2839                         return Blit555to555SurfaceAlphaMMX;
2840                 else
2841 #endif
2842                         return Blit555to555SurfaceAlpha;
2843                     }
2844                 }
2845                 return BlitNtoNSurfaceAlpha;
2846
2847             case 4:
2848                 if(sf->Rmask == df->Rmask
2849                    && sf->Gmask == df->Gmask
2850                    && sf->Bmask == df->Bmask
2851                    && sf->BytesPerPixel == 4)
2852                 {
2853 #if MMX_ASMBLIT
2854                         if(sf->Rshift % 8 == 0
2855                            && sf->Gshift % 8 == 0
2856                            && sf->Bshift % 8 == 0
2857                            && SDL_HasMMX())
2858                             return BlitRGBtoRGBSurfaceAlphaMMX;
2859 #endif
2860 #ifdef __ARM_NEON__
2861                         if(sf->Rshift % 8 == 0
2862                            && sf->Gshift % 8 == 0
2863                            && sf->Bshift % 8 == 0)
2864                         {
2865                                 return BlitARGBtoXRGBalphaS_neon;
2866                         }
2867 #endif
2868                         if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2869                         {
2870 #if SDL_ALTIVEC_BLITTERS
2871                                 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2872                                         && SDL_HasAltiVec())
2873                                         return BlitRGBtoRGBSurfaceAlphaAltivec;
2874 #endif
2875                                 return BlitRGBtoRGBSurfaceAlpha;
2876                         }
2877                 }
2878 #ifdef __ARM_NEON__
2879                 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2880                     && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2881                 {
2882                         return BlitABGRtoXRGBalphaS_neon;
2883                 }
2884 #endif
2885 #if SDL_ALTIVEC_BLITTERS
2886                 if((sf->BytesPerPixel == 4) &&
2887                    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2888                         return Blit32to32SurfaceAlphaAltivec;
2889                 else
2890 #endif
2891                         return BlitNtoNSurfaceAlpha;
2892
2893             case 3:
2894             default:
2895                 return BlitNtoNSurfaceAlpha;
2896             }
2897         }
2898     } else {
2899         /* Per-pixel alpha blits */
2900         switch(df->BytesPerPixel) {
2901         case 1:
2902             return BlitNto1PixelAlpha;
2903
2904         case 2:
2905 #if SDL_ALTIVEC_BLITTERS
2906         if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2907            df->Gmask == 0x7e0 &&
2908            df->Bmask == 0x1f && SDL_HasAltiVec())
2909             return Blit32to565PixelAlphaAltivec;
2910         else
2911 #endif
2912 #ifdef __ARM_NEON__
2913             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2914                && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2915                 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2916                     return BlitARGBtoRGB565alpha_neon;
2917                 else
2918                     return BlitABGRtoRGB565alpha_neon;
2919             }
2920             else
2921 #endif
2922             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2923                && sf->Gmask == 0xff00
2924                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2925                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2926                 if(df->Gmask == 0x7e0)
2927                     return BlitARGBto565PixelAlpha;
2928                 else if(df->Gmask == 0x3e0)
2929                     return BlitARGBto555PixelAlpha;
2930             }
2931             return BlitNtoNPixelAlpha;
2932
2933         case 4:
2934             if(sf->Rmask == df->Rmask
2935                && sf->Gmask == df->Gmask
2936                && sf->Bmask == df->Bmask
2937                && sf->BytesPerPixel == 4)
2938             {
2939 #if MMX_ASMBLIT
2940                 if(sf->Rshift % 8 == 0
2941                    && sf->Gshift % 8 == 0
2942                    && sf->Bshift % 8 == 0
2943                    && sf->Ashift % 8 == 0
2944                    && sf->Aloss == 0)
2945                 {
2946                         if(SDL_Has3DNow())
2947                                 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2948                         if(SDL_HasMMX())
2949                                 return BlitRGBtoRGBPixelAlphaMMX;
2950                 }
2951 #endif
2952 #ifdef __ARM_NEON__
2953                 if(sf->Rshift % 8 == 0
2954                    && sf->Gshift % 8 == 0
2955                    && sf->Bshift % 8 == 0
2956                    && sf->Ashift % 8 == 0)
2957                 {
2958                         return BlitARGBtoXRGBalpha_neon;
2959                 }
2960 #endif
2961                 if(sf->Amask == 0xff000000)
2962                 {
2963 #if SDL_ALTIVEC_BLITTERS
2964                         if(!(surface->map->dst->flags & SDL_HWSURFACE)
2965                                 && SDL_HasAltiVec())
2966                                 return BlitRGBtoRGBPixelAlphaAltivec;
2967 #endif
2968                         return BlitRGBtoRGBPixelAlpha;
2969                 }
2970             }
2971 #ifdef __ARM_NEON__
2972             if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2973                 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2974                 && sf->Amask == 0xff000000)
2975             {
2976                 return BlitABGRtoXRGBalpha_neon;
2977             }
2978 #endif
2979 #if SDL_ALTIVEC_BLITTERS
2980             if (sf->Amask && sf->BytesPerPixel == 4 &&
2981                 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2982                 return Blit32to32PixelAlphaAltivec;
2983             else
2984 #endif
2985                 return BlitNtoNPixelAlpha;
2986
2987         case 3:
2988         default:
2989             return BlitNtoNPixelAlpha;
2990         }
2991     }
2992 }
2993