NEONize a few more blit types
[sdl_omap.git] / src / video / SDL_blit_A.c
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2009 Sam Lantinga
4
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 #include "SDL_video.h"
25 #include "SDL_blit.h"
26
27 /*
28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29    Checking if _mm_free is #defined in malloc.h is the only way to
30    determine if the Processor Pack is installed, as far as I can tell.
31 */
32
/*
 * Pick the MMX blitter flavour compiled below (only when
 * SDL_ASSEMBLY_ROUTINES is enabled):
 *   - GCC targeting i386/x86_64      -> MMX_ASMBLIT + GCC_ASMBLIT
 *   - MSVC targeting x86 (_M_IX86)   -> MMX_ASMBLIT + MSVC_ASMBLIT, but only
 *     when HAVE_MMINTRIN_H ends up set.  For _MSC_VER <= 1200 that depends on
 *     _mm_free being defined after including <malloc.h>; newer compilers set
 *     it unconditionally.
 */
33 #if SDL_ASSEMBLY_ROUTINES
34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 #    define MMX_ASMBLIT 1
36 #    define GCC_ASMBLIT 1
37 #  elif defined(_MSC_VER) && defined(_M_IX86)
38 #    if (_MSC_VER <= 1200)  
39 #      include <malloc.h>   
40 #      if defined(_mm_free)
41 #          define HAVE_MMINTRIN_H 1
42 #      endif
43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
44 #      define HAVE_MMINTRIN_H 1
45 #    endif
46 #    if HAVE_MMINTRIN_H
47 #      define MMX_ASMBLIT 1
48 #      define MSVC_ASMBLIT 1
49 #    endif
50 #  endif
51 #endif /* SDL_ASSEMBLY_ROUTINES */
52
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
55 #if GCC_ASMBLIT
56 #include "mmx.h"
57 #elif MSVC_ASMBLIT
58 #include <mmintrin.h>
59 #include <mm3dnow.h>
60 #endif
61
62 /* Functions to perform alpha blended blitting */
63
64 #ifdef __ARM_NEON__
65
66 /* NEON optimized blitter callers */
/*
 * make_neon_caller(name, neon_name)
 *
 * Declares the external row routine `neon_name(dst, src, count)` and defines
 * the static blit entry point `name`.  `name` calls the row routine once per
 * destination row with the row's pixel count, then advances both pointers to
 * the next row.  Source rows are stepped as 4 bytes per pixel plus s_skip;
 * destination rows use the destination format's BytesPerPixel plus d_skip.
 */
#define make_neon_caller(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count); \
static void name(SDL_BlitInfo *info) \
{ \
        const int row_pixels = info->d_width; \
        const int src_pitch = row_pixels * 4 + info->s_skip; \
        const int dst_pitch = row_pixels * info->dst->BytesPerPixel + info->d_skip; \
        Uint8 *src_row = info->s_pixels; \
        Uint8 *dst_row = info->d_pixels; \
        int rows_left; \
\
        for ( rows_left = info->d_height; rows_left != 0; --rows_left ) { \
            neon_name(dst_row, src_row, row_pixels); \
            src_row += src_pitch; \
            dst_row += dst_pitch; \
        } \
}
85
/*
 * make_neon_callerS(name, neon_name)
 *
 * Same idea as the plain caller generator, for row routines that take a
 * fourth argument: the per-surface alpha read from info->src->alpha.
 * Both source and destination rows are stepped as 4 bytes per pixel plus the
 * respective skip value.
 */
#define make_neon_callerS(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
static void name(SDL_BlitInfo *info) \
{ \
        const int row_pixels = info->d_width; \
        const int src_pitch = row_pixels * 4 + info->s_skip; \
        const int dst_pitch = row_pixels * 4 + info->d_skip; \
        const unsigned surface_alpha = info->src->alpha; \
        Uint8 *src_row = info->s_pixels; \
        Uint8 *dst_row = info->d_pixels; \
        int rows_left; \
\
        for ( rows_left = info->d_height; rows_left != 0; --rows_left ) { \
            neon_name(dst_row, src_row, row_pixels, surface_alpha); \
            src_row += src_pitch; \
            dst_row += dst_pitch; \
        } \
}
104
/*
 * Instantiate the static blit wrappers.  Each line defines the function named
 * by the first argument and declares the external row routine named by the
 * second one.  The "S" variants additionally pass the per-surface alpha.
 */
105 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
106 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
107 make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
108 make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
109 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
110 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
111
112 #endif /* __ARM_NEON__ */
113
114 /* N->1 blending with per-surface alpha */
115 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
116 {
117         int width = info->d_width;
118         int height = info->d_height;
119         Uint8 *src = info->s_pixels;
120         int srcskip = info->s_skip;
121         Uint8 *dst = info->d_pixels;
122         int dstskip = info->d_skip;
123         Uint8 *palmap = info->table;
124         SDL_PixelFormat *srcfmt = info->src;
125         SDL_PixelFormat *dstfmt = info->dst;
126         int srcbpp = srcfmt->BytesPerPixel;
127
128         const unsigned A = srcfmt->alpha;
129
130         while ( height-- ) {
131             DUFFS_LOOP4(
132             {
133                 Uint32 Pixel;
134                 unsigned sR;
135                 unsigned sG;
136                 unsigned sB;
137                 unsigned dR;
138                 unsigned dG;
139                 unsigned dB;
140                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
141                 dR = dstfmt->palette->colors[*dst].r;
142                 dG = dstfmt->palette->colors[*dst].g;
143                 dB = dstfmt->palette->colors[*dst].b;
144                 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
145                 dR &= 0xff;
146                 dG &= 0xff;
147                 dB &= 0xff;
148                 /* Pack RGB into 8bit pixel */
149                 if ( palmap == NULL ) {
150                     *dst =((dR>>5)<<(3+2))|
151                           ((dG>>5)<<(2))|
152                           ((dB>>6)<<(0));
153                 } else {
154                     *dst = palmap[((dR>>5)<<(3+2))|
155                                   ((dG>>5)<<(2))  |
156                                   ((dB>>6)<<(0))];
157                 }
158                 dst++;
159                 src += srcbpp;
160             },
161             width);
162             src += srcskip;
163             dst += dstskip;
164         }
165 }
166
167 /* N->1 blending with pixel alpha */
168 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
169 {
170         int width = info->d_width;
171         int height = info->d_height;
172         Uint8 *src = info->s_pixels;
173         int srcskip = info->s_skip;
174         Uint8 *dst = info->d_pixels;
175         int dstskip = info->d_skip;
176         Uint8 *palmap = info->table;
177         SDL_PixelFormat *srcfmt = info->src;
178         SDL_PixelFormat *dstfmt = info->dst;
179         int srcbpp = srcfmt->BytesPerPixel;
180
181         /* FIXME: fix alpha bit field expansion here too? */
182         while ( height-- ) {
183             DUFFS_LOOP4(
184             {
185                 Uint32 Pixel;
186                 unsigned sR;
187                 unsigned sG;
188                 unsigned sB;
189                 unsigned sA;
190                 unsigned dR;
191                 unsigned dG;
192                 unsigned dB;
193                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
194                 dR = dstfmt->palette->colors[*dst].r;
195                 dG = dstfmt->palette->colors[*dst].g;
196                 dB = dstfmt->palette->colors[*dst].b;
197                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
198                 dR &= 0xff;
199                 dG &= 0xff;
200                 dB &= 0xff;
201                 /* Pack RGB into 8bit pixel */
202                 if ( palmap == NULL ) {
203                     *dst =((dR>>5)<<(3+2))|
204                           ((dG>>5)<<(2))|
205                           ((dB>>6)<<(0));
206                 } else {
207                     *dst = palmap[((dR>>5)<<(3+2))|
208                                   ((dG>>5)<<(2))  |
209                                   ((dB>>6)<<(0))  ];
210                 }
211                 dst++;
212                 src += srcbpp;
213             },
214             width);
215             src += srcskip;
216             dst += dstskip;
217         }
218 }
219
220 /* colorkeyed N->1 blending with per-surface alpha */
221 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
222 {
223         int width = info->d_width;
224         int height = info->d_height;
225         Uint8 *src = info->s_pixels;
226         int srcskip = info->s_skip;
227         Uint8 *dst = info->d_pixels;
228         int dstskip = info->d_skip;
229         Uint8 *palmap = info->table;
230         SDL_PixelFormat *srcfmt = info->src;
231         SDL_PixelFormat *dstfmt = info->dst;
232         int srcbpp = srcfmt->BytesPerPixel;
233         Uint32 ckey = srcfmt->colorkey;
234
235         const int A = srcfmt->alpha;
236
237         while ( height-- ) {
238             DUFFS_LOOP(
239             {
240                 Uint32 Pixel;
241                 unsigned sR;
242                 unsigned sG;
243                 unsigned sB;
244                 unsigned dR;
245                 unsigned dG;
246                 unsigned dB;
247                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
248                 if ( Pixel != ckey ) {
249                     dR = dstfmt->palette->colors[*dst].r;
250                     dG = dstfmt->palette->colors[*dst].g;
251                     dB = dstfmt->palette->colors[*dst].b;
252                     ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
253                     dR &= 0xff;
254                     dG &= 0xff;
255                     dB &= 0xff;
256                     /* Pack RGB into 8bit pixel */
257                     if ( palmap == NULL ) {
258                         *dst =((dR>>5)<<(3+2))|
259                               ((dG>>5)<<(2)) |
260                               ((dB>>6)<<(0));
261                     } else {
262                         *dst = palmap[((dR>>5)<<(3+2))|
263                                       ((dG>>5)<<(2))  |
264                                       ((dB>>6)<<(0))  ];
265                     }
266                 }
267                 dst++;
268                 src += srcbpp;
269             },
270             width);
271             src += srcskip;
272             dst += dstskip;
273         }
274 }
275
276 #if GCC_ASMBLIT
277 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case
 *
 * Every destination pixel becomes the per-byte average of source and
 * destination, OR'ed with the destination format's Amask:
 *     ((s & 0xfefefe) + (d & 0xfefefe)) / 2 + (s & d & 0x010101)
 * Clearing the low bit of each byte before the add keeps the carry of one
 * channel from spilling into its neighbour; the (s & d & 0x010101) term adds
 * back the rounding bit lost by that masking.
 *
 * The two bodies handed to DUFFS_LOOP_DOUBLE2 implement the same formula:
 * the first one in plain C for a single pixel, the second one with MMX for
 * two pixels at a time (see their pointer increments).
 * NOTE(review): how DUFFS_LOOP_DOUBLE2 schedules the two bodies is defined
 * outside this file chunk.
 *
 * Register use: mm4 = 0x00fefefe mask (x2), mm3 = 0x00010101 mask (x2),
 * mm7 = destination alpha mask (x2).
 */
278 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
279 {
280         int width = info->d_width;
281         int height = info->d_height;
282         Uint32 *srcp = (Uint32 *)info->s_pixels;
283         int srcskip = info->s_skip >> 2; /* divided by 4: srcp advances in Uint32 steps */
284         Uint32 *dstp = (Uint32 *)info->d_pixels;
285         int dstskip = info->d_skip >> 2; /* divided by 4: dstp advances in Uint32 steps */
286         Uint32 dalpha = info->dst->Amask;
287         Uint64 load;
288
289         load = 0x00fefefe00fefefeULL;/* alpha128 mask */
290         movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
291         load = 0x0001010100010101ULL;/* !alpha128 mask */
292         movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
293         movd_m2r(dalpha, mm7); /* dst alpha mask */
294         punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
295         while(height--) {
296                 DUFFS_LOOP_DOUBLE2(
297                 {
298                         Uint32 s = *srcp++;
299                         Uint32 d = *dstp;
300                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
301                                    + (s & d & 0x00010101)) | dalpha;
302                 },{
303                         movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
304                         movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
305
306                         movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
307                         movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
308
309                         pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
310                         pand_r2r(mm4, mm5); /* src & mask -> mm5 */
311                         paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
312                         pand_r2r(mm1, mm2); /* src & dst -> mm2 */
313                         psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
314                         pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
315                         paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
316                         
317                         por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
318                         movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
319                         dstp += 2;
320                         srcp += 2;
321                 }, width);
322                 srcp += srcskip;
323                 dstp += dstskip;
324         }
325         emms(); /* leave MMX state before returning */
326 }
327
328 /* fast RGB888->(A)RGB888 blending with surface alpha */
329 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
330 {
331         SDL_PixelFormat* df = info->dst;
332         unsigned alpha = info->src->alpha;
333
334         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
335                         /* only call a128 version when R,G,B occupy lower bits */
336                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
337         } else {
338                 int width = info->d_width;
339                 int height = info->d_height;
340                 Uint32 *srcp = (Uint32 *)info->s_pixels;
341                 int srcskip = info->s_skip >> 2;
342                 Uint32 *dstp = (Uint32 *)info->d_pixels;
343                 int dstskip = info->d_skip >> 2;
344
345                 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
346                 /* form the alpha mult */
347                 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
348                 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
349                 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
350                 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
351                 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
352                 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
353                 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
354                         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
355                 movd_m2r(df->Amask, mm7); /* dst alpha mask */
356                 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
357                 
358                 while(height--) {
359                         DUFFS_LOOP_DOUBLE2({
360                                 /* One Pixel Blend */
361                                 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
362                                 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
363                                 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
364                                 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
365
366                                 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
367                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
368                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
369                                 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
370
371                                 packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
372                                 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
373                                 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
374                                 ++srcp;
375                                 ++dstp;
376                         },{
377                                 /* Two Pixels Blend */
378                                 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
379                                 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
380                                 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
381                                 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
382
383                                 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
384                                 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
385                                 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
386                                 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
387
388                                 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
389                                 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
390                                 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
391                                 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
392
393                                 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
394                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
395                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
396                                 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
397
398                                 packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
399                                 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
400                                 
401                                 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
402
403                                 srcp += 2;
404                                 dstp += 2;
405                         }, width);
406                         srcp += srcskip;
407                         dstp += dstskip;
408                 }
409                 emms();
410         }
411 }
412
413 /* fast ARGB888->(A)RGB888 blending with pixel alpha
 *
 * For every pixel the source alpha bits (*srcp & amask) select one of three
 * paths:
 *   - alpha == 0      : destination is left untouched
 *   - alpha == amask  : the source channel bits are copied, the remaining
 *                       destination bits are kept
 *   - otherwise       : per channel dst += ((src - dst) * alpha) >> 8, with
 *                       the multiplier zeroed in the alpha lane so the
 *                       destination's alpha byte survives the add
 *
 * Register use set up before the loop: mm6 = 0, mm7 = multiplier mask
 * (alpha lane cleared), mm0 = channel mask, mm3 = ~channel mask,
 * mm5 = alpha shift count.
 */
414 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
415 {
416         int width = info->d_width;
417         int height = info->d_height;
418         Uint32 *srcp = (Uint32 *)info->s_pixels;
419         int srcskip = info->s_skip >> 2; /* divided by 4: srcp advances in Uint32 steps */
420         Uint32 *dstp = (Uint32 *)info->d_pixels;
421         int dstskip = info->d_skip >> 2; /* divided by 4: dstp advances in Uint32 steps */
422         SDL_PixelFormat* sf = info->src;
423         Uint32 amask = sf->Amask;
424
425         pxor_r2r(mm6, mm6); /* 0 -> mm6 */
426         /* form multiplication mask */
427         movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
428         punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
429         pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
430         movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
431         pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
432         /* form channel masks */
433         movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
434         packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
435         packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
436         pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
437         /* get alpha channel shift */
438         __asm__ __volatile__ (
439                 "movd %0, %%mm5"
440                 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
441
442         while(height--) {
443             DUFFS_LOOP4({
444                 Uint32 alpha = *srcp & amask;
445                 /* FIXME: Here we special-case opaque alpha since the
446                         compositioning used (>>8 instead of /255) doesn't handle
447                         it correctly. Also special-case alpha=0 for speed?
448                         Benchmark this! */
449                 if(alpha == 0) {
450                         /* do nothing */
451                 } else if(alpha == amask) {
452                         /* opaque alpha -- copy RGB, keep dst alpha */
453                         /* using MMX here to free up regular registers for other things */
454                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
455                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
456                         pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
457                         pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
458                         por_r2r(mm1, mm2); /* src | dst -> mm2 */
459                         movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
460                 } else {
461                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
462                         punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
463
464                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
465                         punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
466
467                         __asm__ __volatile__ (
468                                 "movd %0, %%mm4"
469                                 : : "r" (alpha) ); /* 0000A000 -> mm4 */
470                         psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
471                         punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
472                         punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
473                         pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
474
475                         /* blend */                 
476                         psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
477                         pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
478                         psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
479                         paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
480                         
481                         packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
482                         movd_r2m(mm2, *dstp);/* mm2 -> dst */
483                 }
484                 ++srcp;
485                 ++dstp;
486             }, width);
487             srcp += srcskip;
488             dstp += dstskip;
489         }
490         emms(); /* leave MMX state before returning */
491 }
492 /* End GCC_ASMBLIT */
493
494 #elif MSVC_ASMBLIT
495 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
496 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
497 {
498         int width = info->d_width;
499         int height = info->d_height;
500         Uint32 *srcp = (Uint32 *)info->s_pixels;
501         int srcskip = info->s_skip >> 2;
502         Uint32 *dstp = (Uint32 *)info->d_pixels;
503         int dstskip = info->d_skip >> 2;
504         Uint32 dalpha = info->dst->Amask;
505
506         __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
507         
508         hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
509         lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
510         dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
511
512         while (height--) {
513                 int n = width;
514                 if ( n & 1 ) {
515                         Uint32 s = *srcp++;
516                         Uint32 d = *dstp;
517                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
518                                    + (s & d & 0x00010101)) | dalpha;
519                         n--;
520                 }
521                 
522                 for (n >>= 1; n > 0; --n) {
523                         dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
524                         dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
525
526                         src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
527                         src2 = src1; /* 2 x src -> src2(ARGBARGB) */
528
529                         dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
530                         src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
531                         src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
532                         src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
533
534                         dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
535                         dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
536                         dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
537                         dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
538                         
539                         *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
540                         dstp += 2;
541                         srcp += 2;
542                 }
543                 
544                 srcp += srcskip;
545                 dstp += dstskip;
546         }
547         _mm_empty();
548 }
549
550 /* fast RGB888->(A)RGB888 blending with surface alpha */
551 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
552 {
553         SDL_PixelFormat* df = info->dst;
554         Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
555         unsigned alpha = info->src->alpha;
556
557         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
558                         /* only call a128 version when R,G,B occupy lower bits */
559                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
560         } else {
561                 int width = info->d_width;
562                 int height = info->d_height;
563                 Uint32 *srcp = (Uint32 *)info->s_pixels;
564                 int srcskip = info->s_skip >> 2;
565                 Uint32 *dstp = (Uint32 *)info->d_pixels;
566                 int dstskip = info->d_skip >> 2;
567                 Uint32 dalpha = df->Amask;
568                 Uint32 amult;
569
570                 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
571                 
572                 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
573                 /* form the alpha mult */
574                 amult = alpha | (alpha << 8);
575                 amult = amult | (amult << 16);
576                 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
577                 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
578                 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
579                         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
580                 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
581                 
582                 while (height--) {
583                         int n = width;
584                         if (n & 1) {
585                                 /* One Pixel Blend */
586                                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
587                                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
588
589                                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
590                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
591
592                                 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
593                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
594                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
595                                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
596                                 
597                                 dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
598                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
599                                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
600
601                                 ++srcp;
602                                 ++dstp;
603                                 
604                                 n--;
605                         }
606
607                         for (n >>= 1; n > 0; --n) {
608                                 /* Two Pixels Blend */
609                                 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
610                                 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
611                                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
612                                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
613
614                                 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
615                                 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
616                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
617                                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
618
619                                 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
620                                 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
621                                 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
622                                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
623
624                                 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
625                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
626                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
627                                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
628                                 
629                                 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
630                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
631
632                                 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
633
634                                 srcp += 2;
635                                 dstp += 2;
636                         }
637                         srcp += srcskip;
638                         dstp += dstskip;
639                 }
640                 _mm_empty();
641         }
642 }
643
644 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
645 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
646 {
647         int width = info->d_width;
648         int height = info->d_height;
649         Uint32 *srcp = (Uint32 *)info->s_pixels;
650         int srcskip = info->s_skip >> 2;
651         Uint32 *dstp = (Uint32 *)info->d_pixels;
652         int dstskip = info->d_skip >> 2;
653         SDL_PixelFormat* sf = info->src;
654         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
655         Uint32 amask = sf->Amask;
656         Uint32 ashift = sf->Ashift;
657         Uint64 multmask;
658
659         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
660
661         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
662         multmask = ~(0xFFFFi64 << (ashift * 2));
663         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
664
665         while(height--) {
666                 DUFFS_LOOP4({
667                 Uint32 alpha = *srcp & amask;
668                 if (alpha == 0) {
669                         /* do nothing */
670                 } else if (alpha == amask) {
671                         /* opaque alpha -- copy RGB, keep dst alpha */
672                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
673                 } else {
674                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
675                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
676
677                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
678                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
679
680                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
681                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
682                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
683                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
684                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
685
686                         /* blend */                 
687                         src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
688                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
689                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
690                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
691                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
692                         
693                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
694                 }
695                 ++srcp;
696                 ++dstp;
697             }, width);
698             srcp += srcskip;
699             dstp += dstskip;
700         }
701         _mm_empty();
702 }
703 /* End MSVC_ASMBLIT */
704
705 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
706
707 #if SDL_ALTIVEC_BLITTERS
708 #if __MWERKS__
709 #pragma altivec_model on
710 #endif
711 #if HAVE_ALTIVEC_H
712 #include <altivec.h>
713 #endif
714 #include <assert.h>
715
/*
 * Vector literal helpers.  The brace-initializer spelling is the default;
 * a build with __MACOSX__ defined and __GNUC__ below 4 gets the
 * parenthesized spelling instead.
 */
#if !(defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#endif
727
/*
 * Low four bits of the address x: non-zero when x is not 16-byte aligned.
 * The argument is parenthesized so that an expression such as `p + 1`
 * is converted as a whole rather than having only `p` cast.
 */
#define UNALIGNED_PTR(x) (((size_t)(x)) & 0x0000000F)
/*
 * Debug helper: prints the four 32-bit words of vector v in hex,
 * prefixed by the string msg.
 */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", (msg), vp[0], vp[1], vp[2], vp[3]); \
} while (0)
734
/*
 * VEC_MERGE_PERMUTE() computes, without loading a constant from memory,
 * the permutation vector
 *     0x00, 0x10, 0x02, 0x12,
 *     0x04, 0x14, 0x06, 0x16,
 *     0x08, 0x18, 0x0A, 0x1A,
 *     0x0C, 0x1C, 0x0E, 0x1E
 * i.e. the first byte of every 16-bit element, alternating between the
 * first and the second vec_perm operand.
 */
#define VEC_MERGE_PERMUTE() \
    (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))

/* 24 in every 32-bit element, built as 12 + 12 */
#define VEC_U32_24() \
    (vec_add(vec_splat_u32(12), vec_splat_u32(12)))

/* all-ones shifted left by 24 in every 32-bit element (0xFF000000) */
#define VEC_ALPHA_MASK() \
    ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))

/*
 * Permute vector used to realign data read from src with two aligned loads.
 * For an unaligned src this is vec_lvsl(0, src).  For an aligned src the
 * indices are bumped so that every byte comes from the second vec_perm
 * operand.
 */
#define VEC_ALIGNER(src) \
    ((UNALIGNED_PTR(src)) ? vec_lvsl(0, src) \
                          : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
748
749    
/*
 * Blend vs into vd byte by byte, weighting vs by valpha and vd by
 * (255 - valpha); the result is written back to vd.
 *
 * The even- and odd-numbered bytes are multiplied separately into 16-bit
 * products (A/G and R/B for ARGB data).  Each sum x is then turned into
 * (x + 1) + ((x + 1) >> 8), and mergePermute selects which byte of every
 * 16-bit result is kept (callers in this file pass VEC_MERGE_PERMUTE()).
 * v1_16 and v8_16 must hold 1 and 8 in every 16-bit element.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* source products: even bytes (AAGG...) and odd bytes (RRBB...) */ \
    vector unsigned short vma_even = vec_mule(vs, valpha); \
    vector unsigned short vma_odd = vec_mulo(vs, valpha); \
    /* bitwise complement of alpha, i.e. 255 - alpha per byte */ \
    vector unsigned char vma_inv_alpha = vec_nor(valpha, valpha); \
    /* destination products, same even/odd split */ \
    vector unsigned short vma_dst_even = vec_mule(vd, vma_inv_alpha); \
    vector unsigned short vma_dst_odd = vec_mulo(vd, vma_inv_alpha); \
    /* source + destination contributions */ \
    vma_even = vec_add(vma_even, vma_dst_even); \
    vma_odd = vec_add(vma_odd, vma_dst_odd); \
    /* even = (even + 1) + ((even + 1) >> 8) */ \
    vma_even = vec_add(vma_even, v1_16); \
    vma_dst_even = vec_sr(vma_even, v8_16); \
    vma_even = vec_add(vma_even, vma_dst_even); \
    /* odd = (odd + 1) + ((odd + 1) >> 8) */ \
    vma_odd = vec_add(vma_odd, v1_16); \
    vma_dst_odd = vec_sr(vma_odd, v8_16); \
    vma_odd = vec_add(vma_odd, vma_dst_odd); \
    /* interleave the selected bytes back into ARGBARGBARGBARGB order */ \
    vd = (vector unsigned char)vec_perm(vma_even, vma_odd, mergePermute); \
} while (0)
775  
776 /* Calculate the permute vector used for 32->32 swizzling */
777 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
778                                   const SDL_PixelFormat *dstfmt)
779 {
780     /*
781      * We have to assume that the bits that aren't used by other
782      *  colors is alpha, and it's one complete byte, since some formats
783      *  leave alpha with a zero mask, but we should still swizzle the bits.
784      */
785     /* ARGB */
786     const static struct SDL_PixelFormat default_pixel_format = {
787         NULL, 0, 0,
788         0, 0, 0, 0,
789         16, 8, 0, 24,
790         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
791         0, 0};
792     if (!srcfmt) {
793         srcfmt = &default_pixel_format;
794     }
795     if (!dstfmt) {
796         dstfmt = &default_pixel_format;
797     }
798     const vector unsigned char plus = VECUINT8_LITERAL
799                                             ( 0x00, 0x00, 0x00, 0x00,
800                                               0x04, 0x04, 0x04, 0x04,
801                                               0x08, 0x08, 0x08, 0x08,
802                                               0x0C, 0x0C, 0x0C, 0x0C );
803     vector unsigned char vswiz;
804     vector unsigned int srcvec;
805 #define RESHIFT(X) (3 - ((X) >> 3))
806     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
807     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
808     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
809     Uint32 amask;
810     /* Use zero for alpha if either surface doesn't have alpha */
811     if (dstfmt->Amask) {
812         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
813     } else {
814         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
815     }
816 #undef RESHIFT  
817     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
818     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
819     return(vswiz);
820 }
821
822 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
823 {
824     int height = info->d_height;
825     Uint8 *src = (Uint8 *)info->s_pixels;
826     int srcskip = info->s_skip;
827     Uint8 *dst = (Uint8 *)info->d_pixels;
828     int dstskip = info->d_skip;
829     SDL_PixelFormat *srcfmt = info->src;
830
831     vector unsigned char v0 = vec_splat_u8(0);
832     vector unsigned short v8_16 = vec_splat_u16(8);
833     vector unsigned short v1_16 = vec_splat_u16(1);
834     vector unsigned short v2_16 = vec_splat_u16(2);
835     vector unsigned short v3_16 = vec_splat_u16(3);
836     vector unsigned int v8_32 = vec_splat_u32(8);
837     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
838     vector unsigned short v3f = VECUINT16_LITERAL(
839         0x003f, 0x003f, 0x003f, 0x003f,
840         0x003f, 0x003f, 0x003f, 0x003f);
841     vector unsigned short vfc = VECUINT16_LITERAL(
842         0x00fc, 0x00fc, 0x00fc, 0x00fc,
843         0x00fc, 0x00fc, 0x00fc, 0x00fc);
844
845     /* 
846         0x10 - 0x1f is the alpha
847         0x00 - 0x0e evens are the red
848         0x01 - 0x0f odds are zero
849     */
850     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
851         0x10, 0x00, 0x01, 0x01,
852         0x10, 0x02, 0x01, 0x01,
853         0x10, 0x04, 0x01, 0x01,
854         0x10, 0x06, 0x01, 0x01
855     );
856     vector unsigned char vredalpha2 = (vector unsigned char)(
857         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
858     );
859     /*
860         0x00 - 0x0f is ARxx ARxx ARxx ARxx
861         0x11 - 0x0f odds are blue
862     */
863     vector unsigned char vblue1 = VECUINT8_LITERAL(
864         0x00, 0x01, 0x02, 0x11,
865         0x04, 0x05, 0x06, 0x13,
866         0x08, 0x09, 0x0a, 0x15,
867         0x0c, 0x0d, 0x0e, 0x17
868     );
869     vector unsigned char vblue2 = (vector unsigned char)(
870         vec_add((vector unsigned int)vblue1, v8_32)
871     );
872     /*
873         0x00 - 0x0f is ARxB ARxB ARxB ARxB
874         0x10 - 0x0e evens are green
875     */
876     vector unsigned char vgreen1 = VECUINT8_LITERAL(
877         0x00, 0x01, 0x10, 0x03,
878         0x04, 0x05, 0x12, 0x07,
879         0x08, 0x09, 0x14, 0x0b,
880         0x0c, 0x0d, 0x16, 0x0f
881     );
882     vector unsigned char vgreen2 = (vector unsigned char)(
883         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
884     );
885     vector unsigned char vgmerge = VECUINT8_LITERAL(
886         0x00, 0x02, 0x00, 0x06,
887         0x00, 0x0a, 0x00, 0x0e,
888         0x00, 0x12, 0x00, 0x16,
889         0x00, 0x1a, 0x00, 0x1e);
890     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
891     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
892     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
893
894     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
895     vf800 = vec_sl(vf800, vec_splat_u16(8));
896
897     while(height--) {
898         int extrawidth;
899         vector unsigned char valigner;
900         vector unsigned char vsrc;
901         vector unsigned char voverflow;
902         int width = info->d_width;
903
904 #define ONE_PIXEL_BLEND(condition, widthvar) \
905         while (condition) { \
906             Uint32 Pixel; \
907             unsigned sR, sG, sB, dR, dG, dB, sA; \
908             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
909             if(sA) { \
910                 unsigned short dstpixel = *((unsigned short *)dst); \
911                 dR = (dstpixel >> 8) & 0xf8; \
912                 dG = (dstpixel >> 3) & 0xfc; \
913                 dB = (dstpixel << 3) & 0xf8; \
914                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
915                 *((unsigned short *)dst) = ( \
916                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
917                 ); \
918             } \
919             src += 4; \
920             dst += 2; \
921             widthvar--; \
922         }
923         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
924         extrawidth = (width % 8);
925         valigner = VEC_ALIGNER(src);
926         vsrc = (vector unsigned char)vec_ld(0, src);
927         width -= extrawidth;
928         while (width) {
929             vector unsigned char valpha;
930             vector unsigned char vsrc1, vsrc2;
931             vector unsigned char vdst1, vdst2;
932             vector unsigned short vR, vG, vB;
933             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
934
935             /* Load 8 pixels from src as ARGB */
936             voverflow = (vector unsigned char)vec_ld(15, src);
937             vsrc = vec_perm(vsrc, voverflow, valigner);
938             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
939             src += 16;
940             vsrc = (vector unsigned char)vec_ld(15, src);
941             voverflow = vec_perm(voverflow, vsrc, valigner);
942             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
943             src += 16;
944
945             /* Load 8 pixels from dst as XRGB */
946             voverflow = vec_ld(0, dst);
947             vR = vec_and((vector unsigned short)voverflow, vf800);
948             vB = vec_sl((vector unsigned short)voverflow, v3_16);
949             vG = vec_sl(vB, v2_16);
950             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
951             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
952             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
953             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
954             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
955             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
956
957             /* Alpha blend 8 pixels as ARGB */
958             valpha = vec_perm(vsrc1, v0, valphaPermute);
959             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
960             valpha = vec_perm(vsrc2, v0, valphaPermute);
961             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
962
963             /* Convert 8 pixels to 565 */
964             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
965             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
966             vgpixel = vec_and(vgpixel, vfc);
967             vgpixel = vec_sl(vgpixel, v3_16);
968             vrpixel = vec_sl(vpixel, v1_16);
969             vrpixel = vec_and(vrpixel, vf800);
970             vbpixel = vec_and(vpixel, v3f);
971             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
972             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
973             
974             /* Store 8 pixels */
975             vec_st(vdst1, 0, dst);
976
977             width -= 8;
978             dst += 16;
979         }
980         ONE_PIXEL_BLEND((extrawidth), extrawidth);
981 #undef ONE_PIXEL_BLEND
982         src += srcskip;
983         dst += dstskip;
984     }
985 }
986
987 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
988 {
989     unsigned alpha = info->src->alpha;
990     int height = info->d_height;
991     Uint32 *srcp = (Uint32 *)info->s_pixels;
992     int srcskip = info->s_skip >> 2;
993     Uint32 *dstp = (Uint32 *)info->d_pixels;
994     int dstskip = info->d_skip >> 2;
995     SDL_PixelFormat *srcfmt = info->src;
996     SDL_PixelFormat *dstfmt = info->dst;
997     unsigned sA = srcfmt->alpha;
998     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
999     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1000     Uint32 ckey = info->src->colorkey;
1001     vector unsigned char mergePermute;
1002     vector unsigned char vsrcPermute;
1003     vector unsigned char vdstPermute;
1004     vector unsigned char vsdstPermute;
1005     vector unsigned char valpha;
1006     vector unsigned char valphamask;
1007     vector unsigned char vbits;
1008     vector unsigned char v0;
1009     vector unsigned short v1;
1010     vector unsigned short v8;
1011     vector unsigned int vckey;
1012     vector unsigned int vrgbmask;
1013
1014     mergePermute = VEC_MERGE_PERMUTE();
1015     v0 = vec_splat_u8(0);
1016     v1 = vec_splat_u16(1);
1017     v8 = vec_splat_u16(8);
1018
1019     /* set the alpha to 255 on the destination surf */
1020     valphamask = VEC_ALPHA_MASK();
1021
1022     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1023     vdstPermute = calc_swizzle32(NULL, dstfmt);
1024     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1025
1026     /* set a vector full of alpha and 255-alpha */
1027     ((unsigned char *)&valpha)[0] = alpha;
1028     valpha = vec_splat(valpha, 0);
1029     vbits = (vector unsigned char)vec_splat_s8(-1);
1030
1031     ckey &= rgbmask;
1032     ((unsigned int *)(char*)&vckey)[0] = ckey;
1033     vckey = vec_splat(vckey, 0);
1034     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1035     vrgbmask = vec_splat(vrgbmask, 0);
1036
1037     while(height--) {
1038         int width = info->d_width;
1039 #define ONE_PIXEL_BLEND(condition, widthvar) \
1040         while (condition) { \
1041             Uint32 Pixel; \
1042             unsigned sR, sG, sB, dR, dG, dB; \
1043             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1044             if(sA && Pixel != ckey) { \
1045                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1046                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1047                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1048                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1049             } \
1050             dstp++; \
1051             srcp++; \
1052             widthvar--; \
1053         }
1054         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1055         if (width > 0) {
1056             int extrawidth = (width % 4);
1057             vector unsigned char valigner = VEC_ALIGNER(srcp);
1058             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1059             width -= extrawidth;
1060             while (width) {
1061                 vector unsigned char vsel;
1062                 vector unsigned char voverflow;
1063                 vector unsigned char vd;
1064                 vector unsigned char vd_orig;
1065
1066                 /* s = *srcp */
1067                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1068                 vs = vec_perm(vs, voverflow, valigner);
1069                 
1070                 /* vsel is set for items that match the key */
1071                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1072                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1073
1074                 /* permute to source format */
1075                 vs = vec_perm(vs, valpha, vsrcPermute);
1076
1077                 /* d = *dstp */
1078                 vd = (vector unsigned char)vec_ld(0, dstp);
1079                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1080
1081                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1082
1083                 /* set the alpha channel to full on */
1084                 vd = vec_or(vd, valphamask);
1085
1086                 /* mask out color key */
1087                 vd = vec_sel(vd, vd_orig, vsel);
1088                 
1089                 /* permute to dest format */
1090                 vd = vec_perm(vd, vbits, vdstPermute);
1091
1092                 /* *dstp = res */
1093                 vec_st((vector unsigned int)vd, 0, dstp);
1094                 
1095                 srcp += 4;
1096                 dstp += 4;
1097                 width -= 4;
1098                 vs = voverflow;
1099             }
1100             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1101         }
1102 #undef ONE_PIXEL_BLEND
1103  
1104         srcp += srcskip;
1105         dstp += dstskip;
1106     }
1107 }
1108
1109
1110 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1111 {
1112     int width = info->d_width;
1113     int height = info->d_height;
1114     Uint32 *srcp = (Uint32 *)info->s_pixels;
1115     int srcskip = info->s_skip >> 2;
1116     Uint32 *dstp = (Uint32 *)info->d_pixels;
1117     int dstskip = info->d_skip >> 2;
1118     SDL_PixelFormat *srcfmt = info->src;
1119     SDL_PixelFormat *dstfmt = info->dst;
1120     vector unsigned char mergePermute;
1121     vector unsigned char valphaPermute;
1122     vector unsigned char vsrcPermute;
1123     vector unsigned char vdstPermute;
1124     vector unsigned char vsdstPermute;
1125     vector unsigned char valphamask;
1126     vector unsigned char vpixelmask;
1127     vector unsigned char v0;
1128     vector unsigned short v1;
1129     vector unsigned short v8;
1130
1131     v0 = vec_splat_u8(0);
1132     v1 = vec_splat_u16(1);
1133     v8 = vec_splat_u16(8);
1134     mergePermute = VEC_MERGE_PERMUTE();
1135     valphamask = VEC_ALPHA_MASK();
1136     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1137     vpixelmask = vec_nor(valphamask, v0);
1138     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1139     vdstPermute = calc_swizzle32(NULL, dstfmt);
1140     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1141
1142         while ( height-- ) {
1143         width = info->d_width;
1144 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1145             Uint32 Pixel; \
1146             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1147             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1148             if(sA) { \
1149               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1150               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1151               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1152             } \
1153             ++srcp; \
1154             ++dstp; \
1155             widthvar--; \
1156         }
1157         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1158         if (width > 0) {
1159             /* vsrcPermute */
1160             /* vdstPermute */
1161             int extrawidth = (width % 4);
1162             vector unsigned char valigner = VEC_ALIGNER(srcp);
1163             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1164             width -= extrawidth;
1165             while (width) {
1166                 vector unsigned char voverflow;
1167                 vector unsigned char vd;
1168                 vector unsigned char valpha;
1169                 vector unsigned char vdstalpha;
1170                 /* s = *srcp */
1171                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1172                 vs = vec_perm(vs, voverflow, valigner);
1173                 vs = vec_perm(vs, v0, vsrcPermute);
1174
1175                 valpha = vec_perm(vs, v0, valphaPermute);
1176                 
1177                 /* d = *dstp */
1178                 vd = (vector unsigned char)vec_ld(0, dstp);
1179                 vd = vec_perm(vd, v0, vsdstPermute);
1180                 vdstalpha = vec_and(vd, valphamask);
1181
1182                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1183
1184                 /* set the alpha to the dest alpha */
1185                 vd = vec_and(vd, vpixelmask);
1186                 vd = vec_or(vd, vdstalpha);
1187                 vd = vec_perm(vd, v0, vdstPermute);
1188
1189                 /* *dstp = res */
1190                 vec_st((vector unsigned int)vd, 0, dstp);
1191                 
1192                 srcp += 4;
1193                 dstp += 4;
1194                 width -= 4;
1195                 vs = voverflow;
1196
1197             }
1198             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1199         }
1200             srcp += srcskip;
1201             dstp += dstskip;
1202 #undef ONE_PIXEL_BLEND
1203         }
1204 }
1205
1206 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1207 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1208 {
1209         int width = info->d_width;
1210         int height = info->d_height;
1211         Uint32 *srcp = (Uint32 *)info->s_pixels;
1212         int srcskip = info->s_skip >> 2;
1213         Uint32 *dstp = (Uint32 *)info->d_pixels;
1214         int dstskip = info->d_skip >> 2;
1215     vector unsigned char mergePermute;
1216     vector unsigned char valphaPermute;
1217     vector unsigned char valphamask;
1218     vector unsigned char vpixelmask;
1219     vector unsigned char v0;
1220     vector unsigned short v1;
1221     vector unsigned short v8;
1222     v0 = vec_splat_u8(0);
1223     v1 = vec_splat_u16(1);
1224     v8 = vec_splat_u16(8);
1225     mergePermute = VEC_MERGE_PERMUTE();
1226     valphamask = VEC_ALPHA_MASK();
1227     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1228     
1229  
1230     vpixelmask = vec_nor(valphamask, v0);
1231         while(height--) {
1232         width = info->d_width;
1233 #define ONE_PIXEL_BLEND(condition, widthvar) \
1234         while ((condition)) { \
1235             Uint32 dalpha; \
1236             Uint32 d; \
1237             Uint32 s1; \
1238             Uint32 d1; \
1239             Uint32 s = *srcp; \
1240             Uint32 alpha = s >> 24; \
1241             if(alpha) { \
1242               if(alpha == SDL_ALPHA_OPAQUE) { \
1243                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1244               } else { \
1245                 d = *dstp; \
1246                 dalpha = d & 0xff000000; \
1247                 s1 = s & 0xff00ff; \
1248                 d1 = d & 0xff00ff; \
1249                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1250                 s &= 0xff00; \
1251                 d &= 0xff00; \
1252                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1253                 *dstp = d1 | d | dalpha; \
1254               } \
1255             } \
1256             ++srcp; \
1257             ++dstp; \
1258             widthvar--; \
1259             }
1260         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1261         if (width > 0) {
1262             int extrawidth = (width % 4);
1263             vector unsigned char valigner = VEC_ALIGNER(srcp);
1264             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1265             width -= extrawidth;
1266             while (width) {
1267                 vector unsigned char voverflow;
1268                 vector unsigned char vd;
1269                 vector unsigned char valpha;
1270                 vector unsigned char vdstalpha;
1271                 /* s = *srcp */
1272                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1273                 vs = vec_perm(vs, voverflow, valigner);
1274
1275                 valpha = vec_perm(vs, v0, valphaPermute);
1276                 
1277                 /* d = *dstp */
1278                 vd = (vector unsigned char)vec_ld(0, dstp);
1279                 vdstalpha = vec_and(vd, valphamask);
1280
1281                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1282
1283                 /* set the alpha to the dest alpha */
1284                 vd = vec_and(vd, vpixelmask);
1285                 vd = vec_or(vd, vdstalpha);
1286
1287                 /* *dstp = res */
1288                 vec_st((vector unsigned int)vd, 0, dstp);
1289                 
1290                 srcp += 4;
1291                 dstp += 4;
1292                 width -= 4;
1293                 vs = voverflow;
1294             }
1295             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1296         }
1297             srcp += srcskip;
1298             dstp += dstskip;
1299         }
1300 #undef ONE_PIXEL_BLEND
1301 }
1302
1303 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1304 {
1305     /* XXX : 6 */
1306         unsigned alpha = info->src->alpha;
1307     int height = info->d_height;
1308     Uint32 *srcp = (Uint32 *)info->s_pixels;
1309     int srcskip = info->s_skip >> 2;
1310     Uint32 *dstp = (Uint32 *)info->d_pixels;
1311     int dstskip = info->d_skip >> 2;
1312     SDL_PixelFormat *srcfmt = info->src;
1313     SDL_PixelFormat *dstfmt = info->dst;
1314         unsigned sA = srcfmt->alpha;
1315         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1316     vector unsigned char mergePermute;
1317     vector unsigned char vsrcPermute;
1318     vector unsigned char vdstPermute;
1319     vector unsigned char vsdstPermute;
1320     vector unsigned char valpha;
1321     vector unsigned char valphamask;
1322     vector unsigned char vbits;
1323     vector unsigned short v1;
1324     vector unsigned short v8;
1325
1326     mergePermute = VEC_MERGE_PERMUTE();
1327     v1 = vec_splat_u16(1);
1328     v8 = vec_splat_u16(8);
1329
1330     /* set the alpha to 255 on the destination surf */
1331     valphamask = VEC_ALPHA_MASK();
1332
1333     vsrcPermute = calc_swizzle32(srcfmt, NULL);
1334     vdstPermute = calc_swizzle32(NULL, dstfmt);
1335     vsdstPermute = calc_swizzle32(dstfmt, NULL);
1336
1337     /* set a vector full of alpha and 255-alpha */
1338     ((unsigned char *)&valpha)[0] = alpha;
1339     valpha = vec_splat(valpha, 0);
1340     vbits = (vector unsigned char)vec_splat_s8(-1);
1341
1342     while(height--) {
1343         int width = info->d_width;
1344 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1345             Uint32 Pixel; \
1346             unsigned sR, sG, sB, dR, dG, dB; \
1347             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1348             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1349             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1350             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1351             ++srcp; \
1352             ++dstp; \
1353             widthvar--; \
1354         }
1355         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1356         if (width > 0) {
1357             int extrawidth = (width % 4);
1358             vector unsigned char valigner = VEC_ALIGNER(srcp);
1359             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1360             width -= extrawidth;
1361             while (width) {
1362                 vector unsigned char voverflow;
1363                 vector unsigned char vd;
1364
1365                 /* s = *srcp */
1366                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1367                 vs = vec_perm(vs, voverflow, valigner);
1368                 vs = vec_perm(vs, valpha, vsrcPermute);
1369                 
1370                 /* d = *dstp */
1371                 vd = (vector unsigned char)vec_ld(0, dstp);
1372                 vd = vec_perm(vd, vd, vsdstPermute);
1373
1374                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1375
1376                 /* set the alpha channel to full on */
1377                 vd = vec_or(vd, valphamask);
1378                 vd = vec_perm(vd, vbits, vdstPermute);
1379
1380                 /* *dstp = res */
1381                 vec_st((vector unsigned int)vd, 0, dstp);
1382                 
1383                 srcp += 4;
1384                 dstp += 4;
1385                 width -= 4;
1386                 vs = voverflow;
1387             }
1388             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1389         }
1390 #undef ONE_PIXEL_BLEND
1391  
1392         srcp += srcskip;
1393         dstp += dstskip;
1394     }
1395
1396 }
1397
1398
1399 /* fast RGB888->(A)RGB888 blending */
1400 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1401 {
1402         unsigned alpha = info->src->alpha;
1403     int height = info->d_height;
1404     Uint32 *srcp = (Uint32 *)info->s_pixels;
1405     int srcskip = info->s_skip >> 2;
1406     Uint32 *dstp = (Uint32 *)info->d_pixels;
1407     int dstskip = info->d_skip >> 2;
1408     vector unsigned char mergePermute;
1409     vector unsigned char valpha;
1410     vector unsigned char valphamask;
1411     vector unsigned short v1;
1412     vector unsigned short v8;
1413
1414     mergePermute = VEC_MERGE_PERMUTE();
1415     v1 = vec_splat_u16(1);
1416     v8 = vec_splat_u16(8);
1417
1418     /* set the alpha to 255 on the destination surf */
1419     valphamask = VEC_ALPHA_MASK();
1420
1421     /* set a vector full of alpha and 255-alpha */
1422     ((unsigned char *)&valpha)[0] = alpha;
1423     valpha = vec_splat(valpha, 0);
1424
1425     while(height--) {
1426         int width = info->d_width;
1427 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1428             Uint32 s = *srcp; \
1429             Uint32 d = *dstp; \
1430             Uint32 s1 = s & 0xff00ff; \
1431             Uint32 d1 = d & 0xff00ff; \
1432             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1433                  & 0xff00ff; \
1434             s &= 0xff00; \
1435             d &= 0xff00; \
1436             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1437             *dstp = d1 | d | 0xff000000; \
1438             ++srcp; \
1439             ++dstp; \
1440             widthvar--; \
1441         }
1442         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1443         if (width > 0) {
1444             int extrawidth = (width % 4);
1445             vector unsigned char valigner = VEC_ALIGNER(srcp);
1446             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1447             width -= extrawidth;
1448             while (width) {
1449                 vector unsigned char voverflow;
1450                 vector unsigned char vd;
1451
1452                 /* s = *srcp */
1453                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1454                 vs = vec_perm(vs, voverflow, valigner);
1455                 
1456                 /* d = *dstp */
1457                 vd = (vector unsigned char)vec_ld(0, dstp);
1458
1459                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1460
1461                 /* set the alpha channel to full on */
1462                 vd = vec_or(vd, valphamask);
1463
1464                 /* *dstp = res */
1465                 vec_st((vector unsigned int)vd, 0, dstp);
1466                 
1467                 srcp += 4;
1468                 dstp += 4;
1469                 width -= 4;
1470                 vs = voverflow;
1471             }
1472             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1473         }
1474 #undef ONE_PIXEL_BLEND
1475  
1476         srcp += srcskip;
1477         dstp += dstskip;
1478     }
1479 }
1480 #if __MWERKS__
1481 #pragma altivec_model off
1482 #endif
1483 #endif /* SDL_ALTIVEC_BLITTERS */
1484
1485 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1486 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1487 {
1488         int width = info->d_width;
1489         int height = info->d_height;
1490         Uint32 *srcp = (Uint32 *)info->s_pixels;
1491         int srcskip = info->s_skip >> 2;
1492         Uint32 *dstp = (Uint32 *)info->d_pixels;
1493         int dstskip = info->d_skip >> 2;
1494
1495         while(height--) {
1496             DUFFS_LOOP4({
1497                     Uint32 s = *srcp++;
1498                     Uint32 d = *dstp;
1499                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1500                                + (s & d & 0x00010101)) | 0xff000000;
1501             }, width);
1502             srcp += srcskip;
1503             dstp += dstskip;
1504         }
1505 }
1506
1507 /* fast RGB888->(A)RGB888 blending with surface alpha */
1508 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1509 {
1510         unsigned alpha = info->src->alpha;
1511         if(alpha == 128) {
1512                 BlitRGBtoRGBSurfaceAlpha128(info);
1513         } else {
1514                 int width = info->d_width;
1515                 int height = info->d_height;
1516                 Uint32 *srcp = (Uint32 *)info->s_pixels;
1517                 int srcskip = info->s_skip >> 2;
1518                 Uint32 *dstp = (Uint32 *)info->d_pixels;
1519                 int dstskip = info->d_skip >> 2;
1520                 Uint32 s;
1521                 Uint32 d;
1522                 Uint32 s1;
1523                 Uint32 d1;
1524
1525                 while(height--) {
1526                         DUFFS_LOOP_DOUBLE2({
1527                                 /* One Pixel Blend */
1528                                 s = *srcp;
1529                                 d = *dstp;
1530                                 s1 = s & 0xff00ff;
1531                                 d1 = d & 0xff00ff;
1532                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1533                                      & 0xff00ff;
1534                                 s &= 0xff00;
1535                                 d &= 0xff00;
1536                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1537                                 *dstp = d1 | d | 0xff000000;
1538                                 ++srcp;
1539                                 ++dstp;
1540                         },{
1541                                 /* Two Pixels Blend */
1542                                 s = *srcp;
1543                                 d = *dstp;
1544                                 s1 = s & 0xff00ff;
1545                                 d1 = d & 0xff00ff;
1546                                 d1 += (s1 - d1) * alpha >> 8;
1547                                 d1 &= 0xff00ff;
1548                                      
1549                                 s = ((s & 0xff00) >> 8) | 
1550                                         ((srcp[1] & 0xff00) << 8);
1551                                 d = ((d & 0xff00) >> 8) |
1552                                         ((dstp[1] & 0xff00) << 8);
1553                                 d += (s - d) * alpha >> 8;
1554                                 d &= 0x00ff00ff;
1555                                 
1556                                 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1557                                 ++srcp;
1558                                 
1559                                 s1 = *srcp;
1560                                 d1 = *dstp;
1561                                 s1 &= 0xff00ff;
1562                                 d1 &= 0xff00ff;
1563                                 d1 += (s1 - d1) * alpha >> 8;
1564                                 d1 &= 0xff00ff;
1565                                 
1566                                 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1567                                 ++srcp;
1568                                 ++dstp;
1569                         }, width);
1570                         srcp += srcskip;
1571                         dstp += dstskip;
1572                 }
1573         }
1574 }
1575
1576 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1577 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1578 {
1579         int width = info->d_width;
1580         int height = info->d_height;
1581         Uint32 *srcp = (Uint32 *)info->s_pixels;
1582         int srcskip = info->s_skip >> 2;
1583         Uint32 *dstp = (Uint32 *)info->d_pixels;
1584         int dstskip = info->d_skip >> 2;
1585
1586         while(height--) {
1587             DUFFS_LOOP4({
1588                 Uint32 dalpha;
1589                 Uint32 d;
1590                 Uint32 s1;
1591                 Uint32 d1;
1592                 Uint32 s = *srcp;
1593                 Uint32 alpha = s >> 24;
1594                 /* FIXME: Here we special-case opaque alpha since the
1595                    compositioning used (>>8 instead of /255) doesn't handle
1596                    it correctly. Also special-case alpha=0 for speed?
1597                    Benchmark this! */
1598                 if(alpha) {   
1599                   if(alpha == SDL_ALPHA_OPAQUE) {
1600                     *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1601                   } else {
1602                     /*
1603                      * take out the middle component (green), and process
1604                      * the other two in parallel. One multiply less.
1605                      */
1606                     d = *dstp;
1607                     dalpha = d & 0xff000000;
1608                     s1 = s & 0xff00ff;
1609                     d1 = d & 0xff00ff;
1610                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1611                     s &= 0xff00;
1612                     d &= 0xff00;
1613                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1614                     *dstp = d1 | d | dalpha;
1615                   }
1616                 }
1617                 ++srcp;
1618                 ++dstp;
1619             }, width);
1620             srcp += srcskip;
1621             dstp += dstskip;
1622         }
1623 }
1624
1625 #if GCC_ASMBLIT
1626 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1627 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1628 {
1629         int width = info->d_width;
1630         int height = info->d_height;
1631         Uint32 *srcp = (Uint32 *)info->s_pixels;
1632         int srcskip = info->s_skip >> 2;
1633         Uint32 *dstp = (Uint32 *)info->d_pixels;
1634         int dstskip = info->d_skip >> 2;
1635         SDL_PixelFormat* sf = info->src;
1636         Uint32 amask = sf->Amask;
1637
1638         __asm__ (
1639         /* make mm6 all zeros. */
1640         "pxor       %%mm6, %%mm6\n"
1641         
1642         /* Make a mask to preserve the alpha. */
1643         "movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1644         "punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1645         "pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1646         "movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1647         "pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1648
1649         /* form channel masks */
1650         "movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1651         "packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1652         "packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1653         "pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1654         
1655         /* get alpha channel shift */
1656         "movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1657
1658           : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1659
1660         while(height--) {
1661
1662             DUFFS_LOOP4({
1663                 Uint32 alpha;
1664
1665                 __asm__ (
1666                 "prefetch 64(%0)\n"
1667                 "prefetch 64(%1)\n"
1668                         : : "r" (srcp), "r" (dstp) );
1669
1670                 alpha = *srcp & amask;
1671                 /* FIXME: Here we special-case opaque alpha since the
1672                    compositioning used (>>8 instead of /255) doesn't handle
1673                    it correctly. Also special-case alpha=0 for speed?
1674                    Benchmark this! */
1675                 if(alpha == 0) {
1676                     /* do nothing */
1677                 }
1678                 else if(alpha == amask) {
1679                         /* opaque alpha -- copy RGB, keep dst alpha */
1680                     /* using MMX here to free up regular registers for other things */
1681                             __asm__ (
1682                     "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1683                     "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1684                     "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1685                     "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1686                     "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1687                     "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1688
1689                      : : "r" (srcp), "r" (dstp) );
1690                 } 
1691
1692                 else {
1693                             __asm__ (
1694                     /* load in the source, and dst. */
1695                     "movd      (%0), %%mm0\n"               /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1696                     "movd      (%1), %%mm1\n"               /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1697
1698                     /* Move the src alpha into mm2 */
1699
1700                     /* if supporting pshufw */
1701                     /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1702                     /*"psrlw     $8, %%mm2\n" */
1703                     
1704                     /* else: */
1705                     "movd       %2,    %%mm2\n"
1706                     "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1707                     "punpcklwd  %%mm2, %%mm2\n"             /* mm2 = 0 0 0 0 |  0 As  0  As */
1708                     "punpckldq  %%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1709                     "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1710
1711                     /* move the colors into words. */
1712                     "punpcklbw %%mm6, %%mm0\n"              /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1713                     "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1714
1715                     /* src - dst */
1716                     "psubw    %%mm1, %%mm0\n"               /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1717
1718                     /* A * (src-dst) */
1719                     "pmullw    %%mm2, %%mm0\n"              /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1720                     "psrlw     $8,    %%mm0\n"              /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1721                     "paddb     %%mm1, %%mm0\n"              /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1722
1723                     "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1724                     
1725                     "movd      %%mm0, (%1)\n"               /* result in mm0 */
1726
1727                      : : "r" (srcp), "r" (dstp), "r" (alpha) );
1728
1729                 }
1730                 ++srcp;
1731                 ++dstp;
1732             }, width);
1733             srcp += srcskip;
1734             dstp += dstskip;
1735         }
1736
1737         __asm__ (
1738         "emms\n"
1739                 :   );
1740 }
1741 /* End GCC_ASMBLIT*/
1742
1743 #elif MSVC_ASMBLIT
1744 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1745 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1746 {
1747         int width = info->d_width;
1748         int height = info->d_height;
1749         Uint32 *srcp = (Uint32 *)info->s_pixels;
1750         int srcskip = info->s_skip >> 2;
1751         Uint32 *dstp = (Uint32 *)info->d_pixels;
1752         int dstskip = info->d_skip >> 2;
1753         SDL_PixelFormat* sf = info->src;
1754         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1755         Uint32 amask = sf->Amask;
1756         Uint32 ashift = sf->Ashift;
1757         Uint64 multmask;
1758         
1759         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1760
1761         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1762         multmask = ~(0xFFFFi64 << (ashift * 2));
1763         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1764
1765         while(height--) {
1766             DUFFS_LOOP4({
1767                 Uint32 alpha;
1768
1769                 _m_prefetch(srcp + 16);
1770                 _m_prefetch(dstp + 16);
1771
1772                 alpha = *srcp & amask;
1773                 if (alpha == 0) {
1774                         /* do nothing */
1775                 } else if (alpha == amask) {
1776                         /* copy RGB, keep dst alpha */
1777                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1778                 } else {
1779                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1780                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1781
1782                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1783                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1784
1785                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1786                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1787                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1788                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1789                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1790
1791                         /* blend */                 
1792                         src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1793                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1794                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1795                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1796                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1797                         
1798                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1799                 }
1800                 ++srcp;
1801                 ++dstp;
1802             }, width);
1803             srcp += srcskip;
1804             dstp += dstskip;
1805         }
1806         _mm_empty();
1807 }
1808 /* End MSVC_ASMBLIT */
1809
1810 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1811
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/*
 * Blend a single 16 bit pixel at 50%.
 * Bits selected by `mask` are summed and halved; the bits outside `mask`
 * contribute only where they are set in both pixels.
 * All arguments are parenthesized so expressions may be passed safely.
 */
#define BLEND16_50(d, s, mask)                                          \
        (((((s) & (mask)) + ((d) & (mask))) >> 1)                       \
         + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * 16-bit `mask` replicated into both halves of a 32-bit word.
 * The cast to Uint32 comes first: a Uint16 mask promotes to (signed) int,
 * and shifting e.g. 0xf7de left by 16 would overflow it.
 */
#define BLEND2x16_MASK(mask)                                            \
        ((Uint32)(mask) | ((Uint32)(mask) << 16))

/* blend two 16 bit pixels (packed in one 32 bit word) at 50% */
#define BLEND2x16_50(d, s, mask)                                        \
        ((((s) & BLEND2x16_MASK(mask)) >> 1)                            \
         + (((d) & BLEND2x16_MASK(mask)) >> 1)                          \
         + ((s) & (d) & ~BLEND2x16_MASK(mask)))
1822
1823 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1824 {
1825         int width = info->d_width;
1826         int height = info->d_height;
1827         Uint16 *srcp = (Uint16 *)info->s_pixels;
1828         int srcskip = info->s_skip >> 1;
1829         Uint16 *dstp = (Uint16 *)info->d_pixels;
1830         int dstskip = info->d_skip >> 1;
1831
1832         while(height--) {
1833                 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1834                         /*
1835                          * Source and destination not aligned, pipeline it.
1836                          * This is mostly a win for big blits but no loss for
1837                          * small ones
1838                          */
1839                         Uint32 prev_sw;
1840                         int w = width;
1841
1842                         /* handle odd destination */
1843                         if((uintptr_t)dstp & 2) {
1844                                 Uint16 d = *dstp, s = *srcp;
1845                                 *dstp = BLEND16_50(d, s, mask);
1846                                 dstp++;
1847                                 srcp++;
1848                                 w--;
1849                         }
1850                         srcp++; /* srcp is now 32-bit aligned */
1851
1852                         /* bootstrap pipeline with first halfword */
1853                         prev_sw = ((Uint32 *)srcp)[-1];
1854
1855                         while(w > 1) {
1856                                 Uint32 sw, dw, s;
1857                                 sw = *(Uint32 *)srcp;
1858                                 dw = *(Uint32 *)dstp;
1859 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1860                                 s = (prev_sw << 16) + (sw >> 16);
1861 #else
1862                                 s = (prev_sw >> 16) + (sw << 16);
1863 #endif
1864                                 prev_sw = sw;
1865                                 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1866                                 dstp += 2;
1867                                 srcp += 2;
1868                                 w -= 2;
1869                         }
1870
1871                         /* final pixel if any */
1872                         if(w) {
1873                                 Uint16 d = *dstp, s;
1874 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1875                                 s = (Uint16)prev_sw;
1876 #else
1877                                 s = (Uint16)(prev_sw >> 16);
1878 #endif
1879                                 *dstp = BLEND16_50(d, s, mask);
1880                                 srcp++;
1881                                 dstp++;
1882                         }
1883                         srcp += srcskip - 1;
1884                         dstp += dstskip;
1885                 } else {
1886                         /* source and destination are aligned */
1887                         int w = width;
1888
1889                         /* first odd pixel? */
1890                         if((uintptr_t)srcp & 2) {
1891                                 Uint16 d = *dstp, s = *srcp;
1892                                 *dstp = BLEND16_50(d, s, mask);
1893                                 srcp++;
1894                                 dstp++;
1895                                 w--;
1896                         }
1897                         /* srcp and dstp are now 32-bit aligned */
1898
1899                         while(w > 1) {
1900                                 Uint32 sw = *(Uint32 *)srcp;
1901                                 Uint32 dw = *(Uint32 *)dstp;
1902                                 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1903                                 srcp += 2;
1904                                 dstp += 2;
1905                                 w -= 2;
1906                         }
1907
1908                         /* last odd pixel? */
1909                         if(w) {
1910                                 Uint16 d = *dstp, s = *srcp;
1911                                 *dstp = BLEND16_50(d, s, mask);
1912                                 srcp++;
1913                                 dstp++;
1914                         }
1915                         srcp += srcskip;
1916                         dstp += dstskip;
1917                 }
1918         }
1919 }
1920
1921 #if GCC_ASMBLIT
1922 /* fast RGB565->RGB565 blending with surface alpha */
1923 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1924 {
1925         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1926         if(alpha == 128) {
1927                 Blit16to16SurfaceAlpha128(info, 0xf7de);
1928         } else {
1929                 int width = info->d_width;
1930                 int height = info->d_height;
1931                 Uint16 *srcp = (Uint16 *)info->s_pixels;
1932                 int srcskip = info->s_skip >> 1;
1933                 Uint16 *dstp = (Uint16 *)info->d_pixels;
1934                 int dstskip = info->d_skip >> 1;
1935                 Uint32 s, d;
1936                 Uint64 load;
1937           
1938                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
1939                 load = alpha;
1940                 alpha >>= 3;            /* downscale alpha to 5 bits */
1941
1942                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1943                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1944                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1945                 /* position alpha to allow for mullo and mulhi on diff channels
1946                    to reduce the number of operations */
1947                 psllq_i2r(3, mm0);
1948           
1949                 /* Setup the 565 color channel masks */
1950                 load = 0x07E007E007E007E0ULL;
1951                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1952                 load = 0x001F001F001F001FULL;
1953                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1954                 while(height--) {
1955                         DUFFS_LOOP_QUATRO2(
1956                         {
1957                                 s = *srcp++;
1958                                 d = *dstp;
1959                                 /*
1960                                  * shift out the middle component (green) to
1961                                  * the high 16 bits, and process all three RGB
1962                                  * components at the same time.
1963                                  */
1964                                 s = (s | s << 16) & 0x07e0f81f;
1965                                 d = (d | d << 16) & 0x07e0f81f;
1966                                 d += (s - d) * alpha >> 5;
1967                                 d &= 0x07e0f81f;
1968                                 *dstp++ = d | d >> 16;
1969                         },{
1970                                 s = *srcp++;
1971                                 d = *dstp;
1972                                 /*
1973                                  * shift out the middle component (green) to
1974                                  * the high 16 bits, and process all three RGB
1975                                  * components at the same time.
1976                                  */
1977                                 s = (s | s << 16) & 0x07e0f81f;
1978                                 d = (d | d << 16) & 0x07e0f81f;
1979                                 d += (s - d) * alpha >> 5;
1980                                 d &= 0x07e0f81f;
1981                                 *dstp++ = d | d >> 16;
1982                                 s = *srcp++;
1983                                 d = *dstp;
1984                                 /*
1985                                  * shift out the middle component (green) to
1986                                  * the high 16 bits, and process all three RGB
1987                                  * components at the same time.
1988                                  */
1989                                 s = (s | s << 16) & 0x07e0f81f;
1990                                 d = (d | d << 16) & 0x07e0f81f;
1991                                 d += (s - d) * alpha >> 5;
1992                                 d &= 0x07e0f81f;
1993                                 *dstp++ = d | d >> 16;
1994                         },{
1995                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1996                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1997
1998                                 /* red -- does not need a mask since the right shift clears
1999                                    the uninteresting bits */
2000                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2001                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2002                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2003                                 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2004
2005                                 /* blend */
2006                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2007                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2008                                 /* alpha used is actually 11 bits
2009                                    11 + 5 = 16 bits, so the sign bits are lost */
2010                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2011                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2012                                 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2013
2014                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2015
2016                                 /* green -- process the bits in place */
2017                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2018                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2019                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2020                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2021
2022                                 /* blend */
2023                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2024                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2025                                 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2026                                    bits are gone and the sign bits present */
2027                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2028                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2029
2030                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2031
2032                                 /* blue */
2033                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2034                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2035                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2036                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2037
2038                                 /* blend */
2039                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2040                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2041                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2042                                    the interesting bits will need to be MASKed */
2043                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2044                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2045                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2046
2047                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2048
2049                                 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2050
2051                                 srcp += 4;
2052                                 dstp += 4;
2053                         }, width);                      
2054                         srcp += srcskip;
2055                         dstp += dstskip;
2056                 }
2057                 emms();
2058         }
2059 }
2060
2061 /* fast RGB555->RGB555 blending with surface alpha */
2062 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2063 {
2064         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2065         if(alpha == 128) {
2066                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2067         } else {
2068                 int width = info->d_width;
2069                 int height = info->d_height;
2070                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2071                 int srcskip = info->s_skip >> 1;
2072                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2073                 int dstskip = info->d_skip >> 1;
2074                 Uint32 s, d;
2075                 Uint64 load;
2076           
2077                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2078                 load = alpha;
2079                 alpha >>= 3;            /* downscale alpha to 5 bits */
2080
2081                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2082                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2083                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2084                 /* position alpha to allow for mullo and mulhi on diff channels
2085                    to reduce the number of operations */
2086                 psllq_i2r(3, mm0);
2087
2088                 /* Setup the 555 color channel masks */
2089                 load = 0x03E003E003E003E0ULL;
2090                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2091                 load = 0x001F001F001F001FULL;
2092                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2093                 while(height--) {
2094                         DUFFS_LOOP_QUATRO2(
2095                         {
2096                                 s = *srcp++;
2097                                 d = *dstp;
2098                                 /*
2099                                  * shift out the middle component (green) to
2100                                  * the high 16 bits, and process all three RGB
2101                                  * components at the same time.
2102                                  */
2103                                 s = (s | s << 16) & 0x03e07c1f;
2104                                 d = (d | d << 16) & 0x03e07c1f;
2105                                 d += (s - d) * alpha >> 5;
2106                                 d &= 0x03e07c1f;
2107                                 *dstp++ = d | d >> 16;
2108                         },{
2109                                 s = *srcp++;
2110                                 d = *dstp;
2111                                 /*
2112                                  * shift out the middle component (green) to
2113                                  * the high 16 bits, and process all three RGB
2114                                  * components at the same time.
2115                                  */
2116                                 s = (s | s << 16) & 0x03e07c1f;
2117                                 d = (d | d << 16) & 0x03e07c1f;
2118                                 d += (s - d) * alpha >> 5;
2119                                 d &= 0x03e07c1f;
2120                                 *dstp++ = d | d >> 16;
2121                                 s = *srcp++;
2122                                 d = *dstp;
2123                                 /*
2124                                  * shift out the middle component (green) to
2125                                  * the high 16 bits, and process all three RGB
2126                                  * components at the same time.
2127                                  */
2128                                 s = (s | s << 16) & 0x03e07c1f;
2129                                 d = (d | d << 16) & 0x03e07c1f;
2130                                 d += (s - d) * alpha >> 5;
2131                                 d &= 0x03e07c1f;
2132                                 *dstp++ = d | d >> 16;
2133                         },{
2134                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2135                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2136
2137                                 /* red -- process the bits in place */
2138                                 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2139                                         /* by reusing the GREEN mask we free up another mmx
2140                                            register to accumulate the result */
2141
2142                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2143                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2144                                 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2145                                 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2146
2147                                 /* blend */
2148                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2149                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2150                                 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2151                                    cleared by a MASK below */
2152                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2153                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2154                                 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2155
2156                                 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2157
2158                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2159
2160                                 /* green -- process the bits in place */
2161                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2162                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2163                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2164                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2165
2166                                 /* blend */
2167                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2168                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2169                                 /* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2170                                    bits are gone and the sign bits present */
2171                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2172                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2173
2174                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2175
2176                                 /* blue */
2177                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2178                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2179                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2180                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2181
2182                                 /* blend */
2183                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2184                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2185                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2186                                    the interesting bits will need to be MASKed */
2187                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2188                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2189                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2190
2191                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2192
2193                                 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2194
2195                                 srcp += 4;
2196                                 dstp += 4;
2197                         }, width);                      
2198                         srcp += srcskip;
2199                         dstp += dstskip;
2200                 }
2201                 emms();
2202         }
2203 }
2204 /* End GCC_ASMBLIT */
2205
2206 #elif MSVC_ASMBLIT
2207 /* fast RGB565->RGB565 blending with surface alpha */
2208 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2209 {
2210         unsigned alpha = info->src->alpha;
2211         if(alpha == 128) {
2212                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2213         } else {
2214                 int width = info->d_width;
2215                 int height = info->d_height;
2216                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2217                 int srcskip = info->s_skip >> 1;
2218                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2219                 int dstskip = info->d_skip >> 1;
2220                 Uint32 s, d;
2221           
2222                 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2223
2224                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2225                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2226                 alpha >>= 3;            /* downscale alpha to 5 bits */
2227
2228                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2229                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2230                 /* position alpha to allow for mullo and mulhi on diff channels
2231                    to reduce the number of operations */
2232                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2233           
2234                 /* Setup the 565 color channel masks */
2235                 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2236                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2237                 
2238                 while(height--) {
2239                         DUFFS_LOOP_QUATRO2(
2240                         {
2241                                 s = *srcp++;
2242                                 d = *dstp;
2243                                 /*
2244                                  * shift out the middle component (green) to
2245                                  * the high 16 bits, and process all three RGB
2246                                  * components at the same time.
2247                                  */
2248                                 s = (s | s << 16) & 0x07e0f81f;
2249                                 d = (d | d << 16) & 0x07e0f81f;
2250                                 d += (s - d) * alpha >> 5;
2251                                 d &= 0x07e0f81f;
2252                                 *dstp++ = (Uint16)(d | d >> 16);
2253                         },{
2254                                 s = *srcp++;
2255                                 d = *dstp;
2256                                 /*
2257                                  * shift out the middle component (green) to
2258                                  * the high 16 bits, and process all three RGB
2259                                  * components at the same time.
2260                                  */
2261                                 s = (s | s << 16) & 0x07e0f81f;
2262                                 d = (d | d << 16) & 0x07e0f81f;
2263                                 d += (s - d) * alpha >> 5;
2264                                 d &= 0x07e0f81f;
2265                                 *dstp++ = (Uint16)(d | d >> 16);
2266                                 s = *srcp++;
2267                                 d = *dstp;
2268                                 /*
2269                                  * shift out the middle component (green) to
2270                                  * the high 16 bits, and process all three RGB
2271                                  * components at the same time.
2272                                  */
2273                                 s = (s | s << 16) & 0x07e0f81f;
2274                                 d = (d | d << 16) & 0x07e0f81f;
2275                                 d += (s - d) * alpha >> 5;
2276                                 d &= 0x07e0f81f;
2277                                 *dstp++ = (Uint16)(d | d >> 16);
2278                         },{
2279                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2280                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2281
2282                                 /* red */
2283                                 src2 = src1;
2284                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2285
2286                                 dst2 = dst1;
2287                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2288
2289                                 /* blend */
2290                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2291                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2292                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2293                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2294                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2295
2296                                 mm_res = dst2; /* RED -> mm_res */
2297
2298                                 /* green -- process the bits in place */
2299                                 src2 = src1;
2300                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2301
2302                                 dst2 = dst1;
2303                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2304
2305                                 /* blend */
2306                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2307                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2308                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2309                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2310
2311                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2312
2313                                 /* blue */
2314                                 src2 = src1;
2315                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2316
2317                                 dst2 = dst1;
2318                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2319
2320                                 /* blend */
2321                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2322                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2323                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2324                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2325                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2326
2327                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2328
2329                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2330
2331                                 srcp += 4;
2332                                 dstp += 4;
2333                         }, width);                      
2334                         srcp += srcskip;
2335                         dstp += dstskip;
2336                 }
2337                 _mm_empty();
2338         }
2339 }
2340
2341 /* fast RGB555->RGB555 blending with surface alpha */
2342 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2343 {
2344         unsigned alpha = info->src->alpha;
2345         if(alpha == 128) {
2346                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2347         } else {
2348                 int width = info->d_width;
2349                 int height = info->d_height;
2350                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2351                 int srcskip = info->s_skip >> 1;
2352                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2353                 int dstskip = info->d_skip >> 1;
2354                 Uint32 s, d;
2355           
2356                 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2357
2358                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2359                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2360                 alpha >>= 3;            /* downscale alpha to 5 bits */
2361
2362                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2363                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2364                 /* position alpha to allow for mullo and mulhi on diff channels
2365                    to reduce the number of operations */
2366                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2367           
2368                 /* Setup the 555 color channel masks */
2369                 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2370                 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2371                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2372
2373                 while(height--) {
2374                         DUFFS_LOOP_QUATRO2(
2375                         {
2376                                 s = *srcp++;
2377                                 d = *dstp;
2378                                 /*
2379                                  * shift out the middle component (green) to
2380                                  * the high 16 bits, and process all three RGB
2381                                  * components at the same time.
2382                                  */
2383                                 s = (s | s << 16) & 0x03e07c1f;
2384                                 d = (d | d << 16) & 0x03e07c1f;
2385                                 d += (s - d) * alpha >> 5;
2386                                 d &= 0x03e07c1f;
2387                                 *dstp++ = (Uint16)(d | d >> 16);
2388                         },{
2389                                 s = *srcp++;
2390                                 d = *dstp;
2391                                 /*
2392                                  * shift out the middle component (green) to
2393                                  * the high 16 bits, and process all three RGB
2394                                  * components at the same time.
2395                                  */
2396                                 s = (s | s << 16) & 0x03e07c1f;
2397                                 d = (d | d << 16) & 0x03e07c1f;
2398                                 d += (s - d) * alpha >> 5;
2399                                 d &= 0x03e07c1f;
2400                                 *dstp++ = (Uint16)(d | d >> 16);
2401                                 s = *srcp++;
2402                                 d = *dstp;
2403                                 /*
2404                                  * shift out the middle component (green) to
2405                                  * the high 16 bits, and process all three RGB
2406                                  * components at the same time.
2407                                  */
2408                                 s = (s | s << 16) & 0x03e07c1f;
2409                                 d = (d | d << 16) & 0x03e07c1f;
2410                                 d += (s - d) * alpha >> 5;
2411                                 d &= 0x03e07c1f;
2412                                 *dstp++ = (Uint16)(d | d >> 16);
2413                         },{
2414                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2415                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2416
2417                                 /* red -- process the bits in place */
2418                                 src2 = src1;
2419                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2420
2421                                 dst2 = dst1;
2422                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2423
2424                                 /* blend */
2425                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2426                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2427                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2428                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2429                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2430
2431                                 mm_res = dst2; /* RED -> mm_res */
2432                                 
2433                                 /* green -- process the bits in place */
2434                                 src2 = src1;
2435                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2436
2437                                 dst2 = dst1;
2438                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2439
2440                                 /* blend */
2441                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2442                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2443                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2444                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2445
2446                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2447
2448                                 /* blue */
2449                                 src2 = src1; /* src -> src2 */
2450                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2451
2452                                 dst2 = dst1; /* dst -> dst2 */
2453                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2454
2455                                 /* blend */
2456                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2457                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2458                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2459                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2460                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2461
2462                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2463
2464                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2465
2466                                 srcp += 4;
2467                                 dstp += 4;
2468                         }, width);                      
2469                         srcp += srcskip;
2470                         dstp += dstskip;
2471                 }
2472                 _mm_empty();
2473         }
2474 }
2475 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2476
2477 /* fast RGB565->RGB565 blending with surface alpha */
2478 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2479 {
2480         unsigned alpha = info->src->alpha;
2481         if(alpha == 128) {
2482                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2483         } else {
2484                 int width = info->d_width;
2485                 int height = info->d_height;
2486                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2487                 int srcskip = info->s_skip >> 1;
2488                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2489                 int dstskip = info->d_skip >> 1;
2490                 alpha >>= 3;    /* downscale alpha to 5 bits */
2491
2492                 while(height--) {
2493                         DUFFS_LOOP4({
2494                                 Uint32 s = *srcp++;
2495                                 Uint32 d = *dstp;
2496                                 /*
2497                                  * shift out the middle component (green) to
2498                                  * the high 16 bits, and process all three RGB
2499                                  * components at the same time.
2500                                  */
2501                                 s = (s | s << 16) & 0x07e0f81f;
2502                                 d = (d | d << 16) & 0x07e0f81f;
2503                                 d += (s - d) * alpha >> 5;
2504                                 d &= 0x07e0f81f;
2505                                 *dstp++ = (Uint16)(d | d >> 16);
2506                         }, width);
2507                         srcp += srcskip;
2508                         dstp += dstskip;
2509                 }
2510         }
2511 }
2512
2513 /* fast RGB555->RGB555 blending with surface alpha */
2514 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2515 {
2516         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2517         if(alpha == 128) {
2518                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2519         } else {
2520                 int width = info->d_width;
2521                 int height = info->d_height;
2522                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2523                 int srcskip = info->s_skip >> 1;
2524                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2525                 int dstskip = info->d_skip >> 1;
2526                 alpha >>= 3;            /* downscale alpha to 5 bits */
2527
2528                 while(height--) {
2529                         DUFFS_LOOP4({
2530                                 Uint32 s = *srcp++;
2531                                 Uint32 d = *dstp;
2532                                 /*
2533                                  * shift out the middle component (green) to
2534                                  * the high 16 bits, and process all three RGB
2535                                  * components at the same time.
2536                                  */
2537                                 s = (s | s << 16) & 0x03e07c1f;
2538                                 d = (d | d << 16) & 0x03e07c1f;
2539                                 d += (s - d) * alpha >> 5;
2540                                 d &= 0x03e07c1f;
2541                                 *dstp++ = (Uint16)(d | d >> 16);
2542                         }, width);
2543                         srcp += srcskip;
2544                         dstp += dstskip;
2545                 }
2546         }
2547 }
2548
2549 /* fast ARGB8888->RGB565 blending with pixel alpha */
2550 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2551 {
2552         int width = info->d_width;
2553         int height = info->d_height;
2554         Uint32 *srcp = (Uint32 *)info->s_pixels;
2555         int srcskip = info->s_skip >> 2;
2556         Uint16 *dstp = (Uint16 *)info->d_pixels;
2557         int dstskip = info->d_skip >> 1;
2558
2559         while(height--) {
2560             DUFFS_LOOP4({
2561                 Uint32 s = *srcp;
2562                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2563                 /* FIXME: Here we special-case opaque alpha since the
2564                    compositioning used (>>8 instead of /255) doesn't handle
2565                    it correctly. Also special-case alpha=0 for speed?
2566                    Benchmark this! */
2567                 if(alpha) {   
2568                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2569                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2570                   } else {
2571                     Uint32 d = *dstp;
2572                     /*
2573                      * convert source and destination to G0RAB65565
2574                      * and blend all components at the same time
2575                      */
2576                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2577                       + (s >> 3 & 0x1f);
2578                     d = (d | d << 16) & 0x07e0f81f;
2579                     d += (s - d) * alpha >> 5;
2580                     d &= 0x07e0f81f;
2581                     *dstp = (Uint16)(d | d >> 16);
2582                   }
2583                 }
2584                 srcp++;
2585                 dstp++;
2586             }, width);
2587             srcp += srcskip;
2588             dstp += dstskip;
2589         }
2590 }
2591
2592 /* fast ARGB8888->RGB555 blending with pixel alpha */
2593 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2594 {
2595         int width = info->d_width;
2596         int height = info->d_height;
2597         Uint32 *srcp = (Uint32 *)info->s_pixels;
2598         int srcskip = info->s_skip >> 2;
2599         Uint16 *dstp = (Uint16 *)info->d_pixels;
2600         int dstskip = info->d_skip >> 1;
2601
2602         while(height--) {
2603             DUFFS_LOOP4({
2604                 unsigned alpha;
2605                 Uint32 s = *srcp;
2606                 alpha = s >> 27; /* downscale alpha to 5 bits */
2607                 /* FIXME: Here we special-case opaque alpha since the
2608                    compositioning used (>>8 instead of /255) doesn't handle
2609                    it correctly. Also special-case alpha=0 for speed?
2610                    Benchmark this! */
2611                 if(alpha) {   
2612                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2613                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2614                   } else {
2615                     Uint32 d = *dstp;
2616                     /*
2617                      * convert source and destination to G0RAB65565
2618                      * and blend all components at the same time
2619                      */
2620                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2621                       + (s >> 3 & 0x1f);
2622                     d = (d | d << 16) & 0x03e07c1f;
2623                     d += (s - d) * alpha >> 5;
2624                     d &= 0x03e07c1f;
2625                     *dstp = (Uint16)(d | d >> 16);
2626                   }
2627                 }
2628                 srcp++;
2629                 dstp++;
2630             }, width);
2631             srcp += srcskip;
2632             dstp += dstskip;
2633         }
2634 }
2635
2636 /* General (slow) N->N blending with per-surface alpha */
2637 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2638 {
2639         int width = info->d_width;
2640         int height = info->d_height;
2641         Uint8 *src = info->s_pixels;
2642         int srcskip = info->s_skip;
2643         Uint8 *dst = info->d_pixels;
2644         int dstskip = info->d_skip;
2645         SDL_PixelFormat *srcfmt = info->src;
2646         SDL_PixelFormat *dstfmt = info->dst;
2647         int srcbpp = srcfmt->BytesPerPixel;
2648         int dstbpp = dstfmt->BytesPerPixel;
2649         unsigned sA = srcfmt->alpha;
2650         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2651
2652         if(sA) {
2653           while ( height-- ) {
2654             DUFFS_LOOP4(
2655             {
2656                 Uint32 Pixel;
2657                 unsigned sR;
2658                 unsigned sG;
2659                 unsigned sB;
2660                 unsigned dR;
2661                 unsigned dG;
2662                 unsigned dB;
2663                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2664                 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2665                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2666                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2667                 src += srcbpp;
2668                 dst += dstbpp;
2669             },
2670             width);
2671             src += srcskip;
2672             dst += dstskip;
2673           }
2674         }
2675 }
2676
2677 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2678 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2679 {
2680         int width = info->d_width;
2681         int height = info->d_height;
2682         Uint8 *src = info->s_pixels;
2683         int srcskip = info->s_skip;
2684         Uint8 *dst = info->d_pixels;
2685         int dstskip = info->d_skip;
2686         SDL_PixelFormat *srcfmt = info->src;
2687         SDL_PixelFormat *dstfmt = info->dst;
2688         Uint32 ckey = srcfmt->colorkey;
2689         int srcbpp = srcfmt->BytesPerPixel;
2690         int dstbpp = dstfmt->BytesPerPixel;
2691         unsigned sA = srcfmt->alpha;
2692         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2693
2694         if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2695             Uint16 *src16 = (Uint16 *)src;
2696             Uint16 *dst16 = (Uint16 *)dst;
2697             sA >>= 3;   /* downscale alpha to 5 bits */
2698             while ( height-- ) {
2699                 DUFFS_LOOP4(
2700                 {
2701                     Uint32 s;
2702                     Uint32 d;
2703                     s = *src16;
2704                     if(sA && s != ckey) {
2705                         d = *dst16;
2706                         s = (s | s << 16) & 0x07e0f81f;
2707                         d = (d | d << 16) & 0x07e0f81f;
2708                         d += (s - d) * sA >> 5;
2709                         d &= 0x07e0f81f;
2710                         *dst16 = (Uint16)(d | d >> 16);
2711                     }
2712                     src16++;
2713                     dst16++;
2714                 },
2715                 width);
2716                 src16 += srcskip / 2;
2717                 dst16 += dstskip / 2;
2718             }
2719             return;
2720         }
2721
2722         while ( height-- ) {
2723             DUFFS_LOOP4(
2724             {
2725                 Uint32 Pixel;
2726                 unsigned sR;
2727                 unsigned sG;
2728                 unsigned sB;
2729                 unsigned dR;
2730                 unsigned dG;
2731                 unsigned dB;
2732                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2733                 if(sA && Pixel != ckey) {
2734                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2735                     DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2736                     ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2737                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2738                 }
2739                 src += srcbpp;
2740                 dst += dstbpp;
2741             },
2742             width);
2743             src += srcskip;
2744             dst += dstskip;
2745         }
2746 }
2747
2748 /* General (slow) N->N blending with pixel alpha */
2749 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2750 {
2751         int width = info->d_width;
2752         int height = info->d_height;
2753         Uint8 *src = info->s_pixels;
2754         int srcskip = info->s_skip;
2755         Uint8 *dst = info->d_pixels;
2756         int dstskip = info->d_skip;
2757         SDL_PixelFormat *srcfmt = info->src;
2758         SDL_PixelFormat *dstfmt = info->dst;
2759
2760         int  srcbpp;
2761         int  dstbpp;
2762
2763         /* Set up some basic variables */
2764         srcbpp = srcfmt->BytesPerPixel;
2765         dstbpp = dstfmt->BytesPerPixel;
2766
2767         /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2768            quite right. for <8bpp source alpha, it gets them very wrong
2769            (check all macros!)
2770            It is unclear whether there is a good general solution that doesn't
2771            need a branch (or a divide). */
2772         while ( height-- ) {
2773             DUFFS_LOOP4(
2774             {
2775                 Uint32 Pixel;
2776                 unsigned sR;
2777                 unsigned sG;
2778                 unsigned sB;
2779                 unsigned dR;
2780                 unsigned dG;
2781                 unsigned dB;
2782                 unsigned sA;
2783                 unsigned dA;
2784                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2785                 if(sA) {
2786                   DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2787                   ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2788                   ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2789                 }
2790                 src += srcbpp;
2791                 dst += dstbpp;
2792             },
2793             width);
2794             src += srcskip;
2795             dst += dstskip;
2796         }
2797 }
2798
2799
2800 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2801 {
2802     SDL_PixelFormat *sf = surface->format;
2803     SDL_PixelFormat *df = surface->map->dst->format;
2804
2805     if(sf->Amask == 0) {
2806         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2807             if(df->BytesPerPixel == 1)
2808                 return BlitNto1SurfaceAlphaKey;
2809             else
2810 #if SDL_ALTIVEC_BLITTERS
2811         if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2812             !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2813             return Blit32to32SurfaceAlphaKeyAltivec;
2814         else
2815 #endif
2816             return BlitNtoNSurfaceAlphaKey;
2817         } else {
2818             /* Per-surface alpha blits */
2819             switch(df->BytesPerPixel) {
2820             case 1:
2821                 return BlitNto1SurfaceAlpha;
2822
2823             case 2:
2824                 if(surface->map->identity) {
2825                     if(df->Gmask == 0x7e0)
2826                     {
2827 #if MMX_ASMBLIT
2828                 if(SDL_HasMMX())
2829                         return Blit565to565SurfaceAlphaMMX;
2830                 else
2831 #endif
2832                         return Blit565to565SurfaceAlpha;
2833                     }
2834                     else if(df->Gmask == 0x3e0)
2835                     {
2836 #if MMX_ASMBLIT
2837                 if(SDL_HasMMX())
2838                         return Blit555to555SurfaceAlphaMMX;
2839                 else
2840 #endif
2841                         return Blit555to555SurfaceAlpha;
2842                     }
2843                 }
2844                 return BlitNtoNSurfaceAlpha;
2845
2846             case 4:
2847                 if(sf->Rmask == df->Rmask
2848                    && sf->Gmask == df->Gmask
2849                    && sf->Bmask == df->Bmask
2850                    && sf->BytesPerPixel == 4)
2851                 {
2852 #if MMX_ASMBLIT
2853                         if(sf->Rshift % 8 == 0
2854                            && sf->Gshift % 8 == 0
2855                            && sf->Bshift % 8 == 0
2856                            && SDL_HasMMX())
2857                             return BlitRGBtoRGBSurfaceAlphaMMX;
2858 #endif
2859 #ifdef __ARM_NEON__
2860                         if(sf->Rshift % 8 == 0
2861                            && sf->Gshift % 8 == 0
2862                            && sf->Bshift % 8 == 0)
2863                         {
2864                                 return BlitARGBtoXRGBalphaS_neon;
2865                         }
2866 #endif
2867                         if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2868                         {
2869 #if SDL_ALTIVEC_BLITTERS
2870                                 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2871                                         && SDL_HasAltiVec())
2872                                         return BlitRGBtoRGBSurfaceAlphaAltivec;
2873 #endif
2874                                 return BlitRGBtoRGBSurfaceAlpha;
2875                         }
2876                 }
2877 #ifdef __ARM_NEON__
2878                 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2879                     && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2880                 {
2881                         return BlitABGRtoXRGBalphaS_neon;
2882                 }
2883 #endif
2884 #if SDL_ALTIVEC_BLITTERS
2885                 if((sf->BytesPerPixel == 4) &&
2886                    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2887                         return Blit32to32SurfaceAlphaAltivec;
2888                 else
2889 #endif
2890                         return BlitNtoNSurfaceAlpha;
2891
2892             case 3:
2893             default:
2894                 return BlitNtoNSurfaceAlpha;
2895             }
2896         }
2897     } else {
2898         /* Per-pixel alpha blits */
2899         switch(df->BytesPerPixel) {
2900         case 1:
2901             return BlitNto1PixelAlpha;
2902
2903         case 2:
2904 #if SDL_ALTIVEC_BLITTERS
2905         if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2906            df->Gmask == 0x7e0 &&
2907            df->Bmask == 0x1f && SDL_HasAltiVec())
2908             return Blit32to565PixelAlphaAltivec;
2909         else
2910 #endif
2911 #ifdef __ARM_NEON__
2912             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2913                && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2914                 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2915                     return BlitARGBtoRGB565alpha_neon;
2916                 else
2917                     return BlitABGRtoRGB565alpha_neon;
2918             }
2919             else
2920 #endif
2921             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2922                && sf->Gmask == 0xff00
2923                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2924                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2925                 if(df->Gmask == 0x7e0)
2926                     return BlitARGBto565PixelAlpha;
2927                 else if(df->Gmask == 0x3e0)
2928                     return BlitARGBto555PixelAlpha;
2929             }
2930             return BlitNtoNPixelAlpha;
2931
2932         case 4:
2933             if(sf->Rmask == df->Rmask
2934                && sf->Gmask == df->Gmask
2935                && sf->Bmask == df->Bmask
2936                && sf->BytesPerPixel == 4)
2937             {
2938 #if MMX_ASMBLIT
2939                 if(sf->Rshift % 8 == 0
2940                    && sf->Gshift % 8 == 0
2941                    && sf->Bshift % 8 == 0
2942                    && sf->Ashift % 8 == 0
2943                    && sf->Aloss == 0)
2944                 {
2945                         if(SDL_Has3DNow())
2946                                 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2947                         if(SDL_HasMMX())
2948                                 return BlitRGBtoRGBPixelAlphaMMX;
2949                 }
2950 #endif
2951 #ifdef __ARM_NEON__
2952                 if(sf->Rshift % 8 == 0
2953                    && sf->Gshift % 8 == 0
2954                    && sf->Bshift % 8 == 0
2955                    && sf->Ashift % 8 == 0)
2956                 {
2957                         return BlitARGBtoXRGBalpha_neon;
2958                 }
2959 #endif
2960                 if(sf->Amask == 0xff000000)
2961                 {
2962 #if SDL_ALTIVEC_BLITTERS
2963                         if(!(surface->map->dst->flags & SDL_HWSURFACE)
2964                                 && SDL_HasAltiVec())
2965                                 return BlitRGBtoRGBPixelAlphaAltivec;
2966 #endif
2967                         return BlitRGBtoRGBPixelAlpha;
2968                 }
2969             }
2970 #ifdef __ARM_NEON__
2971             if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2972                 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2973                 && sf->Amask == 0xff000000)
2974             {
2975                 return BlitABGRtoXRGBalpha_neon;
2976             }
2977 #endif
2978 #if SDL_ALTIVEC_BLITTERS
2979             if (sf->Amask && sf->BytesPerPixel == 4 &&
2980                 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2981                 return Blit32to32PixelAlphaAltivec;
2982             else
2983 #endif
2984                 return BlitNtoNPixelAlpha;
2985
2986         case 3:
2987         default:
2988             return BlitNtoNPixelAlpha;
2989         }
2990     }
2991 }
2992