2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "SDL_config.h"
24 #include "SDL_video.h"
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
/* Choose the MMX blitter flavour: GCC_ASMBLIT for GNU C targeting
   i386/x86_64, MSVC_ASMBLIT for Visual C targeting x86 (_M_IX86).  Both
   branches also define MMX_ASMBLIT.  For VC6 and older (_MSC_VER <= 1200)
   HAVE_MMINTRIN_H is only set when _mm_free is defined.
   NOTE(review): several #endif / guard lines of this nest are not visible in
   this excerpt (the MSVC defines are presumably guarded by HAVE_MMINTRIN_H)
   -- confirm the exact nesting against the full file. */
33 #if SDL_ASSEMBLY_ROUTINES
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
37 # elif defined(_MSC_VER) && defined(_M_IX86)
38 # if (_MSC_VER <= 1200)
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
51 #endif /* SDL_ASSEMBLY_ROUTINES */
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
62 /* Functions to perform alpha blended blitting */
66 /* NEON optimized blitter callers */
/* make_neon_caller(name, neon_name): declares the external row routine
   neon_name(dst, src, count) and defines a static blitter `name` that calls
   it once per row with count = info->d_width.  Source rows are treated as
   4 bytes per pixel and destination rows as info->dst->BytesPerPixel bytes
   per pixel (see srcstride / dststride).
   NOTE(review): the lines that advance src/dst by the strides are not
   visible in this excerpt. */
67 #define make_neon_caller(name, neon_name) \
68 extern void neon_name(void *dst, const void *src, int count); \
69 static void name(SDL_BlitInfo *info) \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
75 int dstBpp = info->dst->BytesPerPixel; \
76 int srcstride = width * 4 + info->s_skip; \
77 int dststride = width * dstBpp + info->d_skip; \
79 while ( height-- ) { \
80 neon_name(dst, src, width); \
/* make_neon_callerS(name, neon_name): like make_neon_caller, but the external
   row routine takes a fourth argument, the per-surface alpha read from
   info->src->alpha.  Both source and destination rows advance by
   width * 4 bytes plus the respective skip after each call. */
86 #define make_neon_callerS(name, neon_name) \
87 extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
88 static void name(SDL_BlitInfo *info) \
90 int width = info->d_width; \
91 int height = info->d_height; \
92 Uint8 *src = info->s_pixels; \
93 Uint8 *dst = info->d_pixels; \
94 int srcskip = info->s_skip; \
95 int dstskip = info->d_skip; \
96 unsigned alpha = info->src->alpha;\
98 while ( height-- ) { \
99 neon_name(dst, src, width, alpha); \
100 src += width * 4 + srcskip; \
101 dst += width * 4 + dstskip; \
/* Instantiate the NEON blitters: four per-pixel-alpha callers built with
   make_neon_caller and two surface-alpha callers (the ...S variants) built
   with make_neon_callerS. */
105 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
106 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
107 make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
108 make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
109 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
110 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
112 #endif /* __ARM_NEON__ */
114 /* N->1 blending with per-surface alpha */
/* For each pixel: split the source pixel into sR/sG/sB (DISEMBLE_RGB), fetch
   the current destination colour from dstfmt->palette using the index stored
   in *dst, blend with the constant A = srcfmt->alpha (ALPHA_BLEND), then
   store an 8-bit value packed from the blended channels -- written directly
   when info->table (palmap) is NULL, otherwise looked up through palmap.
   NOTE(review): only the red term of the packing expression is visible in
   this excerpt (top three bits of dR moved to bits 7..5); presumably the
   layout is 3-3-2 RGB -- confirm against the full file. */
115 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
117 int width = info->d_width;
118 int height = info->d_height;
119 Uint8 *src = info->s_pixels;
120 int srcskip = info->s_skip;
121 Uint8 *dst = info->d_pixels;
122 int dstskip = info->d_skip;
123 Uint8 *palmap = info->table;
124 SDL_PixelFormat *srcfmt = info->src;
125 SDL_PixelFormat *dstfmt = info->dst;
126 int srcbpp = srcfmt->BytesPerPixel;
128 const unsigned A = srcfmt->alpha;
140 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
141 dR = dstfmt->palette->colors[*dst].r;
142 dG = dstfmt->palette->colors[*dst].g;
143 dB = dstfmt->palette->colors[*dst].b;
144 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
148 /* Pack RGB into 8bit pixel */
149 if ( palmap == NULL ) {
150 *dst =((dR>>5)<<(3+2))|
154 *dst = palmap[((dR>>5)<<(3+2))|
167 /* N->1 blending with pixel alpha */
/* Same flow as the per-surface variant, except that the blend factor sA is
   extracted from every source pixel together with its colour
   (DISEMBLE_RGBA).  The destination colour comes from dstfmt->palette via
   the index in *dst, and the blended result is packed into an 8-bit value
   that is written directly when info->table (palmap) is NULL and remapped
   through palmap otherwise.
   NOTE(review): only the red term of the packing expression is visible in
   this excerpt. */
168 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
170 int width = info->d_width;
171 int height = info->d_height;
172 Uint8 *src = info->s_pixels;
173 int srcskip = info->s_skip;
174 Uint8 *dst = info->d_pixels;
175 int dstskip = info->d_skip;
176 Uint8 *palmap = info->table;
177 SDL_PixelFormat *srcfmt = info->src;
178 SDL_PixelFormat *dstfmt = info->dst;
179 int srcbpp = srcfmt->BytesPerPixel;
181 /* FIXME: fix alpha bit field expansion here too? */
193 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
194 dR = dstfmt->palette->colors[*dst].r;
195 dG = dstfmt->palette->colors[*dst].g;
196 dB = dstfmt->palette->colors[*dst].b;
197 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
201 /* Pack RGB into 8bit pixel */
202 if ( palmap == NULL ) {
203 *dst =((dR>>5)<<(3+2))|
207 *dst = palmap[((dR>>5)<<(3+2))|
220 /* colorkeyed N->1 blending with per-surface alpha */
/* Per-surface alpha blend onto a palettized destination that honours the
   source colour key: the blend and the 8-bit store only happen when the raw
   source Pixel differs from ckey = srcfmt->colorkey.  The blend factor is
   the constant A = srcfmt->alpha; the result is written directly when
   info->table (palmap) is NULL and remapped through palmap otherwise.
   NOTE(review): A is declared `const int` here but `const unsigned` in
   BlitNto1SurfaceAlpha -- confirm the difference is intentional. */
221 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
223 int width = info->d_width;
224 int height = info->d_height;
225 Uint8 *src = info->s_pixels;
226 int srcskip = info->s_skip;
227 Uint8 *dst = info->d_pixels;
228 int dstskip = info->d_skip;
229 Uint8 *palmap = info->table;
230 SDL_PixelFormat *srcfmt = info->src;
231 SDL_PixelFormat *dstfmt = info->dst;
232 int srcbpp = srcfmt->BytesPerPixel;
233 Uint32 ckey = srcfmt->colorkey;
235 const int A = srcfmt->alpha;
247 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
248 if ( Pixel != ckey ) {
249 dR = dstfmt->palette->colors[*dst].r;
250 dG = dstfmt->palette->colors[*dst].g;
251 dB = dstfmt->palette->colors[*dst].b;
252 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
256 /* Pack RGB into 8bit pixel */
257 if ( palmap == NULL ) {
258 *dst =((dR>>5)<<(3+2))|
262 *dst = palmap[((dR>>5)<<(3+2))|
277 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/* GCC inline-asm-macro version.  A 50% blend is an average, computed without
   unpacking channels: ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1 gives the
   per-channel average with the low bits dropped, and (s & d & 0x00010101)
   adds back the carry when both low bits are set.  The destination alpha
   mask (info->dst->Amask) is ORed into every result.  A scalar form handles
   a single pixel; the MMX sequence handles two pixels per 64-bit register.
   s_skip / d_skip are byte counts, hence the >> 2 to get Uint32 units.
   Register use: mm4 = 0x00fefefe mask, mm3 = 0x00010101 mask,
   mm7 = destination alpha mask (duplicated in both halves). */
278 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
280 int width = info->d_width;
281 int height = info->d_height;
282 Uint32 *srcp = (Uint32 *)info->s_pixels;
283 int srcskip = info->s_skip >> 2;
284 Uint32 *dstp = (Uint32 *)info->d_pixels;
285 int dstskip = info->d_skip >> 2;
286 Uint32 dalpha = info->dst->Amask;
289 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
290 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
291 load = 0x0001010100010101ULL;/* !alpha128 mask */
292 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
293 movd_m2r(dalpha, mm7); /* dst alpha mask */
294 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
300 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
301 + (s & d & 0x00010101)) | dalpha;
303 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
304 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
306 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
307 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
309 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
310 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
311 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
312 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
313 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
314 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
315 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
317 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
318 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
328 /* fast RGB888->(A)RGB888 blending with surface alpha */
/* GCC inline-asm-macro version.  When alpha == 128 and the destination
   R|G|B masks together equal 0x00FFFFFF the work is handed to
   BlitRGBtoRGBSurfaceAlpha128MMX.  Otherwise every channel is computed in
   16-bit lanes as dst + (((src - dst) * alpha) >> 8), the lanes are packed
   back to bytes, and df->Amask is ORed into every result.
   Register use: mm5 = 0 (used for unpacking), mm4 = alpha multiplier per
   lane with the non-colour lane cleared, mm7 = destination alpha mask in
   both halves.  Note that the local `alpha` is reused at the mask-building
   step to hold the combined channel mask. */
329 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
331 SDL_PixelFormat* df = info->dst;
332 unsigned alpha = info->src->alpha;
334 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
335 /* only call a128 version when R,G,B occupy lower bits */
336 BlitRGBtoRGBSurfaceAlpha128MMX(info);
338 int width = info->d_width;
339 int height = info->d_height;
340 Uint32 *srcp = (Uint32 *)info->s_pixels;
341 int srcskip = info->s_skip >> 2;
342 Uint32 *dstp = (Uint32 *)info->d_pixels;
343 int dstskip = info->d_skip >> 2;
345 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
346 /* form the alpha mult */
347 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
348 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
349 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
350 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
351 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
352 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
353 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
354 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
355 movd_m2r(df->Amask, mm7); /* dst alpha mask */
356 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
360 /* One Pixel Blend */
361 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
362 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
363 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
364 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
366 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
367 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
368 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
369 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
371 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
372 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
373 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
377 /* Two Pixels Blend */
378 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
379 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
380 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
381 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
383 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
384 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
385 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
386 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
388 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
389 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
390 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
391 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
393 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
394 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
395 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
396 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
398 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
399 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
401 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
413 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/* GCC inline-asm-macro version.  Setup: mm6 = 0 (used for unpacking),
   mm7 = multiplier mask with the alpha lane cleared, mm0 = colour-channel
   mask, mm3 = complement of the channel mask, mm5 = sf->Ashift.
   Per pixel, alpha = *srcp & amask.  A fully opaque pixel (alpha == amask)
   copies the source colour bits and keeps the destination's remaining bits;
   other pixels are blended per channel in 16-bit lanes as
   dst + (((src - dst) * alpha) >> 8), with the alpha lane multiplier zeroed
   so the destination alpha survives the add.
   NOTE(review): the first branch of the alpha test and the asm template
   strings are not visible in this excerpt. */
414 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
416 int width = info->d_width;
417 int height = info->d_height;
418 Uint32 *srcp = (Uint32 *)info->s_pixels;
419 int srcskip = info->s_skip >> 2;
420 Uint32 *dstp = (Uint32 *)info->d_pixels;
421 int dstskip = info->d_skip >> 2;
422 SDL_PixelFormat* sf = info->src;
423 Uint32 amask = sf->Amask;
425 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
426 /* form multiplication mask */
427 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
428 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
429 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
430 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
431 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
432 /* form channel masks */
433 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
434 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
435 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
436 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
437 /* get alpha channel shift */
438 __asm__ __volatile__ (
440 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
444 Uint32 alpha = *srcp & amask;
445 /* FIXME: Here we special-case opaque alpha since the
446 compositioning used (>>8 instead of /255) doesn't handle
447 it correctly. Also special-case alpha=0 for speed?
451 } else if(alpha == amask) {
452 /* opaque alpha -- copy RGB, keep dst alpha */
453 /* using MMX here to free up regular registers for other things */
454 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
455 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
456 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
457 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
458 por_r2r(mm1, mm2); /* src | dst -> mm2 */
459 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
461 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
462 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
464 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
465 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
467 __asm__ __volatile__ (
469 : : "r" (alpha) ); /* 0000A000 -> mm4 */
470 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
471 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
472 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
473 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
476 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
477 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
478 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
479 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
481 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
482 movd_r2m(mm2, *dstp);/* mm2 -> dst */
492 /* End GCC_ASMBLIT */
495 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/* MSVC intrinsics version of the 50% blend.  The average is computed without
   unpacking channels: ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1 plus the
   shared low bits (s & d & 0x00010101), then ORed with the destination alpha
   mask (info->dst->Amask).  A scalar form handles a single pixel; the
   `for (n >>= 1; ...)` loop handles pixel pairs through __m64 values.
   hmask = 0x00fefefe in both halves, lmask = 0x00010101 in both halves,
   dsta = destination alpha mask in both halves. */
496 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
498 int width = info->d_width;
499 int height = info->d_height;
500 Uint32 *srcp = (Uint32 *)info->s_pixels;
501 int srcskip = info->s_skip >> 2;
502 Uint32 *dstp = (Uint32 *)info->d_pixels;
503 int dstskip = info->d_skip >> 2;
504 Uint32 dalpha = info->dst->Amask;
506 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
508 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
509 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
510 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
517 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
518 + (s & d & 0x00010101)) | dalpha;
522 for (n >>= 1; n > 0; --n) {
523 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
524 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
526 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
527 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
529 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
530 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
531 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
532 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
534 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
535 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
536 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
537 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
539 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
550 /* fast RGB888->(A)RGB888 blending with surface alpha */
/* MSVC intrinsics version.  When alpha == 128 and the destination R|G|B
   masks together equal 0x00FFFFFF the work is handed to
   BlitRGBtoRGBSurfaceAlpha128MMX.  Otherwise every channel is computed in
   16-bit lanes as dst + (((src - dst) * alpha) >> 8), packed back to bytes,
   and ORed with the destination alpha mask (df->Amask).
   mm_alpha holds the alpha multiplier per lane with the non-colour lane
   cleared; mm_zero is used for byte->word unpacking.
   NOTE(review): the value given to chanmask at its declaration is never read
   in the visible code -- the dispatch test recomputes the OR of the masks and
   chanmask is overwritten before its only use. */
551 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
553 SDL_PixelFormat* df = info->dst;
554 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
555 unsigned alpha = info->src->alpha;
557 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
558 /* only call a128 version when R,G,B occupy lower bits */
559 BlitRGBtoRGBSurfaceAlpha128MMX(info);
561 int width = info->d_width;
562 int height = info->d_height;
563 Uint32 *srcp = (Uint32 *)info->s_pixels;
564 int srcskip = info->s_skip >> 2;
565 Uint32 *dstp = (Uint32 *)info->d_pixels;
566 int dstskip = info->d_skip >> 2;
567 Uint32 dalpha = df->Amask;
570 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
572 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
573 /* form the alpha mult */
574 amult = alpha | (alpha << 8);
575 amult = amult | (amult << 16);
576 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
577 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
578 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
579 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
580 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
585 /* One Pixel Blend */
586 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
587 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
589 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
590 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
592 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
593 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
594 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
595 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
597 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
598 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
599 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
607 for (n >>= 1; n > 0; --n) {
608 /* Two Pixels Blend */
609 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
610 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
611 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
612 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
614 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
615 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
616 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
617 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
619 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
620 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
621 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
622 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
624 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
625 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
626 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
627 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
629 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
630 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
632 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
644 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/* MSVC intrinsics version.  Per pixel, alpha = *srcp & amask.  A fully opaque
   pixel (alpha == amask) copies the source colour bits (chanmask) and keeps
   the destination's remaining bits; other pixels are blended per channel in
   16-bit lanes as dst + (((src - dst) * alpha) >> 8).
   NOTE(review): the first branch of the alpha test is not visible in this
   excerpt, and the `i64` literal suffix is a Microsoft extension. */
645 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
647 int width = info->d_width;
648 int height = info->d_height;
649 Uint32 *srcp = (Uint32 *)info->s_pixels;
650 int srcskip = info->s_skip >> 2;
651 Uint32 *dstp = (Uint32 *)info->d_pixels;
652 int dstskip = info->d_skip >> 2;
653 SDL_PixelFormat* sf = info->src;
654 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
655 Uint32 amask = sf->Amask;
656 Uint32 ashift = sf->Ashift;
659 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
661 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* After byte->word unpacking each channel occupies 16 bits, so the alpha
   channel's lane starts at bit ashift * 2; clear that lane in the mask so the
   alpha lane gets a zero multiplier. */
662 multmask = ~(0xFFFFi64 << (ashift * 2));
663 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
667 Uint32 alpha = *srcp & amask;
670 } else if (alpha == amask) {
671 /* opaque alpha -- copy RGB, keep dst alpha */
672 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
674 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
675 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
677 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
678 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
680 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
681 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
682 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
683 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
684 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
687 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
688 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
689 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
690 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
691 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
693 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
703 /* End MSVC_ASMBLIT */
705 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
707 #if SDL_ALTIVEC_BLITTERS
709 #pragma altivec_model on
/* Vector literal helpers.  Two spellings are provided: a parenthesised
   initializer list for Mac OS X with GCC older than 4, and a brace
   initializer list.
   NOTE(review): the #else / #endif separating the two pairs of definitions is
   not visible in this excerpt -- presumably the brace form is the fallback. */
716 #if (defined(__MACOSX__) && (__GNUC__ < 4))
717 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
718 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
719 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
720 (vector unsigned short) ( a,b,c,d,e,f,g,h )
722 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
723 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
724 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
725 (vector unsigned short) { a,b,c,d,e,f,g,h }
/* UNALIGNED_PTR(x) yields the low four bits of the address, i.e. non-zero
   when x is not 16-byte aligned.  VECPRINT(msg, v) is a debugging aid that
   prints a vector as four 32-bit hexadecimal words via printf. */
728 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
729 #define VECPRINT(msg, v) do { \
730 vector unsigned int tmpvec = (vector unsigned int)(v); \
731 unsigned int *vp = (unsigned int *)&tmpvec; \
732 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
735 /* the permutation vector that takes the high bytes out of all the appropriate shorts
736 (vector unsigned char)(
737 0x00, 0x10, 0x02, 0x12,
738 0x04, 0x14, 0x06, 0x16,
739 0x08, 0x18, 0x0A, 0x1A,
740 0x0C, 0x1C, 0x0E, 0x1E );
742 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
743 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
744 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
745 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
747 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
750 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
751 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
752 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
753 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
754 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
755 /* valpha2 is 255-alpha */ \
756 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
757 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
758 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
759 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
760 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
761 /* add source and dest */ \
762 vtemp1 = vec_add(vtemp1, vtemp3); \
763 vtemp2 = vec_add(vtemp2, vtemp4); \
764 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8); its high byte then approximates vtemp1 / 255 */ \
765 vtemp1 = vec_add(vtemp1, v1_16); \
766 vtemp3 = vec_sr(vtemp1, v8_16); \
767 vtemp1 = vec_add(vtemp1, vtemp3); \
768 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8); same rounding step for the odd bytes */ \
769 vtemp2 = vec_add(vtemp2, v1_16); \
770 vtemp4 = vec_sr(vtemp2, v8_16); \
771 vtemp2 = vec_add(vtemp2, vtemp4); \
772 /* take the high byte of every 16-bit result (the >>8) and interleave them back into ARGBARGBARGBARGB */ \
773 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
776 /* Calculate the permute vector used for 32->32 swizzling */
/* Builds a byte-selector vector for vec_perm that reorders the four bytes of
   every 32-bit pixel from srcfmt's channel layout to dstfmt's.  RESHIFT turns
   a channel's bit shift into a byte index (3 - shift / 8); each index is
   placed at the destination channel's shift, the combined 32-bit selector is
   splatted across the vector, and the `plus` vector adds 0, 4, 8 and 12 so
   each pixel selects from its own four bytes.  An alpha selector of 0x10
   refers to the second vec_perm operand rather than the source pixel.
   NOTE(review): callers in this file pass NULL for one of the formats;
   presumably a NULL format is replaced by default_pixel_format (masks
   0x00FF0000 / 0x0000FF00 / 0x000000FF / 0xFF000000) -- the guarding tests,
   the declaration of amask and the return statement are not visible in this
   excerpt. */
777 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
778 const SDL_PixelFormat *dstfmt)
781 * We have to assume that the bits that aren't used by other
782 * colors is alpha, and it's one complete byte, since some formats
783 * leave alpha with a zero mask, but we should still swizzle the bits.
786 const static struct SDL_PixelFormat default_pixel_format = {
790 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
793 srcfmt = &default_pixel_format;
796 dstfmt = &default_pixel_format;
798 const vector unsigned char plus = VECUINT8_LITERAL
799 ( 0x00, 0x00, 0x00, 0x00,
800 0x04, 0x04, 0x04, 0x04,
801 0x08, 0x08, 0x08, 0x08,
802 0x0C, 0x0C, 0x0C, 0x0C );
803 vector unsigned char vswiz;
804 vector unsigned int srcvec;
805 #define RESHIFT(X) (3 - ((X) >> 3))
806 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
807 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
808 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
810 /* Use zero for alpha if either surface doesn't have alpha */
812 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
814 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
817 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
818 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
/* Blends 32-bit source pixels carrying per-pixel alpha onto a 16-bit 5-6-5
   destination.  Leading pixels are handled one at a time by ONE_PIXEL_BLEND
   until dst is 16-byte aligned; the main loop then loads eight source pixels
   (two vectors, realigned with valigner and swizzled with vpermute), expands
   eight destination pixels from 565 into two byte-per-channel vectors, blends
   each half with VEC_MULTIPLY_ALPHA, and repacks the result to 565 before
   storing it.  The remaining width % 8 pixels go through ONE_PIXEL_BLEND
   again.
   NOTE(review): loop headers, pointer advances and several comment
   delimiters are not visible in this excerpt. */
822 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
824 int height = info->d_height;
825 Uint8 *src = (Uint8 *)info->s_pixels;
826 int srcskip = info->s_skip;
827 Uint8 *dst = (Uint8 *)info->d_pixels;
828 int dstskip = info->d_skip;
829 SDL_PixelFormat *srcfmt = info->src;
831 vector unsigned char v0 = vec_splat_u8(0);
832 vector unsigned short v8_16 = vec_splat_u16(8);
833 vector unsigned short v1_16 = vec_splat_u16(1);
834 vector unsigned short v2_16 = vec_splat_u16(2);
835 vector unsigned short v3_16 = vec_splat_u16(3);
836 vector unsigned int v8_32 = vec_splat_u32(8);
837 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
838 vector unsigned short v3f = VECUINT16_LITERAL(
839 0x003f, 0x003f, 0x003f, 0x003f,
840 0x003f, 0x003f, 0x003f, 0x003f);
841 vector unsigned short vfc = VECUINT16_LITERAL(
842 0x00fc, 0x00fc, 0x00fc, 0x00fc,
843 0x00fc, 0x00fc, 0x00fc, 0x00fc);
846 0x10 - 0x1f is the alpha
847 0x00 - 0x0e evens are the red
848 0x01 - 0x0f odds are zero
850 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
851 0x10, 0x00, 0x01, 0x01,
852 0x10, 0x02, 0x01, 0x01,
853 0x10, 0x04, 0x01, 0x01,
854 0x10, 0x06, 0x01, 0x01
856 vector unsigned char vredalpha2 = (vector unsigned char)(
857 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
860 0x00 - 0x0f is ARxx ARxx ARxx ARxx
861 0x11 - 0x0f odds are blue
863 vector unsigned char vblue1 = VECUINT8_LITERAL(
864 0x00, 0x01, 0x02, 0x11,
865 0x04, 0x05, 0x06, 0x13,
866 0x08, 0x09, 0x0a, 0x15,
867 0x0c, 0x0d, 0x0e, 0x17
869 vector unsigned char vblue2 = (vector unsigned char)(
870 vec_add((vector unsigned int)vblue1, v8_32)
873 0x00 - 0x0f is ARxB ARxB ARxB ARxB
874 0x10 - 0x0e evens are green
876 vector unsigned char vgreen1 = VECUINT8_LITERAL(
877 0x00, 0x01, 0x10, 0x03,
878 0x04, 0x05, 0x12, 0x07,
879 0x08, 0x09, 0x14, 0x0b,
880 0x0c, 0x0d, 0x16, 0x0f
882 vector unsigned char vgreen2 = (vector unsigned char)(
883 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
885 vector unsigned char vgmerge = VECUINT8_LITERAL(
886 0x00, 0x02, 0x00, 0x06,
887 0x00, 0x0a, 0x00, 0x0e,
888 0x00, 0x12, 0x00, 0x16,
889 0x00, 0x1a, 0x00, 0x1e);
890 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
891 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
892 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
894 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
895 vf800 = vec_sl(vf800, vec_splat_u16(8));
899 vector unsigned char valigner;
900 vector unsigned char vsrc;
901 vector unsigned char voverflow;
902 int width = info->d_width;
904 #define ONE_PIXEL_BLEND(condition, widthvar) \
905 while (condition) { \
907 unsigned sR, sG, sB, dR, dG, dB, sA; \
908 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
910 unsigned short dstpixel = *((unsigned short *)dst); \
911 dR = (dstpixel >> 8) & 0xf8; \
912 dG = (dstpixel >> 3) & 0xfc; \
913 dB = (dstpixel << 3) & 0xf8; \
914 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
915 *((unsigned short *)dst) = ( \
916 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
923 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
924 extrawidth = (width % 8);
925 valigner = VEC_ALIGNER(src);
926 vsrc = (vector unsigned char)vec_ld(0, src);
929 vector unsigned char valpha;
930 vector unsigned char vsrc1, vsrc2;
931 vector unsigned char vdst1, vdst2;
932 vector unsigned short vR, vG, vB;
933 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
935 /* Load 8 pixels from src as ARGB */
936 voverflow = (vector unsigned char)vec_ld(15, src);
937 vsrc = vec_perm(vsrc, voverflow, valigner);
938 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
940 vsrc = (vector unsigned char)vec_ld(15, src);
941 voverflow = vec_perm(voverflow, vsrc, valigner);
942 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
945 /* Load 8 pixels from dst as XRGB */
946 voverflow = vec_ld(0, dst);
947 vR = vec_and((vector unsigned short)voverflow, vf800);
948 vB = vec_sl((vector unsigned short)voverflow, v3_16);
949 vG = vec_sl(vB, v2_16);
950 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
951 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
952 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
953 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
954 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
955 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
957 /* Alpha blend 8 pixels as ARGB */
958 valpha = vec_perm(vsrc1, v0, valphaPermute);
959 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
960 valpha = vec_perm(vsrc2, v0, valphaPermute);
961 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
963 /* Convert 8 pixels to 565 */
964 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
965 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
966 vgpixel = vec_and(vgpixel, vfc);
967 vgpixel = vec_sl(vgpixel, v3_16);
968 vrpixel = vec_sl(vpixel, v1_16);
969 vrpixel = vec_and(vrpixel, vf800);
970 vbpixel = vec_and(vpixel, v3f);
971 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
972 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
975 vec_st(vdst1, 0, dst);
980 ONE_PIXEL_BLEND((extrawidth), extrawidth);
981 #undef ONE_PIXEL_BLEND
/* 32-bit to 32-bit blend with a constant surface alpha (info->src->alpha)
   and a source colour key.  In the scalar path (ONE_PIXEL_BLEND) a pixel is
   blended only when sA is non-zero and the pixel differs from ckey.  In the
   vector path the source is masked with rgbmask and compared with ckey to
   build vsel; after blending with VEC_MULTIPLY_ALPHA and forcing the alpha
   bytes on, vec_sel restores the original destination wherever vsel is set.
   Leading pixels are processed one at a time until dstp is 16-byte aligned,
   and the trailing width % 4 pixels are processed one at a time as well.
   NOTE(review): `alpha` and `sA` both read the same field; loop headers and
   pointer advances are not visible in this excerpt. */
987 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
989 unsigned alpha = info->src->alpha;
990 int height = info->d_height;
991 Uint32 *srcp = (Uint32 *)info->s_pixels;
992 int srcskip = info->s_skip >> 2;
993 Uint32 *dstp = (Uint32 *)info->d_pixels;
994 int dstskip = info->d_skip >> 2;
995 SDL_PixelFormat *srcfmt = info->src;
996 SDL_PixelFormat *dstfmt = info->dst;
997 unsigned sA = srcfmt->alpha;
998 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
999 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1000 Uint32 ckey = info->src->colorkey;
1001 vector unsigned char mergePermute;
1002 vector unsigned char vsrcPermute;
1003 vector unsigned char vdstPermute;
1004 vector unsigned char vsdstPermute;
1005 vector unsigned char valpha;
1006 vector unsigned char valphamask;
1007 vector unsigned char vbits;
1008 vector unsigned char v0;
1009 vector unsigned short v1;
1010 vector unsigned short v8;
1011 vector unsigned int vckey;
1012 vector unsigned int vrgbmask;
1014 mergePermute = VEC_MERGE_PERMUTE();
1015 v0 = vec_splat_u8(0);
1016 v1 = vec_splat_u16(1);
1017 v8 = vec_splat_u16(8);
1019 /* set the alpha to 255 on the destination surf */
1020 valphamask = VEC_ALPHA_MASK();
1022 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1023 vdstPermute = calc_swizzle32(NULL, dstfmt);
1024 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1026 /* set a vector full of alpha and 255-alpha */
1027 ((unsigned char *)&valpha)[0] = alpha;
1028 valpha = vec_splat(valpha, 0);
1029 vbits = (vector unsigned char)vec_splat_s8(-1);
1032 ((unsigned int *)(char*)&vckey)[0] = ckey;
1033 vckey = vec_splat(vckey, 0);
1034 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1035 vrgbmask = vec_splat(vrgbmask, 0);
1038 int width = info->d_width;
1039 #define ONE_PIXEL_BLEND(condition, widthvar) \
1040 while (condition) { \
1042 unsigned sR, sG, sB, dR, dG, dB; \
1043 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1044 if(sA && Pixel != ckey) { \
1045 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1046 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1047 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1048 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1054 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1056 int extrawidth = (width % 4);
1057 vector unsigned char valigner = VEC_ALIGNER(srcp);
1058 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1059 width -= extrawidth;
1061 vector unsigned char vsel;
1062 vector unsigned char voverflow;
1063 vector unsigned char vd;
1064 vector unsigned char vd_orig;
1067 voverflow = (vector unsigned char)vec_ld(15, srcp);
1068 vs = vec_perm(vs, voverflow, valigner);
1070 /* vsel is set for items that match the key */
1071 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1072 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1074 /* permute to source format */
1075 vs = vec_perm(vs, valpha, vsrcPermute);
1078 vd = (vector unsigned char)vec_ld(0, dstp);
1079 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1081 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1083 /* set the alpha channel to full on */
1084 vd = vec_or(vd, valphamask);
1086 /* mask out color key */
1087 vd = vec_sel(vd, vd_orig, vsel);
1089 /* permute to dest format */
1090 vd = vec_perm(vd, vbits, vdstPermute);
1093 vec_st((vector unsigned int)vd, 0, dstp);
1100 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1102 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: 32-bit source with per-pixel alpha onto a 32-bit
 * destination. Permute vectors obtained from calc_swizzle32() are applied to
 * the source and destination pixels, presumably to reconcile differing
 * channel layouts.
 * Each row is handled in three passes that are visible below: a scalar
 * ONE_PIXEL_BLEND pass while dstp is unaligned and pixels remain, a vector
 * pass, and a scalar ONE_PIXEL_BLEND pass for the (width % 4) leftover pixels.
 * NOTE(review): this listing does not show every line of the function
 * (braces, the vector loop header and pointer advances are not visible);
 * the comments here describe only the statements that are shown.
 */
1110 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1112 int width = info->d_width;
1113 int height = info->d_height;
1114 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1115 int srcskip = info->s_skip >> 2;
1116 Uint32 *dstp = (Uint32 *)info->d_pixels;
1117 int dstskip = info->d_skip >> 2;
1118 SDL_PixelFormat *srcfmt = info->src;
1119 SDL_PixelFormat *dstfmt = info->dst;
1120 vector unsigned char mergePermute;
1121 vector unsigned char valphaPermute;
1122 vector unsigned char vsrcPermute;
1123 vector unsigned char vdstPermute;
1124 vector unsigned char vsdstPermute;
1125 vector unsigned char valphamask;
1126 vector unsigned char vpixelmask;
1127 vector unsigned char v0;
1128 vector unsigned short v1;
1129 vector unsigned short v8;
/* constant vectors: all-zero bytes, and 16-bit lanes holding 1 and 8 */
1131 v0 = vec_splat_u8(0);
1132 v1 = vec_splat_u16(1);
1133 v8 = vec_splat_u16(8);
1134 mergePermute = VEC_MERGE_PERMUTE();
1135 valphamask = VEC_ALPHA_MASK();
/* byte indices from vec_lvsl() masked with 0xC; presumably selects the first byte of each 32-bit pixel for all four of its byte lanes -- TODO confirm */
1136 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOR with the zero vector: bitwise complement of valphamask */
1137 vpixelmask = vec_nor(valphamask, v0);
1138 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1139 vdstPermute = calc_swizzle32(NULL, dstfmt);
1140 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1142 while ( height-- ) {
1143 width = info->d_width;
1144 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1146 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1147 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1149 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1150 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1151 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1157 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are held back for the scalar pass at the end */
1161 int extrawidth = (width % 4);
1162 vector unsigned char valigner = VEC_ALIGNER(srcp);
1163 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1164 width -= extrawidth;
1166 vector unsigned char voverflow;
1167 vector unsigned char vd;
1168 vector unsigned char valpha;
1169 vector unsigned char vdstalpha;
/* a second load at offset 15 is merged with vs through valigner; presumably this handles a srcp that is not 16-byte aligned */
1171 voverflow = (vector unsigned char)vec_ld(15, srcp);
1172 vs = vec_perm(vs, voverflow, valigner);
1173 vs = vec_perm(vs, v0, vsrcPermute);
/* build the per-pixel alpha vector from the source pixels */
1175 valpha = vec_perm(vs, v0, valphaPermute);
1178 vd = (vector unsigned char)vec_ld(0, dstp);
1179 vd = vec_perm(vd, v0, vsdstPermute);
/* remember the destination's alpha bits; they are OR-ed back in after the blend */
1180 vdstalpha = vec_and(vd, valphamask);
1182 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1184 /* set the alpha to the dest alpha */
1185 vd = vec_and(vd, vpixelmask);
1186 vd = vec_or(vd, vdstalpha);
1187 vd = vec_perm(vd, v0, vdstPermute);
1190 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar pass for the leftover pixels of this row */
1198 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1202 #undef ONE_PIXEL_BLEND
1206 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * AltiVec variant. No calc_swizzle32() permutes are used here; pixels are
 * blended in the layout they are loaded in.
 * The scalar ONE_PIXEL_BLEND path takes alpha from the top 8 bits of the
 * source pixel (s >> 24). For SDL_ALPHA_OPAQUE it copies the low 24 bits of
 * the source; otherwise it blends the 0xff00ff channels in one multiply and
 * the 0xff00 channel separately. In both cases the destination's top 8 bits
 * are kept.
 * Each row runs the scalar path while dstp is unaligned, then a vector pass,
 * then the scalar path again for the (width % 4) leftover pixels.
 * NOTE(review): this listing does not show every line of the function; the
 * comments describe only the statements that are visible.
 */
1207 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1209 int width = info->d_width;
1210 int height = info->d_height;
1211 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1212 int srcskip = info->s_skip >> 2;
1213 Uint32 *dstp = (Uint32 *)info->d_pixels;
1214 int dstskip = info->d_skip >> 2;
1215 vector unsigned char mergePermute;
1216 vector unsigned char valphaPermute;
1217 vector unsigned char valphamask;
1218 vector unsigned char vpixelmask;
1219 vector unsigned char v0;
1220 vector unsigned short v1;
1221 vector unsigned short v8;
1222 v0 = vec_splat_u8(0);
1223 v1 = vec_splat_u16(1);
1224 v8 = vec_splat_u16(8);
1225 mergePermute = VEC_MERGE_PERMUTE();
1226 valphamask = VEC_ALPHA_MASK();
/* byte indices from vec_lvsl() masked with 0xC; presumably selects the first byte of each 32-bit pixel for all four of its byte lanes -- TODO confirm */
1227 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOR with the zero vector: bitwise complement of valphamask */
1230 vpixelmask = vec_nor(valphamask, v0);
1232 width = info->d_width;
1233 #define ONE_PIXEL_BLEND(condition, widthvar) \
1234 while ((condition)) { \
1240 Uint32 alpha = s >> 24; \
1242 if(alpha == SDL_ALPHA_OPAQUE) { \
1243 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1246 dalpha = d & 0xff000000; \
1247 s1 = s & 0xff00ff; \
1248 d1 = d & 0xff00ff; \
1249 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1252 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1253 *dstp = d1 | d | dalpha; \
1260 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are held back for the scalar pass at the end */
1262 int extrawidth = (width % 4);
1263 vector unsigned char valigner = VEC_ALIGNER(srcp);
1264 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1265 width -= extrawidth;
1267 vector unsigned char voverflow;
1268 vector unsigned char vd;
1269 vector unsigned char valpha;
1270 vector unsigned char vdstalpha;
/* a second load at offset 15 is merged with vs through valigner; presumably this handles a srcp that is not 16-byte aligned */
1272 voverflow = (vector unsigned char)vec_ld(15, srcp);
1273 vs = vec_perm(vs, voverflow, valigner);
/* build the per-pixel alpha vector from the source pixels */
1275 valpha = vec_perm(vs, v0, valphaPermute);
1278 vd = (vector unsigned char)vec_ld(0, dstp);
/* remember the destination's alpha bits; they are OR-ed back in after the blend */
1279 vdstalpha = vec_and(vd, valphamask);
1281 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1283 /* set the alpha to the dest alpha */
1284 vd = vec_and(vd, vpixelmask);
1285 vd = vec_or(vd, vdstalpha);
1288 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar pass for the leftover pixels of this row */
1295 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1300 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: 32-bit -> 32-bit blend using one alpha value for the whole
 * surface (info->src->alpha). Permute vectors from calc_swizzle32() are
 * applied to source and destination pixels, presumably to reconcile differing
 * channel layouts.
 * The scalar ONE_PIXEL_BLEND path writes dA as the destination alpha; the
 * vector path ORs valphamask into the result before storing.
 * Each row runs the scalar path while dstp is unaligned, then a vector pass,
 * then the scalar path again for the (width % 4) leftover pixels.
 * NOTE(review): this listing does not show every line of the function; the
 * comments describe only the statements that are visible.
 */
1303 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1306 unsigned alpha = info->src->alpha;
1307 int height = info->d_height;
1308 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1309 int srcskip = info->s_skip >> 2;
1310 Uint32 *dstp = (Uint32 *)info->d_pixels;
1311 int dstskip = info->d_skip >> 2;
1312 SDL_PixelFormat *srcfmt = info->src;
1313 SDL_PixelFormat *dstfmt = info->dst;
/* sA reads the same field as alpha above (srcfmt is info->src); it is the name the scalar macro uses */
1314 unsigned sA = srcfmt->alpha;
/* destination alpha written by the scalar path: opaque when the destination format has an alpha mask, otherwise 0 */
1315 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1316 vector unsigned char mergePermute;
1317 vector unsigned char vsrcPermute;
1318 vector unsigned char vdstPermute;
1319 vector unsigned char vsdstPermute;
1320 vector unsigned char valpha;
1321 vector unsigned char valphamask;
1322 vector unsigned char vbits;
1323 vector unsigned short v1;
1324 vector unsigned short v8;
1326 mergePermute = VEC_MERGE_PERMUTE();
1327 v1 = vec_splat_u16(1);
1328 v8 = vec_splat_u16(8);
1330 /* set the alpha to 255 on the destination surf */
1331 valphamask = VEC_ALPHA_MASK();
1333 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1334 vdstPermute = calc_swizzle32(NULL, dstfmt);
1335 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1337 /* set a vector full of alpha and 255-alpha */
/* the alpha byte is written into element 0 and then replicated to every byte by vec_splat() */
1338 ((unsigned char *)&valpha)[0] = alpha;
1339 valpha = vec_splat(valpha, 0);
/* every byte set to -1, i.e. all bits set */
1340 vbits = (vector unsigned char)vec_splat_s8(-1);
1343 int width = info->d_width;
1344 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1346 unsigned sR, sG, sB, dR, dG, dB; \
1347 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1348 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1349 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1350 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1355 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are held back for the scalar pass at the end */
1357 int extrawidth = (width % 4);
1358 vector unsigned char valigner = VEC_ALIGNER(srcp);
1359 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1360 width -= extrawidth;
1362 vector unsigned char voverflow;
1363 vector unsigned char vd;
/* a second load at offset 15 is merged with vs through valigner; presumably this handles a srcp that is not 16-byte aligned */
1366 voverflow = (vector unsigned char)vec_ld(15, srcp);
1367 vs = vec_perm(vs, voverflow, valigner);
1368 vs = vec_perm(vs, valpha, vsrcPermute);
1371 vd = (vector unsigned char)vec_ld(0, dstp);
1372 vd = vec_perm(vd, vd, vsdstPermute);
1374 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1376 /* set the alpha channel to full on */
1377 vd = vec_or(vd, valphamask);
1378 vd = vec_perm(vd, vbits, vdstPermute);
1381 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar pass for the leftover pixels of this row */
1388 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1390 #undef ONE_PIXEL_BLEND
1399 /* fast RGB888->(A)RGB888 blending */
/*
 * AltiVec variant using one alpha value for the whole surface
 * (info->src->alpha). No calc_swizzle32() permutes are used here.
 * The scalar ONE_PIXEL_BLEND path forces the top byte of every written pixel
 * to 0xff; the vector path ORs valphamask into the result before storing.
 * Each row runs the scalar path while dstp is unaligned, then a vector pass,
 * then the scalar path again for the (width % 4) leftover pixels.
 * NOTE(review): this listing does not show every line of the function; the
 * comments describe only the statements that are visible.
 */
1400 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1402 unsigned alpha = info->src->alpha;
1403 int height = info->d_height;
1404 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1405 int srcskip = info->s_skip >> 2;
1406 Uint32 *dstp = (Uint32 *)info->d_pixels;
1407 int dstskip = info->d_skip >> 2;
1408 vector unsigned char mergePermute;
1409 vector unsigned char valpha;
1410 vector unsigned char valphamask;
1411 vector unsigned short v1;
1412 vector unsigned short v8;
1414 mergePermute = VEC_MERGE_PERMUTE();
1415 v1 = vec_splat_u16(1);
1416 v8 = vec_splat_u16(8);
1418 /* set the alpha to 255 on the destination surf */
1419 valphamask = VEC_ALPHA_MASK();
1421 /* set a vector full of alpha and 255-alpha */
/* the alpha byte is written into element 0 and then replicated to every byte by vec_splat() */
1422 ((unsigned char *)&valpha)[0] = alpha;
1423 valpha = vec_splat(valpha, 0);
1426 int width = info->d_width;
1427 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1430 Uint32 s1 = s & 0xff00ff; \
1431 Uint32 d1 = d & 0xff00ff; \
1432 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1436 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1437 *dstp = d1 | d | 0xff000000; \
1442 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are held back for the scalar pass at the end */
1444 int extrawidth = (width % 4);
1445 vector unsigned char valigner = VEC_ALIGNER(srcp);
1446 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1447 width -= extrawidth;
1449 vector unsigned char voverflow;
1450 vector unsigned char vd;
/* a second load at offset 15 is merged with vs through valigner; presumably this handles a srcp that is not 16-byte aligned */
1453 voverflow = (vector unsigned char)vec_ld(15, srcp);
1454 vs = vec_perm(vs, voverflow, valigner);
1457 vd = (vector unsigned char)vec_ld(0, dstp);
1459 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1461 /* set the alpha channel to full on */
1462 vd = vec_or(vd, valphamask);
1465 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar pass for the leftover pixels of this row */
1472 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1474 #undef ONE_PIXEL_BLEND
1481 #pragma altivec_model off
1483 #endif /* SDL_ALTIVEC_BLITTERS */
1485 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * Writes, for each pixel, the per-channel average of source and destination
 * with the top byte forced to 0xff.
 * NOTE(review): the loop constructs and the loads of s and d are not visible
 * in this listing.
 */
1486 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1488 int width = info->d_width;
1489 int height = info->d_height;
1490 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1491 int srcskip = info->s_skip >> 2;
1492 Uint32 *dstp = (Uint32 *)info->d_pixels;
1493 int dstskip = info->d_skip >> 2;
/*
 * Average all three 8-bit channels at once: the low bit of each channel is
 * masked off (0x00fefefe) so the shared add/shift cannot carry between
 * channels, then 1 is added back per channel where both inputs had that low
 * bit set (s & d & 0x00010101).
 */
1499 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1500 + (s & d & 0x00010101)) | 0xff000000;
1507 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Blends with one alpha value for the whole surface (info->src->alpha) and
 * forces the top byte of every written pixel to 0xff.
 * NOTE(review): many lines of this function are not visible in this listing
 * (the condition guarding the 128 special case, the loads of s/d/s1/d1 and
 * the closing of the DUFFS_LOOP_DOUBLE2 invocation); the comments describe
 * only the statements that are shown.
 */
1508 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1510 unsigned alpha = info->src->alpha;
/* hand over to the 50% special-case blitter; the guarding condition is not visible here (presumably alpha == 128) */
1512 BlitRGBtoRGBSurfaceAlpha128(info);
1514 int width = info->d_width;
1515 int height = info->d_height;
1516 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1517 int srcskip = info->s_skip >> 2;
1518 Uint32 *dstp = (Uint32 *)info->d_pixels;
1519 int dstskip = info->d_skip >> 2;
1526 DUFFS_LOOP_DOUBLE2({
1527 /* One Pixel Blend */
1532 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1536 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1537 *dstp = d1 | d | 0xff000000;
1541 /* Two Pixels Blend */
1546 d1 += (s1 - d1) * alpha >> 8;
/* pack the 0xff00 channel of this pixel and of the next pixel into one word (bits 0-7 and 16-23) so both are blended by a single multiply */
1549 s = ((s & 0xff00) >> 8) |
1550 ((srcp[1] & 0xff00) << 8);
1551 d = ((d & 0xff00) >> 8) |
1552 ((dstp[1] & 0xff00) << 8);
1553 d += (s - d) * alpha >> 8;
/* first pixel: the low packed byte goes back to bits 8-15 */
1556 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1563 d1 += (s1 - d1) * alpha >> 8;
/* second pixel: the high packed byte (bits 16-23) goes back to bits 8-15 */
1566 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1576 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Scalar blitter. Alpha is taken from the top 8 bits of each source pixel
 * (s >> 24). For SDL_ALPHA_OPAQUE the low 24 bits of the source are copied;
 * otherwise the 0xff00ff channels are blended with one multiply and the
 * 0xff00 channel separately. In both cases the destination's top 8 bits are
 * kept.
 * NOTE(review): the loop constructs, several declarations and the closing
 * lines of the comments inside the body are not visible in this listing.
 */
1577 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1579 int width = info->d_width;
1580 int height = info->d_height;
1581 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1582 int srcskip = info->s_skip >> 2;
1583 Uint32 *dstp = (Uint32 *)info->d_pixels;
1584 int dstskip = info->d_skip >> 2;
1593 Uint32 alpha = s >> 24;
1594 /* FIXME: Here we special-case opaque alpha since the
1595 compositioning used (>>8 instead of /255) doesn't handle
1596 it correctly. Also special-case alpha=0 for speed?
1599 if(alpha == SDL_ALPHA_OPAQUE) {
1600 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1603 * take out the middle component (green), and process
1604 * the other two in parallel. One multiply less.
1607 dalpha = d & 0xff000000;
1610 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1613 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1614 *dstp = d1 | d | dalpha;
1626 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC inline-assembly version. The first asm statement sets up registers
 * that the later per-pixel asm statements rely on: mm6 = 0, mm7 = multiply
 * mask, mm4 = channel mask, mm3 = inverted channel mask, mm5 = alpha shift.
 * The per-pixel code has an opaque case (alpha == amask) that copies the
 * source colour bits and keeps the destination's remaining bits, and a
 * general case that blends with the source alpha.
 * NOTE(review): the loop constructs, the "__asm__ volatile (" openers, the
 * prefetch instructions and the alpha == 0 branch are not visible in this
 * listing; the comments describe only what is shown.
 */
1627 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1629 int width = info->d_width;
1630 int height = info->d_height;
1631 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1632 int srcskip = info->s_skip >> 2;
1633 Uint32 *dstp = (Uint32 *)info->d_pixels;
1634 int dstskip = info->d_skip >> 2;
1635 SDL_PixelFormat* sf = info->src;
1636 Uint32 amask = sf->Amask;
1639 /* make mm6 all zeros. */
1640 "pxor %%mm6, %%mm6\n"
1642 /* Make a mask to preserve the alpha. */
1643 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1644 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1645 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1646 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1647 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1649 /* form channel masks */
1650 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1651 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1652 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1653 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1655 /* get alpha channel shift */
1656 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1658 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1668 : : "r" (srcp), "r" (dstp) );
1670 alpha = *srcp & amask;
1671 /* FIXME: Here we special-case opaque alpha since the
1672 compositioning used (>>8 instead of /255) doesn't handle
1673 it correctly. Also special-case alpha=0 for speed?
1678 else if(alpha == amask) {
1679 /* opaque alpha -- copy RGB, keep dst alpha */
1680 /* using MMX here to free up regular registers for other things */
1682 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1683 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1684 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1685 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1686 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1687 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1689 : : "r" (srcp), "r" (dstp) );
1694 /* load in the source, and dst. */
1695 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1696 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1698 /* Move the src alpha into mm2 */
1700 /* if supporting pshufw */
1701 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1702 /*"psrlw $8, %%mm2\n" */
1706 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1707 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1708 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1709 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1711 /* move the colors into words. */
1712 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1713 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1716 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1719 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1720 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1721 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1723 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1725 "movd %%mm0, (%1)\n" /* result in mm0 */
1727 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1741 /* End GCC_ASMBLIT*/
1744 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MSVC intrinsics version. Per pixel: prefetch ahead on both pointers, read
 * the source alpha bits, and either copy the colour bits (alpha == amask) or
 * blend using 16-bit lanes: dst + (((src - dst) * alpha) >> 8).
 * The alpha lane of the multiplier is cleared through dmask, so the
 * destination's alpha passes through the add unchanged.
 * NOTE(review): the loop constructs, some declarations and the alpha == 0
 * branch are not visible in this listing.
 */
1745 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1747 int width = info->d_width;
1748 int height = info->d_height;
1749 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted right by 2 -- presumably bytes to 32-bit pixels; confirm against SDL_BlitInfo */
1750 int srcskip = info->s_skip >> 2;
1751 Uint32 *dstp = (Uint32 *)info->d_pixels;
1752 int dstskip = info->d_skip >> 2;
1753 SDL_PixelFormat* sf = info->src;
/* union of the three colour masks: every bit that is not alpha (or unused) in the source format */
1754 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1755 Uint32 amask = sf->Amask;
1756 Uint32 ashift = sf->Ashift;
1759 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1761 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* all ones except one 16-bit lane at bit offset ashift*2; presumably the alpha lane once each byte has been widened to 16 bits */
1762 multmask = ~(0xFFFFi64 << (ashift * 2));
1763 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1769 _m_prefetch(srcp + 16);
1770 _m_prefetch(dstp + 16);
1772 alpha = *srcp & amask;
1775 } else if (alpha == amask) {
1776 /* copy RGB, keep dst alpha */
1777 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1779 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1780 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1782 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1783 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1785 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1786 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1787 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1788 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1789 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1792 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1793 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1794 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1795 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1796 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1798 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1808 /* End MSVC_ASMBLIT */
1810 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1812 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
/*
 * Blend a single 16-bit pixel at 50%.
 *
 * d, s  - destination and source pixel values
 * mask  - 16-bit mask with the lowest bit of every colour channel cleared
 *         (e.g. 0xf7de for RGB565, 0xfbde for RGB555)
 *
 * The masked values are added and halved in one step; the cleared low bits
 * keep the shared shift from leaking between channels. The final term adds 1
 * back to each channel whose low bit is set in both s and d.
 * Every argument is parenthesized so that expression arguments such as
 * (a | b) expand correctly. Arguments are evaluated more than once.
 */
#define BLEND16_50(d, s, mask) \
	(((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Replicate a 16-bit channel mask into both halves of a 32-bit word.
 * The cast keeps the shift in unsigned arithmetic: shifting a promoted
 * (signed int) 16-bit mask with its top bit set left by 16 would overflow.
 */
#define BLEND2x16_MASK32(mask) \
	((Uint32)(mask) | ((Uint32)(mask) << 16))

/*
 * Blend two 16-bit pixels packed in one 32-bit word at 50%.
 * Each operand is halved before the add so the sum cannot overflow 32 bits;
 * the final term adds 1 back to each channel whose low bit is set in both.
 */
#define BLEND2x16_50(d, s, mask) \
	((((s) & BLEND2x16_MASK32(mask)) >> 1) + (((d) & BLEND2x16_MASK32(mask)) >> 1) \
	 + ((s) & (d) & (~BLEND2x16_MASK32(mask))))
/*
 * 50% blend of a 16-bit source onto a 16-bit destination.
 * mask is passed on to BLEND16_50 / BLEND2x16_50; the callers visible in this
 * file pass 0xf7de (565) and 0xfbde (555).
 * Two code paths are visible: one for rows where srcp and dstp differ in
 * 32-bit alignment (source words are re-assembled from two consecutive
 * loads), and one where they share alignment (pixels are blended two at a
 * time through 32-bit loads and stores).
 * NOTE(review): loop constructs, pointer advances, several declarations and
 * closing braces are not visible in this listing; the comments describe only
 * the statements that are shown.
 */
1823 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1825 int width = info->d_width;
1826 int height = info->d_height;
1827 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are shifted right by 1 -- presumably bytes to 16-bit pixels; confirm against SDL_BlitInfo */
1828 int srcskip = info->s_skip >> 1;
1829 Uint16 *dstp = (Uint16 *)info->d_pixels;
1830 int dstskip = info->d_skip >> 1;
/* bit 1 of the two addresses differs, so srcp and dstp cannot both sit on a 32-bit boundary at the same pixel */
1833 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1835 * Source and destination not aligned, pipeline it.
1836 * This is mostly a win for big blits but no loss for
1842 /* handle odd destination */
1843 if((uintptr_t)dstp & 2) {
1844 Uint16 d = *dstp, s = *srcp;
1845 *dstp = BLEND16_50(d, s, mask);
1850 srcp++; /* srcp is now 32-bit aligned */
1852 /* bootstrap pipeline with first halfword */
1853 prev_sw = ((Uint32 *)srcp)[-1];
1857 sw = *(Uint32 *)srcp;
1858 dw = *(Uint32 *)dstp;
/* build the two source pixels matching dw from the previous and current source words; which halves are used depends on byte order */
1859 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1860 s = (prev_sw << 16) + (sw >> 16);
1862 s = (prev_sw >> 16) + (sw << 16);
1865 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1871 /* final pixel if any */
1873 Uint16 d = *dstp, s;
1874 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1875 s = (Uint16)prev_sw;
1877 s = (Uint16)(prev_sw >> 16);
1879 *dstp = BLEND16_50(d, s, mask);
1883 srcp += srcskip - 1;
1886 /* source and destination are aligned */
1889 /* first odd pixel? */
1890 if((uintptr_t)srcp & 2) {
1891 Uint16 d = *dstp, s = *srcp;
1892 *dstp = BLEND16_50(d, s, mask);
1897 /* srcp and dstp are now 32-bit aligned */
1900 Uint32 sw = *(Uint32 *)srcp;
1901 Uint32 dw = *(Uint32 *)dstp;
1902 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1908 /* last odd pixel? */
1910 Uint16 d = *dstp, s = *srcp;
1911 *dstp = BLEND16_50(d, s, mask);
1922 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * GCC/MMX-macro version. Blit16to16SurfaceAlpha128() is called with the 565
 * mask 0xf7de for the 50% case (the guarding condition is not visible here).
 * Otherwise single pixels are blended with scalar code using the 5-bit alpha,
 * and groups of four pixels are blended with MMX, one colour channel at a
 * time (red, green, blue), accumulating the result in mm1.
 * Register roles visible below: mm0 = replicated alpha, mm4 = green mask,
 * mm7 = blue mask, mm2/mm3 = four source/destination pixels, mm5/mm6 =
 * per-channel scratch.
 * NOTE(review): loop constructs, several declarations, the loads of s and d
 * and the final alpha positioning shift are not visible in this listing.
 */
1923 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1925 unsigned alpha = info->src->alpha; /* per-surface alpha, 8 bits here; reduced to 5 bits further down */
1927 Blit16to16SurfaceAlpha128(info, 0xf7de);
1929 int width = info->d_width;
1930 int height = info->d_height;
1931 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are shifted right by 1 -- presumably bytes to 16-bit pixels; confirm against SDL_BlitInfo */
1932 int srcskip = info->s_skip >> 1;
1933 Uint16 *dstp = (Uint16 *)info->d_pixels;
1934 int dstskip = info->d_skip >> 1;
1938 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1940 alpha >>= 3; /* downscale alpha to 5 bits */
1942 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1943 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1944 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1945 /* position alpha to allow for mullo and mulhi on diff channels
1946 to reduce the number of operations */
1949 /* Setup the 565 color channel masks */
1950 load = 0x07E007E007E007E0ULL;
1951 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1952 load = 0x001F001F001F001FULL;
1953 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1960 * shift out the middle component (green) to
1961 * the high 16 bits, and process all three RGB
1962 * components at the same time.
1964 s = (s | s << 16) & 0x07e0f81f;
1965 d = (d | d << 16) & 0x07e0f81f;
1966 d += (s - d) * alpha >> 5;
/* fold the high half (green) back onto the low half (red and blue) */
1968 *dstp++ = d | d >> 16;
1973 * shift out the middle component (green) to
1974 * the high 16 bits, and process all three RGB
1975 * components at the same time.
1977 s = (s | s << 16) & 0x07e0f81f;
1978 d = (d | d << 16) & 0x07e0f81f;
1979 d += (s - d) * alpha >> 5;
1981 *dstp++ = d | d >> 16;
1985 * shift out the middle component (green) to
1986 * the high 16 bits, and process all three RGB
1987 * components at the same time.
1989 s = (s | s << 16) & 0x07e0f81f;
1990 d = (d | d << 16) & 0x07e0f81f;
1991 d += (s - d) * alpha >> 5;
1993 *dstp++ = d | d >> 16;
1995 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1996 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1998 /* red -- does not need a mask since the right shift clears
1999 the uninteresting bits */
2000 movq_r2r(mm2, mm5); /* src -> mm5 */
2001 movq_r2r(mm3, mm6); /* dst -> mm6 */
2002 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2003 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2006 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2007 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2008 /* alpha used is actually 11 bits
2009 11 + 5 = 16 bits, so the sign bits are lost */
2010 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2011 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2012 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2014 movq_r2r(mm6, mm1); /* save new reds in dsts */
2016 /* green -- process the bits in place */
2017 movq_r2r(mm2, mm5); /* src -> mm5 */
2018 movq_r2r(mm3, mm6); /* dst -> mm6 */
2019 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2020 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2023 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2024 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2025 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2026 bits are gone and the sign bits present */
2027 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2028 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2030 por_r2r(mm6, mm1); /* save new greens in dsts */
2033 movq_r2r(mm2, mm5); /* src -> mm5 */
2034 movq_r2r(mm3, mm6); /* dst -> mm6 */
2035 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2036 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2039 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2040 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2041 /* 11 + 5 = 16 bits, so the sign bits are lost and
2042 the interesting bits will need to be MASKed */
2043 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2044 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2045 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2047 por_r2r(mm6, mm1); /* save new blues in dsts */
2049 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2061 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * GCC/MMX-macro version. Blit16to16SurfaceAlpha128() is called with the 555
 * mask 0xfbde for the 50% case (the guarding condition is not visible here).
 * Otherwise single pixels are blended with scalar code using the 5-bit alpha,
 * and groups of four pixels are blended with MMX, one colour channel at a
 * time (red, green, blue), accumulating the result in mm1.
 * Register roles visible below: mm0 = replicated alpha, mm4 = green mask
 * (temporarily shifted to act as the red mask), mm7 = blue mask, mm2/mm3 =
 * four source/destination pixels, mm5/mm6 = per-channel scratch.
 * NOTE(review): loop constructs, several declarations, the loads of s and d
 * and the final alpha positioning shift are not visible in this listing.
 */
2062 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2064 unsigned alpha = info->src->alpha; /* per-surface alpha, 8 bits here; reduced to 5 bits further down */
2066 Blit16to16SurfaceAlpha128(info, 0xfbde);
2068 int width = info->d_width;
2069 int height = info->d_height;
2070 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are shifted right by 1 -- presumably bytes to 16-bit pixels; confirm against SDL_BlitInfo */
2071 int srcskip = info->s_skip >> 1;
2072 Uint16 *dstp = (Uint16 *)info->d_pixels;
2073 int dstskip = info->d_skip >> 1;
2077 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2079 alpha >>= 3; /* downscale alpha to 5 bits */
2081 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2082 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2083 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2084 /* position alpha to allow for mullo and mulhi on diff channels
2085 to reduce the number of operations */
2088 /* Setup the 555 color channel masks */
2089 load = 0x03E003E003E003E0ULL;
2090 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2091 load = 0x001F001F001F001FULL;
2092 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2099 * shift out the middle component (green) to
2100 * the high 16 bits, and process all three RGB
2101 * components at the same time.
2103 s = (s | s << 16) & 0x03e07c1f;
2104 d = (d | d << 16) & 0x03e07c1f;
2105 d += (s - d) * alpha >> 5;
/* fold the high half (green) back onto the low half (red and blue) */
2107 *dstp++ = d | d >> 16;
2112 * shift out the middle component (green) to
2113 * the high 16 bits, and process all three RGB
2114 * components at the same time.
2116 s = (s | s << 16) & 0x03e07c1f;
2117 d = (d | d << 16) & 0x03e07c1f;
2118 d += (s - d) * alpha >> 5;
2120 *dstp++ = d | d >> 16;
2124 * shift out the middle component (green) to
2125 * the high 16 bits, and process all three RGB
2126 * components at the same time.
2128 s = (s | s << 16) & 0x03e07c1f;
2129 d = (d | d << 16) & 0x03e07c1f;
2130 d += (s - d) * alpha >> 5;
2132 *dstp++ = d | d >> 16;
2134 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2135 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2137 /* red -- process the bits in place */
2138 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2139 /* by reusing the GREEN mask we free up another mmx
2140 register to accumulate the result */
2142 movq_r2r(mm2, mm5); /* src -> mm5 */
2143 movq_r2r(mm3, mm6); /* dst -> mm6 */
2144 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2145 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2148 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2149 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2150 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2151 cleared by a MASK below */
2152 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2153 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2154 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2156 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2158 movq_r2r(mm6, mm1); /* save new reds in dsts */
2160 /* green -- process the bits in place */
2161 movq_r2r(mm2, mm5); /* src -> mm5 */
2162 movq_r2r(mm3, mm6); /* dst -> mm6 */
2163 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2164 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2167 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2168 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2169 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2170 bits are gone and the sign bits present */
2171 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2172 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2174 por_r2r(mm6, mm1); /* save new greens in dsts */
2177 movq_r2r(mm2, mm5); /* src -> mm5 */
2178 movq_r2r(mm3, mm6); /* dst -> mm6 */
2179 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2180 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2183 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2184 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2185 /* 11 + 5 = 16 bits, so the sign bits are lost and
2186 the interesting bits will need to be MASKed */
2187 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2188 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2189 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2191 por_r2r(mm6, mm1); /* save new blues in dsts */
2193 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2204 /* End GCC_ASMBLIT */
2207 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * MSVC intrinsics version. Blit16to16SurfaceAlpha128() is called with the 565
 * mask 0xf7de for the 50% case (the guarding condition is not visible here).
 * Otherwise single pixels are blended with scalar code using the 5-bit alpha,
 * and groups of four pixels are blended with MMX intrinsics, one colour
 * channel at a time (red, green, blue), accumulating the result in mm_res.
 * NOTE(review): loop constructs, the loads of s and d and the copies of
 * src1/dst1 into src2/dst2 before each channel are not visible in this
 * listing; the comments describe only the statements that are shown.
 */
2208 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2210 unsigned alpha = info->src->alpha;
2212 Blit16to16SurfaceAlpha128(info, 0xf7de);
2214 int width = info->d_width;
2215 int height = info->d_height;
2216 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are shifted right by 1 -- presumably bytes to 16-bit pixels; confirm against SDL_BlitInfo */
2217 int srcskip = info->s_skip >> 1;
2218 Uint16 *dstp = (Uint16 *)info->d_pixels;
2219 int dstskip = info->d_skip >> 1;
2222 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2224 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
/* mm_alpha is captured before the scalar alpha is shifted down, so it holds the 8-bit value with its low three bits cleared */
2225 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2226 alpha >>= 3; /* downscale alpha to 5 bits */
2228 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2229 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2230 /* position alpha to allow for mullo and mulhi on diff channels
2231 to reduce the number of operations */
2232 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2234 /* Setup the 565 color channel masks */
2235 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2236 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2244 * shift out the middle component (green) to
2245 * the high 16 bits, and process all three RGB
2246 * components at the same time.
2248 s = (s | s << 16) & 0x07e0f81f;
2249 d = (d | d << 16) & 0x07e0f81f;
2250 d += (s - d) * alpha >> 5;
/* fold the high half (green) back onto the low half (red and blue) */
2252 *dstp++ = (Uint16)(d | d >> 16);
2257 * shift out the middle component (green) to
2258 * the high 16 bits, and process all three RGB
2259 * components at the same time.
2261 s = (s | s << 16) & 0x07e0f81f;
2262 d = (d | d << 16) & 0x07e0f81f;
2263 d += (s - d) * alpha >> 5;
2265 *dstp++ = (Uint16)(d | d >> 16);
2269 * shift out the middle component (green) to
2270 * the high 16 bits, and process all three RGB
2271 * components at the same time.
2273 s = (s | s << 16) & 0x07e0f81f;
2274 d = (d | d << 16) & 0x07e0f81f;
2275 d += (s - d) * alpha >> 5;
2277 *dstp++ = (Uint16)(d | d >> 16);
2279 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2280 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
/* red: shifted down to the low bits, blended, then shifted back up to bits 11-15 */
2284 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2287 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2290 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2291 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2292 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2293 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2294 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2296 mm_res = dst2; /* RED -> mm_res */
2298 /* green -- process the bits in place */
2300 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2303 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2306 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2307 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2308 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2309 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2311 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* blue: already in the low five bits, so it is masked rather than shifted */
2315 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2318 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2321 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2322 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2323 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2324 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2325 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2327 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2329 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2341 /* fast RGB555->RGB555 blending with surface alpha
 *
 * Blitter for 16-bit 5-5-5 pixels written with __m64 MMX intrinsics.
 * It reads the per-surface alpha from info->src->alpha, the pixel pointers,
 * dimensions and skips from info, and stores blended pixels through the
 * destination pointer.  Pixels are blended either one at a time with scalar
 * arithmetic or four at a time in a 64-bit MMX value.
 *
 * NOTE(review): braces, loop headers and several short statements are
 * absent from this listing, so the control flow that connects the visible
 * statements has to be confirmed against the complete file.
 */
2342 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2344 unsigned alpha = info->src->alpha;
/* Hands the blit to the shared 16-bit helper with mask 0xfbde.  The
   condition guarding this call is not visible here; judging by the helper
   name it is presumably the alpha == 128 case -- TODO confirm. */
2346 Blit16to16SurfaceAlpha128(info, 0xfbde);
2348 int width = info->d_width;
2349 int height = info->d_height;
/* The skips are halved (>> 1) because they are applied to Uint16 pointers;
   presumably info->s_skip and info->d_skip are byte counts -- TODO confirm. */
2350 Uint16 *srcp = (Uint16 *)info->s_pixels;
2351 int srcskip = info->s_skip >> 1;
2352 Uint16 *dstp = (Uint16 *)info->d_pixels;
2353 int dstskip = info->d_skip >> 1;
2356 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
/* Order matters in the next three statements: the low three bits of alpha
   are cleared, the vector copy mm_alpha is taken from that value, and only
   afterwards is the scalar alpha shifted down for the scalar blends. */
2358 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2359 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2360 alpha >>= 3; /* downscale alpha to 5 bits */
/* Replicate alpha into all four 16-bit lanes of mm_alpha. */
2362 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2363 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2364 /* position alpha to allow for mullo and mulhi on diff channels
2365 to reduce the number of operations */
2366 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2368 /* Setup the 555 color channel masks */
2369 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2370 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2371 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
/* Scalar blend of a single pixel.  (x | x << 16) & 0x03e07c1f keeps green
   (0x03e0) in the high 16 bits and red and blue (0x7c1f) in the low 16 bits,
   so one multiply by the 5-bit alpha blends all three channels; the product
   is shifted right by 5 because * binds tighter than >>.  The result is
   folded back to 16 bits with d | d >> 16.  Three copies of this sequence
   are visible; the code that chooses between them and the vector path is
   not shown in this listing. */
2379 * shift out the middle component (green) to
2380 * the high 16 bits, and process all three RGB
2381 * components at the same time.
2383 s = (s | s << 16) & 0x03e07c1f;
2384 d = (d | d << 16) & 0x03e07c1f;
2385 d += (s - d) * alpha >> 5;
2387 *dstp++ = (Uint16)(d | d >> 16);
/* Second copy of the scalar single-pixel blend. */
2392 * shift out the middle component (green) to
2393 * the high 16 bits, and process all three RGB
2394 * components at the same time.
2396 s = (s | s << 16) & 0x03e07c1f;
2397 d = (d | d << 16) & 0x03e07c1f;
2398 d += (s - d) * alpha >> 5;
2400 *dstp++ = (Uint16)(d | d >> 16);
/* Third copy of the scalar single-pixel blend. */
2404 * shift out the middle component (green) to
2405 * the high 16 bits, and process all three RGB
2406 * components at the same time.
2408 s = (s | s << 16) & 0x03e07c1f;
2409 d = (d | d << 16) & 0x03e07c1f;
2410 d += (s - d) * alpha >> 5;
2412 *dstp++ = (Uint16)(d | d >> 16);
/* Vector path: four 16-bit pixels are loaded from each pointer through a
   __m64 cast and each channel is blended separately.  Red and green are
   blended in place with mulhi followed by a left shift of 5; blue uses
   mullo followed by a right shift of 11.  The copies from src1/dst1 into
   src2/dst2 are visible only for blue; the matching lines for red and green
   are absent from this listing. */
2414 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2415 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2417 /* red -- process the bits in place */
2419 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2422 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2425 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2426 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2427 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2428 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2429 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2431 mm_res = dst2; /* RED -> mm_res */
2433 /* green -- process the bits in place */
2435 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2438 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2441 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2442 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2443 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2444 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2446 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* blue -- already in the low five bits of each lane, so no repositioning */
2449 src2 = src1; /* src -> src2 */
2450 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2452 dst2 = dst1; /* dst -> dst2 */
2453 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2456 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2457 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2458 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2459 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2460 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2462 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2464 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2475 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2477 /* fast RGB565->RGB565 blending with surface alpha
 *
 * Scalar blitter for 16-bit 5-6-5 pixels.  It reads the per-surface alpha
 * from info->src->alpha and the pixel pointers, dimensions and skips from
 * info, and stores blended pixels through the destination pointer.
 *
 * NOTE(review): braces, the loop headers and the declarations of s and d
 * are absent from this listing; confirm the control flow against the
 * complete file.
 */
2478 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2480 unsigned alpha = info->src->alpha;
/* Hands the blit to the shared 16-bit helper with mask 0xf7de.  The
   condition guarding this call is not visible here; judging by the helper
   name it is presumably the alpha == 128 case -- TODO confirm. */
2482 Blit16to16SurfaceAlpha128(info, 0xf7de);
2484 int width = info->d_width;
2485 int height = info->d_height;
/* The skips are halved (>> 1) because they are applied to Uint16 pointers;
   presumably info->s_skip and info->d_skip are byte counts -- TODO confirm. */
2486 Uint16 *srcp = (Uint16 *)info->s_pixels;
2487 int srcskip = info->s_skip >> 1;
2488 Uint16 *dstp = (Uint16 *)info->d_pixels;
2489 int dstskip = info->d_skip >> 1;
2490 alpha >>= 3; /* downscale alpha to 5 bits */
/* Single-pixel blend.  (x | x << 16) & 0x07e0f81f keeps green (0x07e0) in
   the high 16 bits and red and blue (0xf81f) in the low 16 bits, so one
   multiply by the 5-bit alpha blends all three channels; the product is
   shifted right by 5 because * binds tighter than >>.  The result is folded
   back to 16 bits with d | d >> 16. */
2497 * shift out the middle component (green) to
2498 * the high 16 bits, and process all three RGB
2499 * components at the same time.
2501 s = (s | s << 16) & 0x07e0f81f;
2502 d = (d | d << 16) & 0x07e0f81f;
2503 d += (s - d) * alpha >> 5;
2505 *dstp++ = (Uint16)(d | d >> 16);
2513 /* fast RGB555->RGB555 blending with surface alpha
 *
 * Scalar blitter for 16-bit 5-5-5 pixels.  It reads the per-surface alpha
 * from info->src->alpha and the pixel pointers, dimensions and skips from
 * info, and stores blended pixels through the destination pointer.
 *
 * NOTE(review): braces, the loop headers and the declarations of s and d
 * are absent from this listing; confirm the control flow against the
 * complete file.
 */
2514 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2516 unsigned alpha = info->src->alpha; /* per-surface source alpha; reduced to 5 bits further down */
/* Hands the blit to the shared 16-bit helper with mask 0xfbde.  The
   condition guarding this call is not visible here; judging by the helper
   name it is presumably the alpha == 128 case -- TODO confirm. */
2518 Blit16to16SurfaceAlpha128(info, 0xfbde);
2520 int width = info->d_width;
2521 int height = info->d_height;
/* The skips are halved (>> 1) because they are applied to Uint16 pointers;
   presumably info->s_skip and info->d_skip are byte counts -- TODO confirm. */
2522 Uint16 *srcp = (Uint16 *)info->s_pixels;
2523 int srcskip = info->s_skip >> 1;
2524 Uint16 *dstp = (Uint16 *)info->d_pixels;
2525 int dstskip = info->d_skip >> 1;
2526 alpha >>= 3; /* downscale alpha to 5 bits */
/* Single-pixel blend.  (x | x << 16) & 0x03e07c1f keeps green (0x03e0) in
   the high 16 bits and red and blue (0x7c1f) in the low 16 bits, so one
   multiply by the 5-bit alpha blends all three channels; the product is
   shifted right by 5 because * binds tighter than >>.  The result is folded
   back to 16 bits with d | d >> 16. */
2533 * shift out the middle component (green) to
2534 * the high 16 bits, and process all three RGB
2535 * components at the same time.
2537 s = (s | s << 16) & 0x03e07c1f;
2538 d = (d | d << 16) & 0x03e07c1f;
2539 d += (s - d) * alpha >> 5;
2541 *dstp++ = (Uint16)(d | d >> 16);
2549 /* fast ARGB8888->RGB565 blending with pixel alpha
 *
 * Reads 32-bit source pixels and 16-bit destination pixels through the
 * pointers taken from info.  The alpha of each source pixel is its top five
 * bits (s >> 27).  When that value equals SDL_ALPHA_OPAQUE >> 3 the source
 * is packed straight to 5-6-5 with
 * (s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f).
 * The remaining visible statements move the source and the destination into
 * the split layout selected by mask 0x07e0f81f (green in the high 16 bits),
 * blend with d += (s - d) * alpha >> 5 and fold the result back to 16 bits.
 *
 * NOTE(review): braces, loop headers, the declarations of s and d, the tail
 * of the source conversion expression and the end of the FIXME comment are
 * absent from this listing; confirm against the complete file.
 */
2550 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2552 int width = info->d_width;
2553 int height = info->d_height;
/* The source skip is divided by 4 (>> 2) for the Uint32 pointer and the
   destination skip by 2 (>> 1) for the Uint16 pointer; presumably both skips
   are byte counts -- TODO confirm. */
2554 Uint32 *srcp = (Uint32 *)info->s_pixels;
2555 int srcskip = info->s_skip >> 2;
2556 Uint16 *dstp = (Uint16 *)info->d_pixels;
2557 int dstskip = info->d_skip >> 1;
2562 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2563 /* FIXME: Here we special-case opaque alpha since the
2564 compositioning used (>>8 instead of /255) doesn't handle
2565 it correctly. Also special-case alpha=0 for speed?
2568 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2569 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2573 * convert source and destination to G0RAB65565
2574 * and blend all components at the same time
2576 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2578 d = (d | d << 16) & 0x07e0f81f;
2579 d += (s - d) * alpha >> 5;
2581 *dstp = (Uint16)(d | d >> 16);
2592 /* fast ARGB8888->RGB555 blending with pixel alpha
 *
 * Reads 32-bit source pixels and 16-bit destination pixels through the
 * pointers taken from info.  The alpha of each source pixel is its top five
 * bits (s >> 27).  When that value equals SDL_ALPHA_OPAQUE >> 3 the source
 * is packed straight to 5-5-5 with
 * (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f).
 * The remaining visible statements move the source and the destination into
 * the split layout selected by mask 0x03e07c1f (green in the high 16 bits),
 * blend with d += (s - d) * alpha >> 5 and fold the result back to 16 bits.
 * The in-body remark that mentions "G0RAB65565" does not match that mask,
 * which has five bits per colour channel.
 *
 * NOTE(review): braces, loop headers, the declarations of s, d and alpha,
 * the tail of the source conversion expression and the end of the FIXME
 * comment are absent from this listing; confirm against the complete file.
 */
2593 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2595 int width = info->d_width;
2596 int height = info->d_height;
/* The source skip is divided by 4 (>> 2) for the Uint32 pointer and the
   destination skip by 2 (>> 1) for the Uint16 pointer; presumably both skips
   are byte counts -- TODO confirm. */
2597 Uint32 *srcp = (Uint32 *)info->s_pixels;
2598 int srcskip = info->s_skip >> 2;
2599 Uint16 *dstp = (Uint16 *)info->d_pixels;
2600 int dstskip = info->d_skip >> 1;
2606 alpha = s >> 27; /* downscale alpha to 5 bits */
2607 /* FIXME: Here we special-case opaque alpha since the
2608 compositioning used (>>8 instead of /255) doesn't handle
2609 it correctly. Also special-case alpha=0 for speed?
2612 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2613 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2617 * convert source and destination to G0RAB65565
2618 * and blend all components at the same time
2620 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2622 d = (d | d << 16) & 0x03e07c1f;
2623 d += (s - d) * alpha >> 5;
2625 *dstp = (Uint16)(d | d >> 16);
2636 /* General (slow) N->N blending with per-surface alpha
 *
 * Format-independent path: each pixel is taken apart with DISEMBLE_RGB
 * according to the source and destination formats, blended with ALPHA_BLEND
 * using the per-surface alpha, and written back with ASSEMBLE_RGBA.
 *
 * NOTE(review): the inner loop header, the local declarations used by the
 * macros and the pointer advances are absent from this listing.
 */
2637 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2639 int width = info->d_width;
2640 int height = info->d_height;
/* Byte pointers and unscaled skips: pixel size varies with the format. */
2641 Uint8 *src = info->s_pixels;
2642 int srcskip = info->s_skip;
2643 Uint8 *dst = info->d_pixels;
2644 int dstskip = info->d_skip;
2645 SDL_PixelFormat *srcfmt = info->src;
2646 SDL_PixelFormat *dstfmt = info->dst;
2647 int srcbpp = srcfmt->BytesPerPixel;
2648 int dstbpp = dstfmt->BytesPerPixel;
/* sA is the blend factor for every pixel.  dA is the alpha value stored in
   the destination: opaque when the destination format has an alpha mask,
   otherwise 0. */
2649 unsigned sA = srcfmt->alpha;
2650 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2653 while ( height-- ) {
/* Pixel is reused as scratch by both DISEMBLE_RGB calls. */
2663 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2664 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2665 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2666 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2677 /* General (slow) colorkeyed N->N blending with per-surface alpha
 *
 * Blends source pixels into the destination with the per-surface alpha,
 * leaving the destination untouched where the source pixel equals the
 * colorkey (srcfmt->colorkey) or where the alpha is zero.  Two paths are
 * visible: a 16-bit path taken when both formats are 2 bytes per pixel with
 * green mask 0x7e0, and a format-independent path built on the pixel
 * macros.
 *
 * NOTE(review): inner loop headers, several declarations, pointer advances
 * and the brace/else structure between the two paths are absent from this
 * listing; confirm against the complete file.
 */
2678 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2680 int width = info->d_width;
2681 int height = info->d_height;
2682 Uint8 *src = info->s_pixels;
2683 int srcskip = info->s_skip;
2684 Uint8 *dst = info->d_pixels;
2685 int dstskip = info->d_skip;
2686 SDL_PixelFormat *srcfmt = info->src;
2687 SDL_PixelFormat *dstfmt = info->dst;
2688 Uint32 ckey = srcfmt->colorkey;
2689 int srcbpp = srcfmt->BytesPerPixel;
2690 int dstbpp = dstfmt->BytesPerPixel;
/* sA is the blend factor for every pixel.  dA is the alpha value stored in
   the destination: opaque when the destination format has an alpha mask,
   otherwise 0. */
2691 unsigned sA = srcfmt->alpha;
2692 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
/* 16-bit path: both sides 2 bytes per pixel with green mask 0x7e0. */
2694 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2695 Uint16 *src16 = (Uint16 *)src;
2696 Uint16 *dst16 = (Uint16 *)dst;
2697 sA >>= 3; /* downscale alpha to 5 bits */
2698 while ( height-- ) {
/* Blend only when the 5-bit alpha is non-zero and the pixel is not the
   colorkey. */
2704 if(sA && s != ckey) {
/* Green goes to the high 16 bits (mask 0x07e0f81f) so that one multiply
   blends all three channels; the result is folded back with d | d >> 16. */
2706 s = (s | s << 16) & 0x07e0f81f;
2707 d = (d | d << 16) & 0x07e0f81f;
2708 d += (s - d) * sA >> 5;
2710 *dst16 = (Uint16)(d | d >> 16);
/* The skips are halved here because they are applied to Uint16 pointers. */
2716 src16 += srcskip / 2;
2717 dst16 += dstskip / 2;
/* Format-independent path. */
2722 while ( height-- ) {
2732 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
/* The colorkey is compared against the raw source pixel value. */
2733 if(sA && Pixel != ckey) {
2734 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
/* Pixel is reused as scratch for the destination pixel from here on. */
2735 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2736 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2737 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2748 /* General (slow) N->N blending with pixel alpha
 *
 * Format-independent path: each source and destination pixel is taken apart
 * with DISEMBLE_RGBA, the colour channels are blended with ALPHA_BLEND using
 * the alpha of the source pixel, and the result is written back with
 * ASSEMBLE_RGBA together with the destination alpha that was read.
 *
 * NOTE(review): the inner loop header, the local declarations used by the
 * macros and the pointer advances are absent from this listing.
 */
2749 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2751 int width = info->d_width;
2752 int height = info->d_height;
/* Byte pointers and unscaled skips: pixel size varies with the format. */
2753 Uint8 *src = info->s_pixels;
2754 int srcskip = info->s_skip;
2755 Uint8 *dst = info->d_pixels;
2756 int dstskip = info->d_skip;
2757 SDL_PixelFormat *srcfmt = info->src;
2758 SDL_PixelFormat *dstfmt = info->dst;
2763 /* Set up some basic variables */
2764 srcbpp = srcfmt->BytesPerPixel;
2765 dstbpp = dstfmt->BytesPerPixel;
2767 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2768 quite right. for <8bpp source alpha, it gets them very wrong
2770 It is unclear whether there is a good general solution that doesn't
2771 need a branch (or a divide). */
2772 while ( height-- ) {
/* sA comes from the source pixel itself; Pixel is reused as scratch by
   both DISEMBLE_RGBA calls. */
2784 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2786 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2787 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2788 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/* Select the alpha-blending blit routine for a surface.
 *
 * sf is the pixel format of the surface and df the format of the surface it
 * is mapped to (surface->map->dst).  When the source format has no alpha
 * mask (sf->Amask == 0) a per-surface-alpha routine is returned, with
 * colorkeyed variants when SDL_SRCCOLORKEY is set in surface->flags.  The
 * routines listed after the "Per-pixel alpha blits" comment are the
 * per-pixel-alpha ones.  Inside each group the choice depends on
 * df->BytesPerPixel and on how the channel masks and shifts of the two
 * formats relate; the general BlitNtoN* routines are the fallbacks.
 *
 * blit_index is not referenced in the visible lines.
 *
 * NOTE(review): case labels, #endif lines, braces and most of the
 * preprocessor and CPU-feature guards around the MMX and NEON returns are
 * absent from this listing, so which returns are conditional has to be
 * confirmed against the complete file.
 */
2800 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2802 SDL_PixelFormat *sf = surface->format;
2803 SDL_PixelFormat *df = surface->map->dst->format;
/* Source without an alpha channel: per-surface alpha. */
2805 if(sf->Amask == 0) {
2806 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
/* Colorkeyed source; one-byte destinations have their own routine. */
2807 if(df->BytesPerPixel == 1)
2808 return BlitNto1SurfaceAlphaKey;
/* AltiVec routine: 32-bit to 32-bit, destination not a hardware surface. */
2810 #if SDL_ALTIVEC_BLITTERS
2811 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2812 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2813 return Blit32to32SurfaceAlphaKeyAltivec;
2816 return BlitNtoNSurfaceAlphaKey;
2818 /* Per-surface alpha blits */
2819 switch(df->BytesPerPixel) {
2821 return BlitNto1SurfaceAlpha;
/* 16-bit routines are used only when surface->map->identity is set
   (presumably source and destination share one format -- TODO confirm).
   Green mask 0x7e0 selects the 565 routines and 0x3e0 the 555 routines;
   the test that picks the MMX variant over the plain one is not visible. */
2824 if(surface->map->identity) {
2825 if(df->Gmask == 0x7e0)
2829 return Blit565to565SurfaceAlphaMMX;
2832 return Blit565to565SurfaceAlpha;
2834 else if(df->Gmask == 0x3e0)
2838 return Blit555to555SurfaceAlphaMMX;
2841 return Blit555to555SurfaceAlpha;
2844 return BlitNtoNSurfaceAlpha;
/* 32-bit source whose colour masks equal those of the destination. */
2847 if(sf->Rmask == df->Rmask
2848 && sf->Gmask == df->Gmask
2849 && sf->Bmask == df->Bmask
2850 && sf->BytesPerPixel == 4)
/* The MMX and NEON routines require every colour shift to be a multiple
   of 8. */
2853 if(sf->Rshift % 8 == 0
2854 && sf->Gshift % 8 == 0
2855 && sf->Bshift % 8 == 0
2857 return BlitRGBtoRGBSurfaceAlphaMMX;
2860 if(sf->Rshift % 8 == 0
2861 && sf->Gshift % 8 == 0
2862 && sf->Bshift % 8 == 0)
2864 return BlitARGBtoXRGBalphaS_neon;
/* The three colour masks together cover exactly the low 24 bits. */
2867 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2869 #if SDL_ALTIVEC_BLITTERS
2870 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2871 && SDL_HasAltiVec())
2872 return BlitRGBtoRGBSurfaceAlphaAltivec;
2874 return BlitRGBtoRGBSurfaceAlpha;
/* Same green mask but red and blue masks exchanged between the formats. */
2878 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2879 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2881 return BlitABGRtoXRGBalphaS_neon;
2884 #if SDL_ALTIVEC_BLITTERS
2885 if((sf->BytesPerPixel == 4) &&
2886 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2887 return Blit32to32SurfaceAlphaAltivec;
2890 return BlitNtoNSurfaceAlpha;
2894 return BlitNtoNSurfaceAlpha;
2898 /* Per-pixel alpha blits */
2899 switch(df->BytesPerPixel) {
2901 return BlitNto1PixelAlpha;
/* AltiVec routine for a 32-bit source and a destination with green mask
   0x7e0 and blue mask 0x1f. */
2904 #if SDL_ALTIVEC_BLITTERS
2905 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2906 df->Gmask == 0x7e0 &&
2907 df->Bmask == 0x1f && SDL_HasAltiVec())
2908 return Blit32to565PixelAlphaAltivec;
/* NEON routines: 32-bit source with alpha in the top byte and green in
   bits 8-15, destination with green mask 0x7e0.  The ARGB routine is chosen
   when the source blue or red mask shifted right by 3 equals the matching
   destination mask. */
2912 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2913 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2914 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2915 return BlitARGBtoRGB565alpha_neon;
2917 return BlitABGRtoRGB565alpha_neon;
/* Scalar routines: same source layout requirements, plus the channel in the
   low source byte (red or blue) must also be the low 5 bits of the
   destination. */
2921 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2922 && sf->Gmask == 0xff00
2923 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2924 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2925 if(df->Gmask == 0x7e0)
2926 return BlitARGBto565PixelAlpha;
2927 else if(df->Gmask == 0x3e0)
2928 return BlitARGBto555PixelAlpha;
2930 return BlitNtoNPixelAlpha;
/* 32-bit source whose colour masks equal those of the destination. */
2933 if(sf->Rmask == df->Rmask
2934 && sf->Gmask == df->Gmask
2935 && sf->Bmask == df->Bmask
2936 && sf->BytesPerPixel == 4)
/* The MMX and NEON routines require every shift, alpha included, to be a
   multiple of 8. */
2939 if(sf->Rshift % 8 == 0
2940 && sf->Gshift % 8 == 0
2941 && sf->Bshift % 8 == 0
2942 && sf->Ashift % 8 == 0
2946 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2948 return BlitRGBtoRGBPixelAlphaMMX;
2952 if(sf->Rshift % 8 == 0
2953 && sf->Gshift % 8 == 0
2954 && sf->Bshift % 8 == 0
2955 && sf->Ashift % 8 == 0)
2957 return BlitARGBtoXRGBalpha_neon;
/* Alpha in the top byte of the source pixel. */
2960 if(sf->Amask == 0xff000000)
2962 #if SDL_ALTIVEC_BLITTERS
2963 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2964 && SDL_HasAltiVec())
2965 return BlitRGBtoRGBPixelAlphaAltivec;
2967 return BlitRGBtoRGBPixelAlpha;
/* Same green mask but red and blue masks exchanged between the formats,
   alpha in the top byte. */
2971 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2972 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2973 && sf->Amask == 0xff000000)
2975 return BlitABGRtoXRGBalpha_neon;
2978 #if SDL_ALTIVEC_BLITTERS
2979 if (sf->Amask && sf->BytesPerPixel == 4 &&
2980 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2981 return Blit32to32PixelAlphaAltivec;
2984 return BlitNtoNPixelAlpha;
2988 return BlitNtoNPixelAlpha;