2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "SDL_config.h"
24 #include "SDL_video.h"
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
/* Decide which MMX blitter implementation (if any) is compiled in.
 * Everything here is gated on SDL_ASSEMBLY_ROUTINES. */
33 #if SDL_ASSEMBLY_ROUTINES
/* GCC targeting x86 or x86-64: enable the GCC_ASMBLIT variants. */
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
/* MSVC targeting 32-bit x86 (_M_IX86). */
37 # elif defined(_MSC_VER) && defined(_M_IX86)
/* _MSC_VER <= 1200 (VC6 and older): mmintrin.h is only assumed to exist
 * when _mm_free is defined. */
38 # if (_MSC_VER <= 1200)
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
51 #endif /* SDL_ASSEMBLY_ROUTINES */
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
62 /* Functions to perform alpha blended blitting */
66 /* NEON optimized blitter callers */
/*
 * make_neon_caller(name, neon_name)
 *
 * Declares the external row routine neon_name(dst, src, count) and defines a
 * static blitter `name` taking an SDL_BlitInfo. The blitter calls neon_name
 * once per row with the row width as count, then advances both pixel
 * pointers by width * 4 bytes plus the matching skip value.
 */
67 #define make_neon_caller(name, neon_name) \
68 extern void neon_name(void *dst, const void *src, int count); \
69 static void name(SDL_BlitInfo *info) \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
75 int srcskip = info->s_skip; \
76 int dstskip = info->d_skip; \
78 while ( height-- ) { \
79 neon_name(dst, src, width); \
80 src += width * 4 + srcskip; \
81 dst += width * 4 + dstskip; \
/*
 * make_neon_callerS(name, neon_name)
 *
 * Same shape as make_neon_caller, but the external row routine takes a
 * fourth argument: the per-surface alpha read from info->src->alpha, which is
 * passed unchanged for every row.
 */
85 #define make_neon_callerS(name, neon_name) \
86 extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
87 static void name(SDL_BlitInfo *info) \
89 int width = info->d_width; \
90 int height = info->d_height; \
91 Uint8 *src = info->s_pixels; \
92 Uint8 *dst = info->d_pixels; \
93 int srcskip = info->s_skip; \
94 int dstskip = info->d_skip; \
95 unsigned alpha = info->src->alpha;\
97 while ( height-- ) { \
98 neon_name(dst, src, width, alpha); \
99 src += width * 4 + srcskip; \
100 dst += width * 4 + dstskip; \
/* Instantiate the NEON wrappers: two without a surface alpha argument and
 * two (the ...S variants) that forward the per-surface alpha. */
104 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
105 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
106 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
107 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
109 #endif /* __ARM_NEON__ */
111 /* N->1 blending with per-surface alpha.
 *
 * Blends source pixels of srcbpp bytes each onto an 8-bit palettized
 * destination, using the single alpha value stored in the source format.
 * info supplies the pixel pointers, blit dimensions, skip values, both pixel
 * formats and an optional palette mapping table (info->table).
 */
112 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
114 int width = info->d_width;
115 int height = info->d_height;
116 Uint8 *src = info->s_pixels;
117 int srcskip = info->s_skip;
118 Uint8 *dst = info->d_pixels;
119 int dstskip = info->d_skip;
120 Uint8 *palmap = info->table;
121 SDL_PixelFormat *srcfmt = info->src;
122 SDL_PixelFormat *dstfmt = info->dst;
123 int srcbpp = srcfmt->BytesPerPixel;
/* One alpha value applied to every pixel of the surface. */
125 const unsigned A = srcfmt->alpha;
/* Split the source pixel into its R, G, B components. */
137 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* The destination byte is a palette index; fetch its current colour. */
138 dR = dstfmt->palette->colors[*dst].r;
139 dG = dstfmt->palette->colors[*dst].g;
140 dB = dstfmt->palette->colors[*dst].b;
141 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
145 /* Pack RGB into 8bit pixel */
/* Without a mapping table the packed value is stored directly; otherwise it
 * is used as an index into palmap. The top three bits of red land in bits
 * 7..5 of the packed value. */
146 if ( palmap == NULL ) {
147 *dst =((dR>>5)<<(3+2))|
151 *dst = palmap[((dR>>5)<<(3+2))|
164 /* N->1 blending with pixel alpha.
 *
 * Blends source pixels of srcbpp bytes each onto an 8-bit palettized
 * destination, using the alpha component carried by each source pixel.
 * info supplies the pixel pointers, blit dimensions, skip values, both pixel
 * formats and an optional palette mapping table (info->table).
 */
165 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
167 int width = info->d_width;
168 int height = info->d_height;
169 Uint8 *src = info->s_pixels;
170 int srcskip = info->s_skip;
171 Uint8 *dst = info->d_pixels;
172 int dstskip = info->d_skip;
173 Uint8 *palmap = info->table;
174 SDL_PixelFormat *srcfmt = info->src;
175 SDL_PixelFormat *dstfmt = info->dst;
176 int srcbpp = srcfmt->BytesPerPixel;
178 /* FIXME: fix alpha bit field expansion here too? */
/* Split the source pixel into R, G, B and its own alpha sA. */
190 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
/* The destination byte is a palette index; fetch its current colour. */
191 dR = dstfmt->palette->colors[*dst].r;
192 dG = dstfmt->palette->colors[*dst].g;
193 dB = dstfmt->palette->colors[*dst].b;
194 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
198 /* Pack RGB into 8bit pixel */
/* Without a mapping table the packed value is stored directly; otherwise it
 * is used as an index into palmap. */
199 if ( palmap == NULL ) {
200 *dst =((dR>>5)<<(3+2))|
204 *dst = palmap[((dR>>5)<<(3+2))|
217 /* colorkeyed N->1 blending with per-surface alpha.
 *
 * Like the plain per-surface-alpha N->1 blit, but a source pixel whose raw
 * value equals the source colour key (srcfmt->colorkey) is not blended.
 * info supplies the pixel pointers, blit dimensions, skip values, both pixel
 * formats and an optional palette mapping table (info->table).
 */
218 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
220 int width = info->d_width;
221 int height = info->d_height;
222 Uint8 *src = info->s_pixels;
223 int srcskip = info->s_skip;
224 Uint8 *dst = info->d_pixels;
225 int dstskip = info->d_skip;
226 Uint8 *palmap = info->table;
227 SDL_PixelFormat *srcfmt = info->src;
228 SDL_PixelFormat *dstfmt = info->dst;
229 int srcbpp = srcfmt->BytesPerPixel;
230 Uint32 ckey = srcfmt->colorkey;
/* NOTE(review): A is a signed int here, while BlitNto1SurfaceAlpha declares
 * the same value as unsigned - confirm the difference is intentional. */
232 const int A = srcfmt->alpha;
244 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* Only pixels that differ from the colour key are blended. */
245 if ( Pixel != ckey ) {
246 dR = dstfmt->palette->colors[*dst].r;
247 dG = dstfmt->palette->colors[*dst].g;
248 dB = dstfmt->palette->colors[*dst].b;
249 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
253 /* Pack RGB into 8bit pixel */
/* Without a mapping table the packed value is stored directly; otherwise it
 * is used as an index into palmap. */
254 if ( palmap == NULL ) {
255 *dst =((dR>>5)<<(3+2))|
259 *dst = palmap[((dR>>5)<<(3+2))|
274 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 *
 * GCC MMX variant. With alpha fixed at 128 the blend is a per-channel
 * average of source and destination; the destination alpha mask is OR-ed
 * into every written pixel.
 */
275 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
277 int width = info->d_width;
278 int height = info->d_height;
/* Pixels are accessed as 32-bit words, so the skip values are divided by 4
 * (presumably they are byte counts - confirm against the caller). */
279 Uint32 *srcp = (Uint32 *)info->s_pixels;
280 int srcskip = info->s_skip >> 2;
281 Uint32 *dstp = (Uint32 *)info->d_pixels;
282 int dstskip = info->d_skip >> 2;
283 Uint32 dalpha = info->dst->Amask;
286 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
287 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
288 load = 0x0001010100010101ULL;/* !alpha128 mask */
289 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
290 movd_m2r(dalpha, mm7); /* dst alpha mask */
291 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
/* Scalar single-pixel form: halve the sum of the upper seven bits of each
 * channel, add back the low bit when it is set in both s and d, then OR in
 * the destination alpha mask. */
297 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
298 + (s & d & 0x00010101)) | dalpha;
/* MMX form of the same computation on two pixels at once. */
300 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
301 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
303 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
304 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
306 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
307 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
308 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
309 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
310 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
311 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
312 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
314 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
315 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
325 /* fast RGB888->(A)RGB888 blending with surface alpha.
 *
 * GCC MMX variant. Dispatches to the alpha=128 routine when that special
 * case applies; otherwise computes dst + (((src - dst) * alpha) >> 8) on
 * channels widened to 16 bits, one or two pixels at a time, and ORs the
 * destination alpha mask into the result.
 */
326 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
328 SDL_PixelFormat* df = info->dst;
329 unsigned alpha = info->src->alpha;
331 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
332 /* only call a128 version when R,G,B occupy lower bits */
333 BlitRGBtoRGBSurfaceAlpha128MMX(info);
335 int width = info->d_width;
336 int height = info->d_height;
337 Uint32 *srcp = (Uint32 *)info->s_pixels;
338 int srcskip = info->s_skip >> 2;
339 Uint32 *dstp = (Uint32 *)info->d_pixels;
340 int dstskip = info->d_skip >> 2;
342 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
343 /* form the alpha mult */
344 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
345 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
346 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
/* The alpha variable is reused here to hold the destination R|G|B channel
 * mask; the multiplier itself is already in mm4. */
347 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
348 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
349 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
350 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
351 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
352 movd_m2r(df->Amask, mm7); /* dst alpha mask */
353 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
357 /* One Pixel Blend */
358 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
359 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
360 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
361 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
363 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
364 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
365 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
366 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
368 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
369 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
370 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
374 /* Two Pixels Blend */
375 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
376 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
377 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
378 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
380 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
381 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
382 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
383 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
385 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
386 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
387 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
388 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
390 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
391 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
392 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
393 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
395 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
396 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
398 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
410 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
 *
 * GCC MMX variant. The alpha of each source pixel is isolated with the
 * source Amask. A fully opaque pixel has its colour channels copied while
 * the destination keeps its own alpha byte; other pixels are blended as
 * dst + (((src - dst) * alpha) >> 8) on channels widened to 16 bits, with
 * the alpha lane of the multiplier cleared so destination alpha survives.
 */
411 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
413 int width = info->d_width;
414 int height = info->d_height;
415 Uint32 *srcp = (Uint32 *)info->s_pixels;
416 int srcskip = info->s_skip >> 2;
417 Uint32 *dstp = (Uint32 *)info->d_pixels;
418 int dstskip = info->d_skip >> 2;
419 SDL_PixelFormat* sf = info->src;
420 Uint32 amask = sf->Amask;
422 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
423 /* form multiplication mask */
424 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
425 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
426 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
427 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
428 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
429 /* form channel masks */
430 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
431 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
432 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
433 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
434 /* get alpha channel shift */
435 __asm__ __volatile__ (
437 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
/* Alpha bits of the current source pixel, still in their in-pixel position. */
441 Uint32 alpha = *srcp & amask;
442 /* FIXME: Here we special-case opaque alpha since the
443 compositioning used (>>8 instead of /255) doesn't handle
444 it correctly. Also special-case alpha=0 for speed?
448 } else if(alpha == amask) {
449 /* opaque alpha -- copy RGB, keep dst alpha */
450 /* using MMX here to free up regular registers for other things */
451 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
452 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
453 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
454 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
455 por_r2r(mm1, mm2); /* src | dst -> mm2 */
456 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
/* General case: widen both pixels to 16-bit channels and blend. */
458 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
459 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
461 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
462 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
464 __asm__ __volatile__ (
466 : : "r" (alpha) ); /* 0000A000 -> mm4 */
467 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
468 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
469 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
470 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
473 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
474 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
475 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
476 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
478 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
479 movd_r2m(mm2, *dstp);/* mm2 -> dst */
489 /* End GCC_ASMBLIT */
492 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 *
 * MMX-intrinsics variant. With alpha fixed at 128 the blend is a per-channel
 * average of source and destination; the destination alpha mask is OR-ed
 * into every written pixel.
 */
493 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
495 int width = info->d_width;
496 int height = info->d_height;
497 Uint32 *srcp = (Uint32 *)info->s_pixels;
498 int srcskip = info->s_skip >> 2;
499 Uint32 *dstp = (Uint32 *)info->d_pixels;
500 int dstskip = info->d_skip >> 2;
501 Uint32 dalpha = info->dst->Amask;
503 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
505 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
506 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
507 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
/* Scalar single-pixel form: halve the sum of the upper seven bits of each
 * channel, add back the low bit when it is set in both s and d, then OR in
 * the destination alpha mask. */
514 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
515 + (s & d & 0x00010101)) | dalpha;
/* Remaining pixels are processed in pairs (n is halved). */
519 for (n >>= 1; n > 0; --n) {
520 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
521 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
523 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
524 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
526 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
527 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
528 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
529 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
531 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
532 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
533 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
534 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
536 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
547 /* fast RGB888->(A)RGB888 blending with surface alpha.
 *
 * MMX-intrinsics variant. Dispatches to the alpha=128 routine when that
 * special case applies; otherwise computes
 * dst + (((src - dst) * alpha) >> 8) on channels widened to 16 bits, one or
 * two pixels at a time, and ORs the destination alpha mask into the result.
 */
548 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
550 SDL_PixelFormat* df = info->dst;
551 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
552 unsigned alpha = info->src->alpha;
/* NOTE(review): the test below recomputes the same Rmask|Gmask|Bmask value
 * that chanmask already holds at this point. */
554 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
555 /* only call a128 version when R,G,B occupy lower bits */
556 BlitRGBtoRGBSurfaceAlpha128MMX(info);
558 int width = info->d_width;
559 int height = info->d_height;
560 Uint32 *srcp = (Uint32 *)info->s_pixels;
561 int srcskip = info->s_skip >> 2;
562 Uint32 *dstp = (Uint32 *)info->d_pixels;
563 int dstskip = info->d_skip >> 2;
564 Uint32 dalpha = df->Amask;
567 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
569 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
570 /* form the alpha mult */
/* Replicate the 8-bit alpha into all four bytes of amult. */
571 amult = alpha | (alpha << 8);
572 amult = amult | (amult << 16);
/* chanmask is rebuilt from the channel shifts; ANDing it with amult drops
 * the alpha byte from the multiplier. */
573 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
574 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
575 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
576 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
577 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
582 /* One Pixel Blend */
583 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
584 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
586 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
587 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
589 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
590 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
591 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
592 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
594 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
595 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
596 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
/* Remaining pixels are processed in pairs (n is halved). */
604 for (n >>= 1; n > 0; --n) {
605 /* Two Pixels Blend */
606 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
607 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
608 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
609 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
611 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
612 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
613 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
614 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
616 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
617 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
618 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
619 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
621 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
622 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
623 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
624 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
626 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
627 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
629 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
641 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
 *
 * MMX-intrinsics variant. The alpha of each source pixel is isolated with
 * the source Amask. A fully opaque pixel has its colour channels copied
 * while the destination keeps its non-colour bits; other pixels are blended
 * as dst + (((src - dst) * alpha) >> 8) on channels widened to 16 bits.
 */
642 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
644 int width = info->d_width;
645 int height = info->d_height;
646 Uint32 *srcp = (Uint32 *)info->s_pixels;
647 int srcskip = info->s_skip >> 2;
648 Uint32 *dstp = (Uint32 *)info->d_pixels;
649 int dstskip = info->d_skip >> 2;
650 SDL_PixelFormat* sf = info->src;
651 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
652 Uint32 amask = sf->Amask;
653 Uint32 ashift = sf->Ashift;
656 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
658 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* Each 8-bit channel becomes a 16-bit lane after unpacking, so the alpha
 * lane sits at bit offset ashift * 2; multmask has that lane cleared. */
659 multmask = ~(0xFFFFi64 << (ashift * 2));
660 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
/* Alpha bits of the current source pixel, still in their in-pixel position. */
664 Uint32 alpha = *srcp & amask;
667 } else if (alpha == amask) {
668 /* opaque alpha -- copy RGB, keep dst alpha */
669 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
/* General case: widen both pixels to 16-bit channels and blend. */
671 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
672 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
674 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
675 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
677 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
678 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
679 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
680 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
681 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
684 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
685 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
686 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
687 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
688 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
690 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
700 /* End MSVC_ASMBLIT */
702 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
/* AltiVec blitters: everything below is compiled only when
 * SDL_ALTIVEC_BLITTERS is set. */
704 #if SDL_ALTIVEC_BLITTERS
706 #pragma altivec_model on
/* Vector literal helpers. Mac OS X with GCC older than 4 gets the
 * parenthesized spelling. */
713 #if (defined(__MACOSX__) && (__GNUC__ < 4))
714 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
715 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
716 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
717 (vector unsigned short) ( a,b,c,d,e,f,g,h )
/* Brace-initializer spelling of the same literals (presumably the
 * alternative branch of the test above - confirm). */
719 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
720 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
721 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
722 (vector unsigned short) { a,b,c,d,e,f,g,h }
/*
 * Nonzero when pointer x is not 16-byte aligned; the value is the low four
 * bits of the address. The argument is parenthesized so that an expression
 * argument such as `p + 1` is evaluated as pointer arithmetic before the
 * cast to size_t, instead of casting `p` first and adding 1 as an integer.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Helper macros for the AltiVec blitters: VECPRINT prints a vector as four
 * 32-bit hex words with printf; the VEC_* macros build permute vectors and
 * masks; VEC_MULTIPLY_ALPHA blends vs into vd using the alpha vector. */
726 #define VECPRINT(msg, v) do { \
727 vector unsigned int tmpvec = (vector unsigned int)(v); \
728 unsigned int *vp = (unsigned int *)&tmpvec; \
729 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
732 /* the permutation vector that takes the high bytes out of all the appropriate shorts
733 (vector unsigned char)(
734 0x00, 0x10, 0x02, 0x12,
735 0x04, 0x14, 0x06, 0x16,
736 0x08, 0x18, 0x0A, 0x1A,
737 0x0C, 0x1C, 0x0E, 0x1E );
739 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
740 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
741 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
742 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
744 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
747 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
748 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
749 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
750 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
751 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
752 /* valpha2 is 255-alpha */ \
753 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
754 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
755 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
756 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
757 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
758 /* add source and dest */ \
759 vtemp1 = vec_add(vtemp1, vtemp3); \
760 vtemp2 = vec_add(vtemp2, vtemp4); \
761 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
762 vtemp1 = vec_add(vtemp1, v1_16); \
763 vtemp3 = vec_sr(vtemp1, v8_16); \
764 vtemp1 = vec_add(vtemp1, vtemp3); \
765 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
766 vtemp2 = vec_add(vtemp2, v1_16); \
767 vtemp4 = vec_sr(vtemp2, v8_16); \
768 vtemp2 = vec_add(vtemp2, vtemp4); \
769 /* (>>8) and get ARGBARGBARGBARGB */ \
770 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
773 /* Calculate the permute vector used for 32->32 swizzling.
 *
 * srcfmt / dstfmt describe the source and destination 32-bit pixel layouts.
 * Each is replaced by a built-in default format (masks 0x00FF0000,
 * 0x0000FF00, 0x000000FF, 0xFF000000) in some case not visible here -
 * callers in this file pass NULL for one of them, presumably to select it.
 * The result is a byte-permute vector covering four pixels.
 */
774 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
775 const SDL_PixelFormat *dstfmt)
778 * We have to assume that the bits that aren't used by other
779 * colors is alpha, and it's one complete byte, since some formats
780 * leave alpha with a zero mask, but we should still swizzle the bits.
783 const static struct SDL_PixelFormat default_pixel_format = {
787 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
790 srcfmt = &default_pixel_format;
793 dstfmt = &default_pixel_format;
/* Per-pixel byte offsets (0, 4, 8, 12) added to the one-pixel selector so
 * that the same channel order is applied to each of the four pixels. */
795 const vector unsigned char plus = VECUINT8_LITERAL
796 ( 0x00, 0x00, 0x00, 0x00,
797 0x04, 0x04, 0x04, 0x04,
798 0x08, 0x08, 0x08, 0x08,
799 0x0C, 0x0C, 0x0C, 0x0C );
800 vector unsigned char vswiz;
801 vector unsigned int srcvec;
/* RESHIFT maps a channel bit shift X to the byte index 3 - X/8. Each source
 * byte index is then moved to the bit position of the same channel in the
 * destination format. */
802 #define RESHIFT(X) (3 - ((X) >> 3))
803 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
804 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
805 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
807 /* Use zero for alpha if either surface doesn't have alpha */
/* Selector 0x10 is used when the source has no alpha mask; in vec_perm,
 * indices from 0x10 up address the second operand (see the callers). */
809 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
811 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
/* Store the 4-byte selector in element 0, replicate it to all four
 * elements, and add the per-pixel offsets. */
814 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
815 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
/*
 * Alpha-blends 32-bit source pixels (per-pixel alpha) onto a 16-bit 5-6-5
 * destination. Leading pixels are blended one at a time with the scalar
 * ONE_PIXEL_BLEND loop while dst is not 16-byte aligned, the bulk is handled
 * eight pixels at a time with AltiVec, and the remaining width % 8 pixels
 * are again blended one at a time.
 */
819 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
821 int height = info->d_height;
822 Uint8 *src = (Uint8 *)info->s_pixels;
823 int srcskip = info->s_skip;
824 Uint8 *dst = (Uint8 *)info->d_pixels;
825 int dstskip = info->d_skip;
826 SDL_PixelFormat *srcfmt = info->src;
/* Splat constants used as shift counts and masks below. */
828 vector unsigned char v0 = vec_splat_u8(0);
829 vector unsigned short v8_16 = vec_splat_u16(8);
830 vector unsigned short v1_16 = vec_splat_u16(1);
831 vector unsigned short v2_16 = vec_splat_u16(2);
832 vector unsigned short v3_16 = vec_splat_u16(3);
833 vector unsigned int v8_32 = vec_splat_u32(8);
834 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
835 vector unsigned short v3f = VECUINT16_LITERAL(
836 0x003f, 0x003f, 0x003f, 0x003f,
837 0x003f, 0x003f, 0x003f, 0x003f);
838 vector unsigned short vfc = VECUINT16_LITERAL(
839 0x00fc, 0x00fc, 0x00fc, 0x00fc,
840 0x00fc, 0x00fc, 0x00fc, 0x00fc);
843 0x10 - 0x1f is the alpha
844 0x00 - 0x0e evens are the red
845 0x01 - 0x0f odds are zero
847 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
848 0x10, 0x00, 0x01, 0x01,
849 0x10, 0x02, 0x01, 0x01,
850 0x10, 0x04, 0x01, 0x01,
851 0x10, 0x06, 0x01, 0x01
853 vector unsigned char vredalpha2 = (vector unsigned char)(
854 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
857 0x00 - 0x0f is ARxx ARxx ARxx ARxx
858 0x11 - 0x0f odds are blue
860 vector unsigned char vblue1 = VECUINT8_LITERAL(
861 0x00, 0x01, 0x02, 0x11,
862 0x04, 0x05, 0x06, 0x13,
863 0x08, 0x09, 0x0a, 0x15,
864 0x0c, 0x0d, 0x0e, 0x17
866 vector unsigned char vblue2 = (vector unsigned char)(
867 vec_add((vector unsigned int)vblue1, v8_32)
870 0x00 - 0x0f is ARxB ARxB ARxB ARxB
871 0x10 - 0x0e evens are green
873 vector unsigned char vgreen1 = VECUINT8_LITERAL(
874 0x00, 0x01, 0x10, 0x03,
875 0x04, 0x05, 0x12, 0x07,
876 0x08, 0x09, 0x14, 0x0b,
877 0x0c, 0x0d, 0x16, 0x0f
879 vector unsigned char vgreen2 = (vector unsigned char)(
880 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
882 vector unsigned char vgmerge = VECUINT8_LITERAL(
883 0x00, 0x02, 0x00, 0x06,
884 0x00, 0x0a, 0x00, 0x0e,
885 0x00, 0x12, 0x00, 0x16,
886 0x00, 0x1a, 0x00, 0x1e);
887 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
888 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
889 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOTE(review): splatting -7 gives 0xF9 in every byte, so after the shift
 * each halfword appears to hold 0xF900 rather than the 0xF800 the name
 * suggests - confirm whether -8 was intended. */
891 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
892 vf800 = vec_sl(vf800, vec_splat_u16(8));
896 vector unsigned char valigner;
897 vector unsigned char vsrc;
898 vector unsigned char voverflow;
899 int width = info->d_width;
901 #define ONE_PIXEL_BLEND(condition, widthvar) \
902 while (condition) { \
904 unsigned sR, sG, sB, dR, dG, dB, sA; \
905 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
907 unsigned short dstpixel = *((unsigned short *)dst); \
908 dR = (dstpixel >> 8) & 0xf8; \
909 dG = (dstpixel >> 3) & 0xfc; \
910 dB = (dstpixel << 3) & 0xf8; \
911 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
912 *((unsigned short *)dst) = ( \
913 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
/* Scalar blend until dst reaches 16-byte alignment or the row runs out. */
920 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
921 extrawidth = (width % 8);
922 valigner = VEC_ALIGNER(src);
923 vsrc = (vector unsigned char)vec_ld(0, src);
926 vector unsigned char valpha;
927 vector unsigned char vsrc1, vsrc2;
928 vector unsigned char vdst1, vdst2;
929 vector unsigned short vR, vG, vB;
930 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
932 /* Load 8 pixels from src as ARGB */
933 voverflow = (vector unsigned char)vec_ld(15, src);
934 vsrc = vec_perm(vsrc, voverflow, valigner);
935 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
937 vsrc = (vector unsigned char)vec_ld(15, src);
938 voverflow = vec_perm(voverflow, vsrc, valigner);
939 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
942 /* Load 8 pixels from dst as XRGB */
943 voverflow = vec_ld(0, dst);
944 vR = vec_and((vector unsigned short)voverflow, vf800);
945 vB = vec_sl((vector unsigned short)voverflow, v3_16);
946 vG = vec_sl(vB, v2_16);
947 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
948 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
949 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
950 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
951 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
952 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
954 /* Alpha blend 8 pixels as ARGB */
955 valpha = vec_perm(vsrc1, v0, valphaPermute);
956 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
957 valpha = vec_perm(vsrc2, v0, valphaPermute);
958 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
960 /* Convert 8 pixels to 565 */
961 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
962 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
963 vgpixel = vec_and(vgpixel, vfc);
964 vgpixel = vec_sl(vgpixel, v3_16);
965 vrpixel = vec_sl(vpixel, v1_16);
966 vrpixel = vec_and(vrpixel, vf800);
967 vbpixel = vec_and(vpixel, v3f);
968 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
969 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
972 vec_st(vdst1, 0, dst);
/* Scalar blend for the width % 8 pixels left after the vector loop. */
977 ONE_PIXEL_BLEND((extrawidth), extrawidth);
978 #undef ONE_PIXEL_BLEND
/*
 * Blends a 32-bit source onto a 32-bit destination using the per-surface
 * alpha of the source format, leaving the destination untouched wherever
 * the source pixel (masked to its R|G|B bits in the vector path) equals the
 * source colour key. Unaligned leading pixels and the trailing width % 4
 * pixels use the scalar ONE_PIXEL_BLEND loop; the rest is processed four
 * pixels per AltiVec vector.
 */
984 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
986 unsigned alpha = info->src->alpha;
987 int height = info->d_height;
988 Uint32 *srcp = (Uint32 *)info->s_pixels;
989 int srcskip = info->s_skip >> 2;
990 Uint32 *dstp = (Uint32 *)info->d_pixels;
991 int dstskip = info->d_skip >> 2;
992 SDL_PixelFormat *srcfmt = info->src;
993 SDL_PixelFormat *dstfmt = info->dst;
/* NOTE(review): sA reads the same field as alpha above (srcfmt is
 * info->src); the vector path uses alpha, the scalar path uses sA. */
994 unsigned sA = srcfmt->alpha;
995 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
996 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
997 Uint32 ckey = info->src->colorkey;
998 vector unsigned char mergePermute;
999 vector unsigned char vsrcPermute;
1000 vector unsigned char vdstPermute;
1001 vector unsigned char vsdstPermute;
1002 vector unsigned char valpha;
1003 vector unsigned char valphamask;
1004 vector unsigned char vbits;
1005 vector unsigned char v0;
1006 vector unsigned short v1;
1007 vector unsigned short v8;
1008 vector unsigned int vckey;
1009 vector unsigned int vrgbmask;
1011 mergePermute = VEC_MERGE_PERMUTE();
1012 v0 = vec_splat_u8(0);
1013 v1 = vec_splat_u16(1);
1014 v8 = vec_splat_u16(8);
1016 /* set the alpha to 255 on the destination surf */
1017 valphamask = VEC_ALPHA_MASK();
/* Permute vectors: source format to the default layout, default layout to
 * the destination format, and destination format to the default layout. */
1019 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1020 vdstPermute = calc_swizzle32(NULL, dstfmt);
1021 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1023 /* set a vector full of alpha and 255-alpha */
1024 ((unsigned char *)&valpha)[0] = alpha;
1025 valpha = vec_splat(valpha, 0);
1026 vbits = (vector unsigned char)vec_splat_s8(-1);
/* Replicate the colour key and the R|G|B mask into every vector element. */
1029 ((unsigned int *)(char*)&vckey)[0] = ckey;
1030 vckey = vec_splat(vckey, 0);
1031 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1032 vrgbmask = vec_splat(vrgbmask, 0);
1035 int width = info->d_width;
1036 #define ONE_PIXEL_BLEND(condition, widthvar) \
1037 while (condition) { \
1039 unsigned sR, sG, sB, dR, dG, dB; \
1040 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1041 if(sA && Pixel != ckey) { \
1042 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1043 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1044 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1045 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
/* Scalar blend until dstp reaches 16-byte alignment or the row runs out. */
1051 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1053 int extrawidth = (width % 4);
1054 vector unsigned char valigner = VEC_ALIGNER(srcp);
1055 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1056 width -= extrawidth;
1058 vector unsigned char vsel;
1059 vector unsigned char voverflow;
1060 vector unsigned char vd;
1061 vector unsigned char vd_orig;
1064 voverflow = (vector unsigned char)vec_ld(15, srcp);
1065 vs = vec_perm(vs, voverflow, valigner);
1067 /* vsel is set for items that match the key */
1068 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1069 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1071 /* permute to source format */
1072 vs = vec_perm(vs, valpha, vsrcPermute);
/* Keep an unblended copy of the destination so key-matching pixels can be
 * restored after the blend. */
1075 vd = (vector unsigned char)vec_ld(0, dstp);
1076 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1078 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1080 /* set the alpha channel to full on */
1081 vd = vec_or(vd, valphamask);
1083 /* mask out color key */
1084 vd = vec_sel(vd, vd_orig, vsel);
1086 /* permute to dest format */
1087 vd = vec_perm(vd, vbits, vdstPermute);
1090 vec_st((vector unsigned int)vd, 0, dstp);
/* Scalar blend for the width % 4 pixels left after the vector loop. */
1097 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1099 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: blend a 32-bit source onto a 32-bit destination using
 * the alpha value carried by each source pixel.
 *
 * Pixel layouts are converted with the permute vectors returned by
 * calc_swizzle32() (defined outside this excerpt).  For every row the
 * scalar ONE_PIXEL_BLEND macro is run while UNALIGNED_PTR(dstp) holds and
 * again for the trailing (width % 4) pixels; the AltiVec code handles the
 * pixels in between.  The destination alpha is saved before the blend and
 * restored afterwards.
 *
 * info: blit description.  d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip (byte skips, converted to pixels with >> 2) and the
 *       src/dst pixel formats are read here.
 */
1107 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1109 int width = info->d_width;
1110 int height = info->d_height;
1111 Uint32 *srcp = (Uint32 *)info->s_pixels;
1112 int srcskip = info->s_skip >> 2;
1113 Uint32 *dstp = (Uint32 *)info->d_pixels;
1114 int dstskip = info->d_skip >> 2;
1115 SDL_PixelFormat *srcfmt = info->src;
1116 SDL_PixelFormat *dstfmt = info->dst;
1117 vector unsigned char mergePermute;
1118 vector unsigned char valphaPermute;
1119 vector unsigned char vsrcPermute;
1120 vector unsigned char vdstPermute;
1121 vector unsigned char vsdstPermute;
1122 vector unsigned char valphamask;
1123 vector unsigned char vpixelmask;
1124 vector unsigned char v0;
1125 vector unsigned short v1;
1126 vector unsigned short v8;
/* constant vectors handed to VEC_MULTIPLY_ALPHA and used as permute filler */
1128 v0 = vec_splat_u8(0);
1129 v1 = vec_splat_u16(1);
1130 v8 = vec_splat_u16(8);
1131 mergePermute = VEC_MERGE_PERMUTE();
1132 valphamask = VEC_ALPHA_MASK();
/* byte indices 0,0,0,0,4,4,4,4,...: copies the first byte of every 32-bit
   element into all four of its byte lanes (used below to spread alpha) */
1133 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOR with zero: vpixelmask is the bitwise complement of valphamask */
1134 vpixelmask = vec_nor(valphamask, v0);
/* NOTE(review): calc_swizzle32() is not visible here; presumably these map
   source -> working layout, working layout -> destination, and
   destination -> working layout respectively -- confirm */
1135 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1136 vdstPermute = calc_swizzle32(NULL, dstfmt);
1137 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1139 while ( height-- ) {
1140 width = info->d_width;
1141 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1143 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1144 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1146 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1147 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1148 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1154 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* extrawidth pixels are left for the scalar macro after the vector part */
1158 int extrawidth = (width % 4);
1159 vector unsigned char valigner = VEC_ALIGNER(srcp);
1160 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1161 width -= extrawidth;
1163 vector unsigned char voverflow;
1164 vector unsigned char vd;
1165 vector unsigned char valpha;
1166 vector unsigned char vdstalpha;
/* second load plus vec_perm with valigner assembles the source vector
   when srcp is not 16-byte aligned */
1168 voverflow = (vector unsigned char)vec_ld(15, srcp);
1169 vs = vec_perm(vs, voverflow, valigner);
1170 vs = vec_perm(vs, v0, vsrcPermute);
/* per-pixel alpha spread across each pixel's four bytes */
1172 valpha = vec_perm(vs, v0, valphaPermute);
/* load the destination, permute it with vsdstPermute, remember its alpha */
1175 vd = (vector unsigned char)vec_ld(0, dstp);
1176 vd = vec_perm(vd, v0, vsdstPermute);
1177 vdstalpha = vec_and(vd, valphamask);
1179 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1181 /* set the alpha to the dest alpha */
1182 vd = vec_and(vd, vpixelmask);
1183 vd = vec_or(vd, vdstalpha);
1184 vd = vec_perm(vd, v0, vdstPermute);
1187 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar blend for the pixels the vector part did not cover */
1195 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1199 #undef ONE_PIXEL_BLEND
/*
 * AltiVec variant of the per-pixel-alpha blend for pixels whose alpha is in
 * the top byte (s >> 24) and whose colour is in the low 24 bits.  No layout
 * permutes are applied here, unlike Blit32to32PixelAlphaAltivec.
 *
 * The scalar ONE_PIXEL_BLEND macro is run while UNALIGNED_PTR(dstp) holds
 * and for the trailing (width % 4) pixels.  In it, a fully opaque source
 * pixel replaces the destination colour and keeps the destination alpha;
 * otherwise the 0xff00ff components and the 0xff00 component are blended
 * separately and the destination alpha (dalpha) is kept.  The vector part
 * likewise restores the destination alpha after VEC_MULTIPLY_ALPHA.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip are read here.
 */
1203 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1204 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1206 int width = info->d_width;
1207 int height = info->d_height;
1208 Uint32 *srcp = (Uint32 *)info->s_pixels;
1209 int srcskip = info->s_skip >> 2;
1210 Uint32 *dstp = (Uint32 *)info->d_pixels;
1211 int dstskip = info->d_skip >> 2;
1212 vector unsigned char mergePermute;
1213 vector unsigned char valphaPermute;
1214 vector unsigned char valphamask;
1215 vector unsigned char vpixelmask;
1216 vector unsigned char v0;
1217 vector unsigned short v1;
1218 vector unsigned short v8;
1219 v0 = vec_splat_u8(0);
1220 v1 = vec_splat_u16(1);
1221 v8 = vec_splat_u16(8);
1222 mergePermute = VEC_MERGE_PERMUTE();
1223 valphamask = VEC_ALPHA_MASK();
/* byte indices 0,0,0,0,4,4,4,4,...: copies the first byte of every 32-bit
   element into all four of its byte lanes (used below to spread alpha) */
1224 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOR with zero: vpixelmask is the bitwise complement of valphamask */
1227 vpixelmask = vec_nor(valphamask, v0);
1229 width = info->d_width;
1230 #define ONE_PIXEL_BLEND(condition, widthvar) \
1231 while ((condition)) { \
1237 Uint32 alpha = s >> 24; \
1239 if(alpha == SDL_ALPHA_OPAQUE) { \
1240 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1243 dalpha = d & 0xff000000; \
1244 s1 = s & 0xff00ff; \
1245 d1 = d & 0xff00ff; \
1246 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1249 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1250 *dstp = d1 | d | dalpha; \
1257 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* extrawidth pixels are left for the scalar macro after the vector part */
1259 int extrawidth = (width % 4);
1260 vector unsigned char valigner = VEC_ALIGNER(srcp);
1261 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1262 width -= extrawidth;
1264 vector unsigned char voverflow;
1265 vector unsigned char vd;
1266 vector unsigned char valpha;
1267 vector unsigned char vdstalpha;
/* second load plus vec_perm with valigner assembles the source vector
   when srcp is not 16-byte aligned */
1269 voverflow = (vector unsigned char)vec_ld(15, srcp);
1270 vs = vec_perm(vs, voverflow, valigner);
/* per-pixel alpha spread across each pixel's four bytes */
1272 valpha = vec_perm(vs, v0, valphaPermute);
/* load the destination and remember its alpha bits */
1275 vd = (vector unsigned char)vec_ld(0, dstp);
1276 vdstalpha = vec_and(vd, valphamask);
1278 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1280 /* set the alpha to the dest alpha */
1281 vd = vec_and(vd, vpixelmask);
1282 vd = vec_or(vd, vdstalpha);
1285 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar blend for the pixels the vector part did not cover */
1292 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1297 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: blend a 32-bit source onto a 32-bit destination using
 * the per-surface alpha (info->src->alpha) for every pixel.
 *
 * Pixel layouts are converted with the permute vectors returned by
 * calc_swizzle32() (defined outside this excerpt).  The scalar
 * ONE_PIXEL_BLEND macro is run while UNALIGNED_PTR(dstp) holds and for the
 * trailing (width % 4) pixels; the AltiVec code handles the pixels in
 * between.  The vector part ORs valphamask into the result; the scalar
 * part stores dA, which is SDL_ALPHA_OPAQUE when the destination format
 * has an alpha mask and 0 otherwise.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and the src/dst pixel formats are read here.
 */
1300 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1303 unsigned alpha = info->src->alpha;
1304 int height = info->d_height;
1305 Uint32 *srcp = (Uint32 *)info->s_pixels;
1306 int srcskip = info->s_skip >> 2;
1307 Uint32 *dstp = (Uint32 *)info->d_pixels;
1308 int dstskip = info->d_skip >> 2;
1309 SDL_PixelFormat *srcfmt = info->src;
1310 SDL_PixelFormat *dstfmt = info->dst;
/* sA feeds the scalar ALPHA_BLEND; dA is the alpha written by the scalar path */
1311 unsigned sA = srcfmt->alpha;
1312 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1313 vector unsigned char mergePermute;
1314 vector unsigned char vsrcPermute;
1315 vector unsigned char vdstPermute;
1316 vector unsigned char vsdstPermute;
1317 vector unsigned char valpha;
1318 vector unsigned char valphamask;
1319 vector unsigned char vbits;
1320 vector unsigned short v1;
1321 vector unsigned short v8;
1323 mergePermute = VEC_MERGE_PERMUTE();
1324 v1 = vec_splat_u16(1);
1325 v8 = vec_splat_u16(8);
1327 /* set the alpha to 255 on the destination surf */
1328 valphamask = VEC_ALPHA_MASK();
/* NOTE(review): calc_swizzle32() is not visible here; presumably these map
   source -> working layout, working layout -> destination, and
   destination -> working layout respectively -- confirm */
1330 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1331 vdstPermute = calc_swizzle32(NULL, dstfmt);
1332 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1334 /* set a vector full of alpha and 255-alpha */
/* the visible code stores alpha in byte 0 and splats it to all 16 bytes */
1335 ((unsigned char *)&valpha)[0] = alpha;
1336 valpha = vec_splat(valpha, 0);
/* all-ones bytes, used as the second vec_perm input for the final store */
1337 vbits = (vector unsigned char)vec_splat_s8(-1);
1340 int width = info->d_width;
1341 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1343 unsigned sR, sG, sB, dR, dG, dB; \
1344 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1345 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1346 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1347 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1352 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* extrawidth pixels are left for the scalar macro after the vector part */
1354 int extrawidth = (width % 4);
1355 vector unsigned char valigner = VEC_ALIGNER(srcp);
1356 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1357 width -= extrawidth;
1359 vector unsigned char voverflow;
1360 vector unsigned char vd;
/* second load plus vec_perm with valigner assembles the source vector
   when srcp is not 16-byte aligned */
1363 voverflow = (vector unsigned char)vec_ld(15, srcp);
1364 vs = vec_perm(vs, voverflow, valigner);
1365 vs = vec_perm(vs, valpha, vsrcPermute);
/* load the destination and permute it with vsdstPermute */
1368 vd = (vector unsigned char)vec_ld(0, dstp);
1369 vd = vec_perm(vd, vd, vsdstPermute);
1371 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1373 /* set the alpha channel to full on */
1374 vd = vec_or(vd, valphamask);
1375 vd = vec_perm(vd, vbits, vdstPermute);
1378 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar blend for the pixels the vector part did not cover */
1385 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1387 #undef ONE_PIXEL_BLEND
/*
 * AltiVec variant of the per-surface-alpha blend for pixels that need no
 * layout permutes (none are applied here).  alpha comes from
 * info->src->alpha and is splatted into valpha.
 *
 * The scalar ONE_PIXEL_BLEND macro is run while UNALIGNED_PTR(dstp) holds
 * and for the trailing (width % 4) pixels; it writes 0xff000000 as the
 * destination alpha.  The vector part ORs valphamask into the result
 * instead.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip are read here.
 */
1396 /* fast RGB888->(A)RGB888 blending */
1397 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1399 unsigned alpha = info->src->alpha;
1400 int height = info->d_height;
1401 Uint32 *srcp = (Uint32 *)info->s_pixels;
1402 int srcskip = info->s_skip >> 2;
1403 Uint32 *dstp = (Uint32 *)info->d_pixels;
1404 int dstskip = info->d_skip >> 2;
1405 vector unsigned char mergePermute;
1406 vector unsigned char valpha;
1407 vector unsigned char valphamask;
1408 vector unsigned short v1;
1409 vector unsigned short v8;
1411 mergePermute = VEC_MERGE_PERMUTE();
1412 v1 = vec_splat_u16(1);
1413 v8 = vec_splat_u16(8);
1415 /* set the alpha to 255 on the destination surf */
1416 valphamask = VEC_ALPHA_MASK();
1418 /* set a vector full of alpha and 255-alpha */
/* the visible code stores alpha in byte 0 and splats it to all 16 bytes */
1419 ((unsigned char *)&valpha)[0] = alpha;
1420 valpha = vec_splat(valpha, 0);
1423 int width = info->d_width;
1424 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1427 Uint32 s1 = s & 0xff00ff; \
1428 Uint32 d1 = d & 0xff00ff; \
1429 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1433 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1434 *dstp = d1 | d | 0xff000000; \
1439 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* extrawidth pixels are left for the scalar macro after the vector part */
1441 int extrawidth = (width % 4);
1442 vector unsigned char valigner = VEC_ALIGNER(srcp);
1443 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1444 width -= extrawidth;
1446 vector unsigned char voverflow;
1447 vector unsigned char vd;
/* second load plus vec_perm with valigner assembles the source vector
   when srcp is not 16-byte aligned */
1450 voverflow = (vector unsigned char)vec_ld(15, srcp);
1451 vs = vec_perm(vs, voverflow, valigner);
1454 vd = (vector unsigned char)vec_ld(0, dstp);
1456 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1458 /* set the alpha channel to full on */
1459 vd = vec_or(vd, valphamask);
1462 vec_st((vector unsigned int)vd, 0, dstp);
/* scalar blend for the pixels the vector part did not cover */
1469 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1471 #undef ONE_PIXEL_BLEND
1478 #pragma altivec_model off
1480 #endif /* SDL_ALTIVEC_BLITTERS */
/*
 * 50% blend of a 32-bit source onto a 32-bit destination.
 *
 * Every stored pixel is the per-byte average of source and destination in
 * the low 24 bits, with the top byte forced to 0xff.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip (byte skips, converted to pixels with >> 2) are read.
 */
1482 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1483 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1485 int width = info->d_width;
1486 int height = info->d_height;
1487 Uint32 *srcp = (Uint32 *)info->s_pixels;
1488 int srcskip = info->s_skip >> 2;
1489 Uint32 *dstp = (Uint32 *)info->d_pixels;
1490 int dstskip = info->d_skip >> 2;
/* Masking with 0x00fefefe drops each byte's low bit so the three byte sums
   cannot carry into their neighbour once halved; (s & d & 0x00010101) adds
   the 1 back for bytes where both low bits were set. */
1496 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1497 + (s & d & 0x00010101)) | 0xff000000;
/*
 * Blend a 32-bit source onto a 32-bit destination using the per-surface
 * alpha from info->src->alpha; every stored pixel gets 0xff in its top
 * byte.
 *
 * The call to BlitRGBtoRGBSurfaceAlpha128() is the alpha=128 special case;
 * the condition guarding it is not visible in this excerpt.  The general
 * path runs inside DUFFS_LOOP_DOUBLE2 with a one-pixel body and a
 * two-pixel body.  The two-pixel body packs the 0xff00 component of two
 * neighbouring pixels (srcp[0]/srcp[1], dstp[0]/dstp[1]) into one word so
 * a single multiply blends both.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip are read here.
 */
1504 /* fast RGB888->(A)RGB888 blending with surface alpha */
1505 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1507 unsigned alpha = info->src->alpha;
1509 BlitRGBtoRGBSurfaceAlpha128(info);
1511 int width = info->d_width;
1512 int height = info->d_height;
1513 Uint32 *srcp = (Uint32 *)info->s_pixels;
1514 int srcskip = info->s_skip >> 2;
1515 Uint32 *dstp = (Uint32 *)info->d_pixels;
1516 int dstskip = info->d_skip >> 2;
1523 DUFFS_LOOP_DOUBLE2({
1524 /* One Pixel Blend */
1529 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1533 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1534 *dstp = d1 | d | 0xff000000;
1538 /* Two Pixels Blend */
1543 d1 += (s1 - d1) * alpha >> 8;
/* gather the 0xff00 component of this pixel (low half) and of the next
   pixel (bits 16-23) into one word */
1546 s = ((s & 0xff00) >> 8) |
1547 ((srcp[1] & 0xff00) << 8);
1548 d = ((d & 0xff00) >> 8) |
1549 ((dstp[1] & 0xff00) << 8);
1550 d += (s - d) * alpha >> 8;
/* first pixel takes the low part of the packed result ... */
1553 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1560 d1 += (s1 - d1) * alpha >> 8;
/* ... second pixel takes the part that was held in bits 16-23 */
1563 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
/*
 * Scalar blend of a 32-bit source onto a 32-bit destination using the
 * alpha held in the top byte of each source pixel (s >> 24).
 *
 * A fully opaque source pixel replaces the destination's low 24 bits and
 * keeps the destination's top byte.  Otherwise the colour components are
 * blended with (d + ((s - d) * alpha >> 8)) and the destination's top byte
 * (dalpha) is written back unchanged.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip are read here.
 */
1573 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1574 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1576 int width = info->d_width;
1577 int height = info->d_height;
1578 Uint32 *srcp = (Uint32 *)info->s_pixels;
1579 int srcskip = info->s_skip >> 2;
1580 Uint32 *dstp = (Uint32 *)info->d_pixels;
1581 int dstskip = info->d_skip >> 2;
1590 Uint32 alpha = s >> 24;
1591 /* FIXME: Here we special-case opaque alpha since the
1592 compositing used (>>8 instead of /255) doesn't handle
1593 it correctly. Also special-case alpha=0 for speed?
1596 if(alpha == SDL_ALPHA_OPAQUE) {
1597 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1600 * take out the middle component (green), and process
1601 * the other two in parallel. One multiply less.
1604 dalpha = d & 0xff000000;
1607 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1610 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1611 *dstp = d1 | d | dalpha;
/*
 * GCC inline-assembly MMX version of the per-pixel-alpha 32-bit blend.
 *
 * Register roles set up by the first asm statement (per its own comments):
 *   mm6 = zero, mm7 = multiplier mask that keeps the destination alpha,
 *   mm4 = colour-channel mask, mm3 = its complement, mm5 = Ashift.
 *
 * Per pixel, alpha is taken as (*srcp & amask).  When alpha equals amask
 * (fully opaque) the source colour is copied and the destination alpha
 * kept; the final branch blends dst + ((src - dst) * alpha >> 8) per
 * channel.  The branch that precedes the "else if" is not visible in this
 * excerpt.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and the source format's Amask/Ashift are read here.
 */
1623 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1624 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1626 int width = info->d_width;
1627 int height = info->d_height;
1628 Uint32 *srcp = (Uint32 *)info->s_pixels;
1629 int srcskip = info->s_skip >> 2;
1630 Uint32 *dstp = (Uint32 *)info->d_pixels;
1631 int dstskip = info->d_skip >> 2;
1632 SDL_PixelFormat* sf = info->src;
1633 Uint32 amask = sf->Amask;
1636 /* make mm6 all zeros. */
1637 "pxor %%mm6, %%mm6\n"
1639 /* Make a mask to preserve the alpha. */
1640 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1641 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1642 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1643 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1644 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1646 /* form channel masks */
1647 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1648 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1649 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1650 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1652 /* get alpha channel shift */
1653 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1655 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1665 : : "r" (srcp), "r" (dstp) );
/* alpha still sits at its position inside the pixel; it is compared
   against amask below and shifted down by mm5 inside the blend */
1667 alpha = *srcp & amask;
1668 /* FIXME: Here we special-case opaque alpha since the
1669 compositing used (>>8 instead of /255) doesn't handle
1670 it correctly. Also special-case alpha=0 for speed?
1675 else if(alpha == amask) {
1676 /* opaque alpha -- copy RGB, keep dst alpha */
1677 /* using MMX here to free up regular registers for other things */
1679 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1680 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1681 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1682 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1683 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1684 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1686 : : "r" (srcp), "r" (dstp) );
1691 /* load in the source, and dst. */
1692 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1693 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1695 /* Move the src alpha into mm2 */
1697 /* if supporting pshufw */
1698 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1699 /*"psrlw $8, %%mm2\n" */
1703 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1704 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1705 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1706 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1708 /* move the colors into words. */
1709 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1710 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1713 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1716 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1717 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1718 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1720 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1722 "movd %%mm0, (%1)\n" /* result in mm0 */
1724 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1738 /* End GCC_ASMBLIT*/
/*
 * MSVC intrinsics version of the per-pixel-alpha 32-bit blend
 * (same name as the GCC inline-assembly variant).
 *
 * Per pixel, alpha is taken as (*srcp & amask).  When alpha equals amask
 * (fully opaque) the source colour channels are copied and the remaining
 * destination bits kept.  The final branch widens both pixels to 16-bit
 * lanes and computes dst + ((src - dst) * alpha >> 8); the alpha lane's
 * multiplier is zeroed through dmask so the destination alpha survives.
 * The branch that precedes the "else if" is not visible in this excerpt.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and the source format's masks and Ashift are read.
 */
1741 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1742 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1744 int width = info->d_width;
1745 int height = info->d_height;
1746 Uint32 *srcp = (Uint32 *)info->s_pixels;
1747 int srcskip = info->s_skip >> 2;
1748 Uint32 *dstp = (Uint32 *)info->d_pixels;
1749 int dstskip = info->d_skip >> 2;
1750 SDL_PixelFormat* sf = info->src;
1751 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1752 Uint32 amask = sf->Amask;
1753 Uint32 ashift = sf->Ashift;
1756 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1758 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* Once bytes are unpacked to 16-bit lanes, the alpha byte at bit `ashift`
   occupies the lane at bit `ashift * 2`; clear exactly that lane. */
1759 multmask = ~(0xFFFFi64 << (ashift * 2));
1760 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1766 _m_prefetch(srcp + 16);
1767 _m_prefetch(dstp + 16);
/* alpha still sits at its position inside the pixel here */
1769 alpha = *srcp & amask;
1772 } else if (alpha == amask) {
1773 /* copy RGB, keep dst alpha */
1774 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1776 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1777 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1779 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1780 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1782 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1783 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1784 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1785 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1786 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1789 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1790 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1791 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1792 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1793 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1795 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1805 /* End MSVC_ASMBLIT */
1807 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/*
 * Blend a single 16 bit pixel at 50%.
 *
 * d, s: destination and source pixel values.
 * mask: the channel bits of the pixel format with the lowest bit of every
 *       channel cleared (0xf7de for RGB565, 0xfbde for RGB555).
 *
 * The masked halves are summed and shifted right once; the term built from
 * (~mask & 0xffff) adds 1 back to every channel whose low bit is set in
 * both pixels.  All arithmetic is done in Uint32 so nothing depends on
 * integer promotion to signed int.  Arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define BLEND16_50(d, s, mask)						\
	(((((Uint32)(s) & (Uint32)(mask)) + ((Uint32)(d) & (Uint32)(mask))) >> 1) \
	 + ((Uint32)(s) & (Uint32)(d) & (~(Uint32)(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed into one 32 bit word, at 50%.
 *
 * d, s: destination and source words, each holding two pixels.
 * mask: same 16 bit mask as for BLEND16_50; it is replicated into both
 *       halves of the word.
 *
 * The mask is widened to Uint32 before the << 16: shifting a promoted
 * Uint16 such as 0xf7de left by 16 would overflow a signed int.  Each
 * operand is halved before the addition so the sum stays within 32 bits.
 * Arguments are evaluated more than once, so they must be free of side
 * effects.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((Uint32)(s) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)	     \
	 + (((Uint32)(d) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)    \
	 + ((Uint32)(s) & (Uint32)(d) & ~((Uint32)(mask) | (Uint32)(mask) << 16)))
/*
 * 50% blend of a 16-bit source onto a 16-bit destination.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels and
 *       s_skip/d_skip (byte skips, converted to pixels with >> 1) are read.
 * mask: passed straight to BLEND16_50 / BLEND2x16_50 (callers in this file
 *       pass 0xf7de for 565 and 0xfbde for 555).
 *
 * Two code paths are chosen by ((srcp ^ dstp) & 2):
 *  - bit set: source and destination differ in 32-bit alignment; source
 *    words are read aligned and re-paired with the previous word
 *    (prev_sw) so the destination can still be written 32 bits at a time;
 *  - bit clear: both share alignment; an optional leading pixel, then
 *    two pixels per 32-bit word, then an optional trailing pixel.
 */
1820 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1822 int width = info->d_width;
1823 int height = info->d_height;
1824 Uint16 *srcp = (Uint16 *)info->s_pixels;
1825 int srcskip = info->s_skip >> 1;
1826 Uint16 *dstp = (Uint16 *)info->d_pixels;
1827 int dstskip = info->d_skip >> 1;
/* bit 1 of the XOR is set when exactly one of the pointers is 32-bit aligned */
1830 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1832 * Source and destination not aligned, pipeline it.
1833 * This is mostly a win for big blits but no loss for
1839 /* handle odd destination */
1840 if((uintptr_t)dstp & 2) {
1841 Uint16 d = *dstp, s = *srcp;
1842 *dstp = BLEND16_50(d, s, mask);
1847 srcp++; /* srcp is now 32-bit aligned */
1849 /* bootstrap pipeline with first halfword */
1850 prev_sw = ((Uint32 *)srcp)[-1];
1854 sw = *(Uint32 *)srcp;
1855 dw = *(Uint32 *)dstp;
1856 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1857 s = (prev_sw << 16) + (sw >> 16);
1859 s = (prev_sw >> 16) + (sw << 16);
1862 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1868 /* final pixel if any */
1870 Uint16 d = *dstp, s;
1871 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1872 s = (Uint16)prev_sw;
1874 s = (Uint16)(prev_sw >> 16);
1876 *dstp = BLEND16_50(d, s, mask);
1880 srcp += srcskip - 1;
1883 /* source and destination are aligned */
1886 /* first odd pixel? */
1887 if((uintptr_t)srcp & 2) {
1888 Uint16 d = *dstp, s = *srcp;
1889 *dstp = BLEND16_50(d, s, mask);
1894 /* srcp and dstp are now 32-bit aligned */
1897 Uint32 sw = *(Uint32 *)srcp;
1898 Uint32 dw = *(Uint32 *)dstp;
1899 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1905 /* last odd pixel? */
1907 Uint16 d = *dstp, s = *srcp;
1908 *dstp = BLEND16_50(d, s, mask);
/*
 * GCC MMX-macro version of the per-surface-alpha blend for RGB565.
 *
 * The call to Blit16to16SurfaceAlpha128(info, 0xf7de) is the 50% special
 * case; the condition guarding it is not visible in this excerpt.
 *
 * General path: alpha has its low three bits cleared, is loaded into mm0
 * and replicated to all four 16-bit lanes, then reduced to 5 bits for the
 * scalar code.  The three scalar blends spread a pixel to 0x07e0f81f form
 * (green moved to the high half) and blend all channels with one multiply.
 * The MMX part loads four source and four destination pixels per step and
 * blends red (shifted down by 11), green (in place, via pmulhw) and blue
 * (masked, via pmullw) separately before OR-ing them together in mm1.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and info->src->alpha are read here.
 */
1919 /* fast RGB565->RGB565 blending with surface alpha */
1920 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1922 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
1924 Blit16to16SurfaceAlpha128(info, 0xf7de);
1926 int width = info->d_width;
1927 int height = info->d_height;
1928 Uint16 *srcp = (Uint16 *)info->s_pixels;
1929 int srcskip = info->s_skip >> 1;
1930 Uint16 *dstp = (Uint16 *)info->d_pixels;
1931 int dstskip = info->d_skip >> 1;
1935 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1937 alpha >>= 3; /* downscale alpha to 5 bits */
1939 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1940 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1941 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1942 /* position alpha to allow for mullo and mulhi on diff channels
1943 to reduce the number of operations */
1946 /* Setup the 565 color channel masks */
1947 load = 0x07E007E007E007E0ULL;
1948 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1949 load = 0x001F001F001F001FULL;
1950 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1957 * shift out the middle component (green) to
1958 * the high 16 bits, and process all three RGB
1959 * components at the same time.
1961 s = (s | s << 16) & 0x07e0f81f;
1962 d = (d | d << 16) & 0x07e0f81f;
1963 d += (s - d) * alpha >> 5;
1965 *dstp++ = d | d >> 16;
1970 * shift out the middle component (green) to
1971 * the high 16 bits, and process all three RGB
1972 * components at the same time.
1974 s = (s | s << 16) & 0x07e0f81f;
1975 d = (d | d << 16) & 0x07e0f81f;
1976 d += (s - d) * alpha >> 5;
1978 *dstp++ = d | d >> 16;
1982 * shift out the middle component (green) to
1983 * the high 16 bits, and process all three RGB
1984 * components at the same time.
1986 s = (s | s << 16) & 0x07e0f81f;
1987 d = (d | d << 16) & 0x07e0f81f;
1988 d += (s - d) * alpha >> 5;
1990 *dstp++ = d | d >> 16;
/* MMX part: one 64-bit load holds four 16-bit pixels */
1992 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1993 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1995 /* red -- does not need a mask since the right shift clears
1996 the uninteresting bits */
1997 movq_r2r(mm2, mm5); /* src -> mm5 */
1998 movq_r2r(mm3, mm6); /* dst -> mm6 */
1999 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2000 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2003 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2004 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2005 /* alpha used is actually 11 bits
2006 11 + 5 = 16 bits, so the sign bits are lost */
2007 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2008 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2009 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2011 movq_r2r(mm6, mm1); /* save new reds in dsts */
2013 /* green -- process the bits in place */
2014 movq_r2r(mm2, mm5); /* src -> mm5 */
2015 movq_r2r(mm3, mm6); /* dst -> mm6 */
2016 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2017 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2020 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2021 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2022 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2023 bits are gone and the sign bits present */
2024 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2025 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2027 por_r2r(mm6, mm1); /* save new greens in dsts */
2030 movq_r2r(mm2, mm5); /* src -> mm5 */
2031 movq_r2r(mm3, mm6); /* dst -> mm6 */
2032 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2033 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2036 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2037 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2038 /* 11 + 5 = 16 bits, so the sign bits are lost and
2039 the interesting bits will need to be MASKed */
2040 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2041 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2042 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2044 por_r2r(mm6, mm1); /* save new blues in dsts */
2046 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
/*
 * GCC MMX-macro version of the per-surface-alpha blend for RGB555.
 *
 * The call to Blit16to16SurfaceAlpha128(info, 0xfbde) is the 50% special
 * case; the condition guarding it is not visible in this excerpt.
 *
 * General path: alpha has its low three bits cleared, is loaded into mm0
 * and replicated to all four 16-bit lanes, then reduced to 5 bits for the
 * scalar code.  The three scalar blends spread a pixel to 0x03e07c1f form
 * (green moved to the high half) and blend all channels with one multiply.
 * The MMX part loads four source and four destination pixels per step.
 * Red is processed in place by temporarily shifting the green mask in mm4
 * left by 5 (and back afterwards); green and blue follow, and the three
 * results are OR-ed together in mm1.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and info->src->alpha are read here.
 */
2058 /* fast RGB555->RGB555 blending with surface alpha */
2059 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2061 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
2063 Blit16to16SurfaceAlpha128(info, 0xfbde);
2065 int width = info->d_width;
2066 int height = info->d_height;
2067 Uint16 *srcp = (Uint16 *)info->s_pixels;
2068 int srcskip = info->s_skip >> 1;
2069 Uint16 *dstp = (Uint16 *)info->d_pixels;
2070 int dstskip = info->d_skip >> 1;
2074 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2076 alpha >>= 3; /* downscale alpha to 5 bits */
2078 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2079 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2080 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2081 /* position alpha to allow for mullo and mulhi on diff channels
2082 to reduce the number of operations */
2085 /* Setup the 555 color channel masks */
2086 load = 0x03E003E003E003E0ULL;
2087 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2088 load = 0x001F001F001F001FULL;
2089 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2096 * shift out the middle component (green) to
2097 * the high 16 bits, and process all three RGB
2098 * components at the same time.
2100 s = (s | s << 16) & 0x03e07c1f;
2101 d = (d | d << 16) & 0x03e07c1f;
2102 d += (s - d) * alpha >> 5;
2104 *dstp++ = d | d >> 16;
2109 * shift out the middle component (green) to
2110 * the high 16 bits, and process all three RGB
2111 * components at the same time.
2113 s = (s | s << 16) & 0x03e07c1f;
2114 d = (d | d << 16) & 0x03e07c1f;
2115 d += (s - d) * alpha >> 5;
2117 *dstp++ = d | d >> 16;
2121 * shift out the middle component (green) to
2122 * the high 16 bits, and process all three RGB
2123 * components at the same time.
2125 s = (s | s << 16) & 0x03e07c1f;
2126 d = (d | d << 16) & 0x03e07c1f;
2127 d += (s - d) * alpha >> 5;
2129 *dstp++ = d | d >> 16;
/* MMX part: one 64-bit load holds four 16-bit pixels */
2131 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2132 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2134 /* red -- process the bits in place */
2135 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2136 /* by reusing the GREEN mask we free up another mmx
2137 register to accumulate the result */
2139 movq_r2r(mm2, mm5); /* src -> mm5 */
2140 movq_r2r(mm3, mm6); /* dst -> mm6 */
2141 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2142 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2145 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2146 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2147 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2148 cleared by a MASK below */
2149 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2150 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2151 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2153 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2155 movq_r2r(mm6, mm1); /* save new reds in dsts */
2157 /* green -- process the bits in place */
2158 movq_r2r(mm2, mm5); /* src -> mm5 */
2159 movq_r2r(mm3, mm6); /* dst -> mm6 */
2160 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2161 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2164 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2165 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2166 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2167 bits are gone and the sign bits present */
2168 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2169 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2171 por_r2r(mm6, mm1); /* save new greens in dsts */
2174 movq_r2r(mm2, mm5); /* src -> mm5 */
2175 movq_r2r(mm3, mm6); /* dst -> mm6 */
2176 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2177 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2180 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2181 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2182 /* 11 + 5 = 16 bits, so the sign bits are lost and
2183 the interesting bits will need to be MASKed */
2184 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2185 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2186 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2188 por_r2r(mm6, mm1); /* save new blues in dsts */
2190 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2201 /* End GCC_ASMBLIT */
/*
 * MSVC intrinsics version of the per-surface-alpha blend for RGB565
 * (same name as the GCC MMX-macro variant).
 *
 * The call to Blit16to16SurfaceAlpha128(info, 0xf7de) is the 50% special
 * case; the condition guarding it is not visible in this excerpt.
 *
 * General path: alpha has its low three bits cleared and is placed in
 * mm_alpha before being reduced to 5 bits for the scalar code; mm_alpha is
 * replicated to all four 16-bit lanes and shifted left by 3.  The three
 * scalar blends spread a pixel to 0x07e0f81f form and blend all channels
 * with one multiply.  The MMX part loads four source and four destination
 * pixels per step, blends red (shifted down by 11), green (in place, via
 * _mm_mulhi_pi16) and blue (masked, via _mm_mullo_pi16) separately and
 * ORs the results into mm_res.
 *
 * info: blit description; d_width/d_height, s_pixels/d_pixels,
 *       s_skip/d_skip and info->src->alpha are read here.
 */
2204 /* fast RGB565->RGB565 blending with surface alpha */
2205 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2207 unsigned alpha = info->src->alpha;
2209 Blit16to16SurfaceAlpha128(info, 0xf7de);
2211 int width = info->d_width;
2212 int height = info->d_height;
2213 Uint16 *srcp = (Uint16 *)info->s_pixels;
2214 int srcskip = info->s_skip >> 1;
2215 Uint16 *dstp = (Uint16 *)info->d_pixels;
2216 int dstskip = info->d_skip >> 1;
2219 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2221 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2222 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2223 alpha >>= 3; /* downscale alpha to 5 bits */
2225 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2226 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2227 /* position alpha to allow for mullo and mulhi on diff channels
2228 to reduce the number of operations */
2229 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2231 /* Setup the 565 color channel masks */
2232 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2233 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2241 * shift out the middle component (green) to
2242 * the high 16 bits, and process all three RGB
2243 * components at the same time.
2245 s = (s | s << 16) & 0x07e0f81f;
2246 d = (d | d << 16) & 0x07e0f81f;
2247 d += (s - d) * alpha >> 5;
2249 *dstp++ = (Uint16)(d | d >> 16);
2254 * shift out the middle component (green) to
2255 * the high 16 bits, and process all three RGB
2256 * components at the same time.
2258 s = (s | s << 16) & 0x07e0f81f;
2259 d = (d | d << 16) & 0x07e0f81f;
2260 d += (s - d) * alpha >> 5;
2262 *dstp++ = (Uint16)(d | d >> 16);
2266 * shift out the middle component (green) to
2267 * the high 16 bits, and process all three RGB
2268 * components at the same time.
2270 s = (s | s << 16) & 0x07e0f81f;
2271 d = (d | d << 16) & 0x07e0f81f;
2272 d += (s - d) * alpha >> 5;
2274 *dstp++ = (Uint16)(d | d >> 16);
/* MMX part: one 64-bit load holds four 16-bit pixels */
2276 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2277 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2281 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2284 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2287 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2288 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2289 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2290 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2291 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2293 mm_res = dst2; /* RED -> mm_res */
2295 /* green -- process the bits in place */
2297 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2300 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2303 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2304 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2305 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2306 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2308 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2312 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2315 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2318 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2319 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2320 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2321 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2322 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2324 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2326 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2338 /* fast RGB555->RGB555 blending with surface alpha (MMX intrinsics variant).
 *
 * info: blit description; this routine reads info->src->alpha, the source
 *       and destination pixel pointers (s_pixels, d_pixels), the blit size
 *       (d_width, d_height) and the row skips (s_skip, d_skip).
 *
 * Both skips are shifted right by one so that they count 16-bit pixels
 * (presumably they arrive as byte counts -- confirm against the caller).
 *
 * Two blend forms appear below:
 *  - scalar, one pixel at a time: the pixel is spread over 32 bits with the
 *    mask 0x03e07c1f so green sits in the high half, all three channels are
 *    blended with one multiply by the 5-bit alpha, and the halves are folded
 *    back together on store;
 *  - MMX, four pixels at a time through __m64 loads and stores: red and green
 *    are blended in place with _mm_mulhi_pi16 followed by a left shift of 5,
 *    blue with _mm_mullo_pi16 followed by a right shift of 11, and the three
 *    partial results are OR-ed into mm_res.
 *
 * The low three bits of alpha are cleared before mm_alpha is built so that
 * the MMX form uses the same effective alpha as the 5-bit scalar form.
 *
 * NOTE(review): the condition guarding the Blit16to16SurfaceAlpha128() call
 * and the loop construct that sequences the scalar and MMX snippets are not
 * visible in this excerpt; presumably the former is the alpha == 128 special
 * case and the latter a loop-unrolling macro -- confirm against the full file.
 */
2339 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2341 unsigned alpha = info->src->alpha;
2343 Blit16to16SurfaceAlpha128(info, 0xfbde);
2345 int width = info->d_width;
2346 int height = info->d_height;
2347 Uint16 *srcp = (Uint16 *)info->s_pixels;
2348 int srcskip = info->s_skip >> 1;
2349 Uint16 *dstp = (Uint16 *)info->d_pixels;
2350 int dstskip = info->d_skip >> 1;
2353 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2355 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2356 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2357 alpha >>= 3; /* downscale alpha to 5 bits */
2359 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2360 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2361 /* position alpha to allow for mullo and mulhi on diff channels
2362 to reduce the number of operations */
2363 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2365 /* Setup the 555 color channel masks */
2366 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2367 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2368 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2376 * shift out the middle component (green) to
2377 * the high 16 bits, and process all three RGB
2378 * components at the same time.
2380 s = (s | s << 16) & 0x03e07c1f;
2381 d = (d | d << 16) & 0x03e07c1f;
2382 d += (s - d) * alpha >> 5;
2384 *dstp++ = (Uint16)(d | d >> 16);
2389 * shift out the middle component (green) to
2390 * the high 16 bits, and process all three RGB
2391 * components at the same time.
2393 s = (s | s << 16) & 0x03e07c1f;
2394 d = (d | d << 16) & 0x03e07c1f;
2395 d += (s - d) * alpha >> 5;
2397 *dstp++ = (Uint16)(d | d >> 16);
2401 * shift out the middle component (green) to
2402 * the high 16 bits, and process all three RGB
2403 * components at the same time.
2405 s = (s | s << 16) & 0x03e07c1f;
2406 d = (d | d << 16) & 0x03e07c1f;
2407 d += (s - d) * alpha >> 5;
2409 *dstp++ = (Uint16)(d | d >> 16);
2411 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2412 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2414 /* red -- process the bits in place */
2416 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2419 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2422 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2423 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2424 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2425 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2426 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2428 mm_res = dst2; /* RED -> mm_res */
2430 /* green -- process the bits in place */
2432 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2435 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2438 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2439 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2440 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2441 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2443 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2446 src2 = src1; /* src -> src2 */
2447 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2449 dst2 = dst1; /* dst -> dst2 */
2450 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2453 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2454 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2455 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2456 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2457 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2459 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2461 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2472 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2474 /* fast RGB565->RGB565 blending with surface alpha (plain C variant).
 *
 * info: blit description; reads info->src->alpha plus the pixel pointers,
 *       blit size and row skips. Skips are shifted right by one so they
 *       count 16-bit pixels.
 *
 * Each pixel is spread over 32 bits with the mask 0x07e0f81f (green moved to
 * the high half), blended with a single multiply by the alpha value reduced
 * to 5 bits, then folded back to 16 bits on store.
 *
 * NOTE(review): the condition guarding the Blit16to16SurfaceAlpha128() call
 * and the per-pixel loop construct are not visible in this excerpt --
 * presumably the alpha == 128 special case and a loop macro; confirm.
 */
2475 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2477 unsigned alpha = info->src->alpha;
2479 Blit16to16SurfaceAlpha128(info, 0xf7de);
2481 int width = info->d_width;
2482 int height = info->d_height;
2483 Uint16 *srcp = (Uint16 *)info->s_pixels;
2484 int srcskip = info->s_skip >> 1;
2485 Uint16 *dstp = (Uint16 *)info->d_pixels;
2486 int dstskip = info->d_skip >> 1;
2487 alpha >>= 3; /* downscale alpha to 5 bits */
2494 * shift out the middle component (green) to
2495 * the high 16 bits, and process all three RGB
2496 * components at the same time.
2498 s = (s | s << 16) & 0x07e0f81f;
2499 d = (d | d << 16) & 0x07e0f81f;
2500 d += (s - d) * alpha >> 5;
2502 *dstp++ = (Uint16)(d | d >> 16);
2510 /* fast RGB555->RGB555 blending with surface alpha (plain C variant).
 *
 * info: blit description; reads info->src->alpha plus the pixel pointers,
 *       blit size and row skips. Skips are shifted right by one so they
 *       count 16-bit pixels.
 *
 * Each pixel is spread over 32 bits with the mask 0x03e07c1f (green moved to
 * the high half), blended with a single multiply by the alpha value reduced
 * to 5 bits, then folded back to 16 bits on store.
 *
 * NOTE(review): the condition guarding the Blit16to16SurfaceAlpha128() call
 * and the per-pixel loop construct are not visible in this excerpt --
 * presumably the alpha == 128 special case and a loop macro; confirm.
 */
2511 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2513 unsigned alpha = info->src->alpha; /* per-surface alpha as stored; it is only reduced to 5 bits further down */
2515 Blit16to16SurfaceAlpha128(info, 0xfbde);
2517 int width = info->d_width;
2518 int height = info->d_height;
2519 Uint16 *srcp = (Uint16 *)info->s_pixels;
2520 int srcskip = info->s_skip >> 1;
2521 Uint16 *dstp = (Uint16 *)info->d_pixels;
2522 int dstskip = info->d_skip >> 1;
2523 alpha >>= 3; /* downscale alpha to 5 bits */
2530 * shift out the middle component (green) to
2531 * the high 16 bits, and process all three RGB
2532 * components at the same time.
2534 s = (s | s << 16) & 0x03e07c1f;
2535 d = (d | d << 16) & 0x03e07c1f;
2536 d += (s - d) * alpha >> 5;
2538 *dstp++ = (Uint16)(d | d >> 16);
2546 /* fast ARGB8888->RGB565 blending with pixel alpha.
 *
 * info: blit description; reads the pixel pointers, blit size and row skips.
 *       The source is walked as 32-bit pixels (s_skip >> 2) and the
 *       destination as 16-bit pixels (d_skip >> 1).
 *
 * Per pixel, alpha is taken from the top five bits of the source value
 * (s >> 27). When it equals SDL_ALPHA_OPAQUE >> 3 the source colour is
 * converted straight to 565 and stored without blending. Otherwise the
 * source is rearranged into the same spread layout as the destination
 * (mask 0x07e0f81f, green in the high half), blended with one multiply and
 * folded back to 16 bits on store.
 *
 * NOTE(review): the loop construct, the pixel loads and the second half of
 * the source conversion expression are not visible in this excerpt.
 */
2547 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2549 int width = info->d_width;
2550 int height = info->d_height;
2551 Uint32 *srcp = (Uint32 *)info->s_pixels;
2552 int srcskip = info->s_skip >> 2;
2553 Uint16 *dstp = (Uint16 *)info->d_pixels;
2554 int dstskip = info->d_skip >> 1;
2559 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2560 /* FIXME: Here we special-case opaque alpha since the
2561 compositioning used (>>8 instead of /255) doesn't handle
2562 it correctly. Also special-case alpha=0 for speed?
2565 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2566 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2570 * convert source and destination to G0RAB65565
2571 * and blend all components at the same time
2573 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2575 d = (d | d << 16) & 0x07e0f81f;
2576 d += (s - d) * alpha >> 5;
2578 *dstp = (Uint16)(d | d >> 16);
2589 /* fast ARGB8888->RGB555 blending with pixel alpha.
 *
 * info: blit description; reads the pixel pointers, blit size and row skips.
 *       The source is walked as 32-bit pixels (s_skip >> 2) and the
 *       destination as 16-bit pixels (d_skip >> 1).
 *
 * Per pixel, alpha is taken from the top five bits of the source value
 * (s >> 27). When it equals SDL_ALPHA_OPAQUE >> 3 the source colour is
 * converted straight to 555 and stored without blending. Otherwise the
 * source is rearranged into the same spread layout as the destination
 * (mask 0x03e07c1f, green in the high half), blended with one multiply and
 * folded back to 16 bits on store.
 *
 * The inline remark below that mentions "G0RAB65565" is shared with the 565
 * routine; the masks and shifts actually used here are the 5-5-5 ones.
 *
 * NOTE(review): the loop construct, the pixel loads and the second half of
 * the source conversion expression are not visible in this excerpt.
 */
2590 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2592 int width = info->d_width;
2593 int height = info->d_height;
2594 Uint32 *srcp = (Uint32 *)info->s_pixels;
2595 int srcskip = info->s_skip >> 2;
2596 Uint16 *dstp = (Uint16 *)info->d_pixels;
2597 int dstskip = info->d_skip >> 1;
2603 alpha = s >> 27; /* downscale alpha to 5 bits */
2604 /* FIXME: Here we special-case opaque alpha since the
2605 compositioning used (>>8 instead of /255) doesn't handle
2606 it correctly. Also special-case alpha=0 for speed?
2609 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2610 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2614 * convert source and destination to G0RAB65565
2615 * and blend all components at the same time
2617 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2619 d = (d | d << 16) & 0x03e07c1f;
2620 d += (s - d) * alpha >> 5;
2622 *dstp = (Uint16)(d | d >> 16);
2633 /* General (slow) N->N blending with per-surface alpha.
 *
 * info: blit description; reads the pixel pointers, blit size, row skips and
 *       the source and destination pixel formats (info->src, info->dst).
 *
 * The blend factor sA is the source format's per-surface alpha. The alpha
 * written to the destination, dA, is SDL_ALPHA_OPAQUE when the destination
 * format has an alpha mask and 0 otherwise.
 *
 * Per pixel: both source and destination are split into R, G, B with
 * DISEMBLE_RGB, combined with ALPHA_BLEND, and the result is written back
 * with ASSEMBLE_RGBA.
 *
 * NOTE(review): the inner per-pixel loop and the pointer advances are not
 * visible in this excerpt.
 */
2634 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2636 int width = info->d_width;
2637 int height = info->d_height;
2638 Uint8 *src = info->s_pixels;
2639 int srcskip = info->s_skip;
2640 Uint8 *dst = info->d_pixels;
2641 int dstskip = info->d_skip;
2642 SDL_PixelFormat *srcfmt = info->src;
2643 SDL_PixelFormat *dstfmt = info->dst;
2644 int srcbpp = srcfmt->BytesPerPixel;
2645 int dstbpp = dstfmt->BytesPerPixel;
2646 unsigned sA = srcfmt->alpha;
2647 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2650 while ( height-- ) {
2660 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2661 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2662 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2663 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2674 /* General (slow) colorkeyed N->N blending with per-surface alpha.
 *
 * info: blit description; reads the pixel pointers, blit size, row skips and
 *       the source and destination pixel formats (info->src, info->dst).
 *
 * Source pixels equal to srcfmt->colorkey are skipped, and nothing is
 * blended at all when the surface alpha sA is zero.
 *
 * Two paths:
 *  - 16-bit shortcut, taken when both formats are 2 bytes per pixel with a
 *    green mask of 0x7e0: pixels are handled as Uint16, sA is reduced to
 *    5 bits, and the blend uses the spread mask 0x07e0f81f with a single
 *    multiply. Row skips are divided by two to count 16-bit pixels.
 *  - generic: the source pixel is fetched with RETRIEVE_RGB_PIXEL, split
 *    with RGB_FROM_PIXEL, blended against the DISEMBLE_RGB'd destination
 *    with ALPHA_BLEND, and stored with ASSEMBLE_RGBA using dA as the alpha.
 *
 * NOTE(review): the inner per-pixel loops and the branch separating the two
 * paths are not fully visible in this excerpt.
 */
2675 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2677 int width = info->d_width;
2678 int height = info->d_height;
2679 Uint8 *src = info->s_pixels;
2680 int srcskip = info->s_skip;
2681 Uint8 *dst = info->d_pixels;
2682 int dstskip = info->d_skip;
2683 SDL_PixelFormat *srcfmt = info->src;
2684 SDL_PixelFormat *dstfmt = info->dst;
2685 Uint32 ckey = srcfmt->colorkey;
2686 int srcbpp = srcfmt->BytesPerPixel;
2687 int dstbpp = dstfmt->BytesPerPixel;
2688 unsigned sA = srcfmt->alpha;
2689 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2691 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2692 Uint16 *src16 = (Uint16 *)src;
2693 Uint16 *dst16 = (Uint16 *)dst;
2694 sA >>= 3; /* downscale alpha to 5 bits */
2695 while ( height-- ) {
2701 if(sA && s != ckey) {
2703 s = (s | s << 16) & 0x07e0f81f;
2704 d = (d | d << 16) & 0x07e0f81f;
2705 d += (s - d) * sA >> 5;
2707 *dst16 = (Uint16)(d | d >> 16);
2713 src16 += srcskip / 2;
2714 dst16 += dstskip / 2;
2719 while ( height-- ) {
2729 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2730 if(sA && Pixel != ckey) {
2731 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2732 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2733 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2734 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2745 /* General (slow) N->N blending with pixel alpha.
 *
 * info: blit description; reads the pixel pointers, blit size, row skips and
 *       the source and destination pixel formats (info->src, info->dst).
 *
 * Per pixel: source and destination are each split into R, G, B and A with
 * DISEMBLE_RGBA, the colour channels are combined with ALPHA_BLEND using the
 * source pixel's own alpha sA, and the result is stored with ASSEMBLE_RGBA
 * using the alpha dA that was read from the destination pixel.
 *
 * NOTE(review): the local declarations, the inner per-pixel loop and the
 * pointer advances are not visible in this excerpt.
 */
2746 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2748 int width = info->d_width;
2749 int height = info->d_height;
2750 Uint8 *src = info->s_pixels;
2751 int srcskip = info->s_skip;
2752 Uint8 *dst = info->d_pixels;
2753 int dstskip = info->d_skip;
2754 SDL_PixelFormat *srcfmt = info->src;
2755 SDL_PixelFormat *dstfmt = info->dst;
2760 /* Set up some basic variables */
2761 srcbpp = srcfmt->BytesPerPixel;
2762 dstbpp = dstfmt->BytesPerPixel;
2764 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2765 quite right. for <8bpp source alpha, it gets them very wrong
2767 It is unclear whether there is a good general solution that doesn't
2768 need a branch (or a divide). */
2769 while ( height-- ) {
2781 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2783 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2784 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2785 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/* Choose the alpha-blending blit routine for a surface.
 *
 * surface:    source surface; its own format (sf) and the format of the
 *             mapped destination surface->map->dst (df) drive the choice.
 * blit_index: not referenced in the lines visible here.
 *
 * Returns one of the blitters defined in this file.
 *
 * Selection outline:
 *  - sf->Amask == 0 (per-surface alpha):
 *      - with SDL_SRCCOLORKEY set: BlitNto1SurfaceAlphaKey for 1-byte
 *        destinations, the AltiVec keyed 32->32 blitter when available,
 *        otherwise BlitNtoNSurfaceAlphaKey;
 *      - without a colorkey: dispatch on df->BytesPerPixel. Identity-mapped
 *        16-bit surfaces get the 565 or 555 routines according to df->Gmask;
 *        32-bit sources whose RGB masks match the destination get the MMX,
 *        NEON, AltiVec or plain C RGB routines; everything else falls back
 *        to BlitNtoNSurfaceAlpha.
 *  - otherwise (per-pixel alpha): dispatch on df->BytesPerPixel.
 *    32-bit sources with Amask 0xff000000 and Gmask 0xff00 going to a 565 or
 *    555 destination get the ARGB->16-bit routines; 32-bit sources whose RGB
 *    masks match the destination get the MMX, 3DNow!, NEON, AltiVec or plain
 *    C routines; everything else falls back to BlitNtoNPixelAlpha.
 *
 * Every AltiVec choice additionally requires that the destination is not an
 * SDL_HWSURFACE and that SDL_HasAltiVec() reports support.
 *
 * NOTE(review): the case labels and several preprocessor guards and CPU
 * feature checks around the MMX/NEON returns are not visible in this
 * excerpt; the outline above covers only the conditions that are.
 */
2797 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2799 SDL_PixelFormat *sf = surface->format;
2800 SDL_PixelFormat *df = surface->map->dst->format;
2802 if(sf->Amask == 0) {
2803 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2804 if(df->BytesPerPixel == 1)
2805 return BlitNto1SurfaceAlphaKey;
2807 #if SDL_ALTIVEC_BLITTERS
2808 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2809 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2810 return Blit32to32SurfaceAlphaKeyAltivec;
2813 return BlitNtoNSurfaceAlphaKey;
2815 /* Per-surface alpha blits */
2816 switch(df->BytesPerPixel) {
2818 return BlitNto1SurfaceAlpha;
2821 if(surface->map->identity) {
2822 if(df->Gmask == 0x7e0)
2826 return Blit565to565SurfaceAlphaMMX;
2829 return Blit565to565SurfaceAlpha;
2831 else if(df->Gmask == 0x3e0)
2835 return Blit555to555SurfaceAlphaMMX;
2838 return Blit555to555SurfaceAlpha;
2841 return BlitNtoNSurfaceAlpha;
2844 if(sf->Rmask == df->Rmask
2845 && sf->Gmask == df->Gmask
2846 && sf->Bmask == df->Bmask
2847 && sf->BytesPerPixel == 4)
2850 if(sf->Rshift % 8 == 0
2851 && sf->Gshift % 8 == 0
2852 && sf->Bshift % 8 == 0
2854 return BlitRGBtoRGBSurfaceAlphaMMX;
2857 if(sf->Rshift % 8 == 0
2858 && sf->Gshift % 8 == 0
2859 && sf->Bshift % 8 == 0)
2861 return BlitARGBtoXRGBalphaS_neon;
2864 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2866 #if SDL_ALTIVEC_BLITTERS
2867 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2868 && SDL_HasAltiVec())
2869 return BlitRGBtoRGBSurfaceAlphaAltivec;
2871 return BlitRGBtoRGBSurfaceAlpha;
2875 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2876 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2878 return BlitABGRtoXRGBalphaS_neon;
2881 #if SDL_ALTIVEC_BLITTERS
2882 if((sf->BytesPerPixel == 4) &&
2883 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2884 return Blit32to32SurfaceAlphaAltivec;
2887 return BlitNtoNSurfaceAlpha;
2891 return BlitNtoNSurfaceAlpha;
2895 /* Per-pixel alpha blits */
2896 switch(df->BytesPerPixel) {
2898 return BlitNto1PixelAlpha;
2901 #if SDL_ALTIVEC_BLITTERS
2902 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2903 df->Gmask == 0x7e0 &&
2904 df->Bmask == 0x1f && SDL_HasAltiVec())
2905 return Blit32to565PixelAlphaAltivec;
2908 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2909 && sf->Gmask == 0xff00
2910 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2911 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2912 if(df->Gmask == 0x7e0)
2913 return BlitARGBto565PixelAlpha;
2914 else if(df->Gmask == 0x3e0)
2915 return BlitARGBto555PixelAlpha;
2917 return BlitNtoNPixelAlpha;
2920 if(sf->Rmask == df->Rmask
2921 && sf->Gmask == df->Gmask
2922 && sf->Bmask == df->Bmask
2923 && sf->BytesPerPixel == 4)
2926 if(sf->Rshift % 8 == 0
2927 && sf->Gshift % 8 == 0
2928 && sf->Bshift % 8 == 0
2929 && sf->Ashift % 8 == 0
2933 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2935 return BlitRGBtoRGBPixelAlphaMMX;
2939 if(sf->Rshift % 8 == 0
2940 && sf->Gshift % 8 == 0
2941 && sf->Bshift % 8 == 0
2942 && sf->Ashift % 8 == 0)
2944 return BlitARGBtoXRGBalpha_neon;
2947 if(sf->Amask == 0xff000000)
2949 #if SDL_ALTIVEC_BLITTERS
2950 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2951 && SDL_HasAltiVec())
2952 return BlitRGBtoRGBPixelAlphaAltivec;
2954 return BlitRGBtoRGBPixelAlpha;
2958 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2959 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2960 && sf->Amask == 0xff000000)
2962 return BlitABGRtoXRGBalpha_neon;
2965 #if SDL_ALTIVEC_BLITTERS
2966 if (sf->Amask && sf->BytesPerPixel == 4 &&
2967 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2968 return Blit32to32PixelAlphaAltivec;
2971 return BlitNtoNPixelAlpha;
2975 return BlitNtoNPixelAlpha;