2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "SDL_config.h"
24 #include "SDL_video.h"
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
33 #if SDL_ASSEMBLY_ROUTINES
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
37 # elif defined(_MSC_VER) && defined(_M_IX86)
38 # if (_MSC_VER <= 1200)
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
51 #endif /* SDL_ASSEMBLY_ROUTINES */
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
62 /* Functions to perform alpha blended blitting */
66 /* NEON optimized blitter callers */
/*
 * make_neon_caller(name, neon_name) declares the external routine
 * neon_name(dst, src, count) and defines a static blitter `name` that
 * takes its geometry from the SDL_BlitInfo it is given and calls neon_name
 * once per row, passing the row width as the count.  After every row both
 * pointers advance by width * 4 bytes plus the row skip, i.e. the wrapper
 * works on 4-byte pixels on both sides.  The two invocations that follow
 * the macro create wrappers around neon_ABGRtoXRGBalpha and
 * neon_ARGBtoXRGBalpha.
 */
67 #define make_neon_caller(name, neon_name) \
68 extern void neon_name(void *dst, const void *src, int count); \
69 static void name(SDL_BlitInfo *info) \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
75 int srcskip = info->s_skip; \
76 int dstskip = info->d_skip; \
78 while ( height-- ) { \
79 neon_name(dst, src, width); \
80 src += width * 4 + srcskip; \
81 dst += width * 4 + dstskip; \
85 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
86 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
88 #endif /* __ARM_NEON__ */
90 /* N->1 blending with per-surface alpha */
/*
 * Blends every source pixel onto a one-byte-per-pixel destination using the
 * single alpha value kept in the source format (srcfmt->alpha).  The current
 * destination byte is used as an index into dstfmt->palette->colors to get
 * the colour to blend against.  The blended colour is packed back into one
 * byte (red in the top three bits) and, when info->table is non-NULL,
 * translated through that table before it is stored.
 */
91 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
93 int width = info->d_width;
94 int height = info->d_height;
95 Uint8 *src = info->s_pixels;
96 int srcskip = info->s_skip;
97 Uint8 *dst = info->d_pixels;
98 int dstskip = info->d_skip;
99 Uint8 *palmap = info->table;
100 SDL_PixelFormat *srcfmt = info->src;
101 SDL_PixelFormat *dstfmt = info->dst;
102 int srcbpp = srcfmt->BytesPerPixel;
/* one alpha value applies to the whole surface */
104 const unsigned A = srcfmt->alpha;
116 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* the destination byte is a palette index: fetch the colour it refers to */
117 dR = dstfmt->palette->colors[*dst].r;
118 dG = dstfmt->palette->colors[*dst].g;
119 dB = dstfmt->palette->colors[*dst].b;
120 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
124 /* Pack RGB into 8bit pixel */
125 if ( palmap == NULL ) {
126 *dst =((dR>>5)<<(3+2))|
130 *dst = palmap[((dR>>5)<<(3+2))|
143 /* N->1 blending with pixel alpha */
/*
 * Same scheme as the per-surface variant, except that the alpha value (sA)
 * is extracted from each source pixel together with its colour by
 * DISEMBLE_RGBA.  The destination byte is used as an index into
 * dstfmt->palette->colors, the two colours are blended, and the result is
 * packed into one byte (red in the top three bits), going through
 * info->table when that table is non-NULL.
 */
144 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
146 int width = info->d_width;
147 int height = info->d_height;
148 Uint8 *src = info->s_pixels;
149 int srcskip = info->s_skip;
150 Uint8 *dst = info->d_pixels;
151 int dstskip = info->d_skip;
152 Uint8 *palmap = info->table;
153 SDL_PixelFormat *srcfmt = info->src;
154 SDL_PixelFormat *dstfmt = info->dst;
155 int srcbpp = srcfmt->BytesPerPixel;
157 /* FIXME: fix alpha bit field expansion here too? */
169 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
/* the destination byte is a palette index: fetch the colour it refers to */
170 dR = dstfmt->palette->colors[*dst].r;
171 dG = dstfmt->palette->colors[*dst].g;
172 dB = dstfmt->palette->colors[*dst].b;
173 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
177 /* Pack RGB into 8bit pixel */
178 if ( palmap == NULL ) {
179 *dst =((dR>>5)<<(3+2))|
183 *dst = palmap[((dR>>5)<<(3+2))|
196 /* colorkeyed N->1 blending with per-surface alpha */
/*
 * Per-surface-alpha blend onto a one-byte-per-pixel destination that leaves
 * the destination untouched for source pixels equal to the colour key
 * (srcfmt->colorkey).  For every other pixel the destination byte is looked
 * up in dstfmt->palette->colors, blended with the source colour using
 * srcfmt->alpha, and packed back into one byte (red in the top three bits),
 * going through info->table when that table is non-NULL.
 */
197 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
199 int width = info->d_width;
200 int height = info->d_height;
201 Uint8 *src = info->s_pixels;
202 int srcskip = info->s_skip;
203 Uint8 *dst = info->d_pixels;
204 int dstskip = info->d_skip;
205 Uint8 *palmap = info->table;
206 SDL_PixelFormat *srcfmt = info->src;
207 SDL_PixelFormat *dstfmt = info->dst;
208 int srcbpp = srcfmt->BytesPerPixel;
209 Uint32 ckey = srcfmt->colorkey;
/* one alpha value applies to the whole surface */
211 const int A = srcfmt->alpha;
223 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* only pixels that differ from the colour key are blended */
224 if ( Pixel != ckey ) {
225 dR = dstfmt->palette->colors[*dst].r;
226 dG = dstfmt->palette->colors[*dst].g;
227 dB = dstfmt->palette->colors[*dst].b;
228 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
232 /* Pack RGB into 8bit pixel */
233 if ( palmap == NULL ) {
234 *dst =((dR>>5)<<(3+2))|
238 *dst = palmap[((dR>>5)<<(3+2))|
253 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * 50% blend of 32-bit pixels.  Each colour byte of the result is the average
 * of the source and destination bytes, computed without cross-byte carries:
 *   ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1, plus (s & d & 0x00010101)
 * to restore the low bit when both inputs have it set.  The destination
 * alpha mask (info->dst->Amask) is then ORed into the result.  The scalar
 * expression handles one pixel; the MMX sequence applies the same formula
 * to two pixels per 64-bit load/store.
 */
254 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
256 int width = info->d_width;
257 int height = info->d_height;
258 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* the >> 2 turns the skips into counts of Uint32 elements, matching the pointer types */
259 int srcskip = info->s_skip >> 2;
260 Uint32 *dstp = (Uint32 *)info->d_pixels;
261 int dstskip = info->d_skip >> 2;
262 Uint32 dalpha = info->dst->Amask;
265 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
266 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
267 load = 0x0001010100010101ULL;/* !alpha128 mask */
268 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
269 movd_m2r(dalpha, mm7); /* dst alpha mask */
270 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
/* scalar form of the blend, one pixel */
276 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
277 + (s & d & 0x00010101)) | dalpha;
/* MMX form of the same blend, two pixels at a time */
279 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
280 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
282 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
283 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
285 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
286 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
287 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
288 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
289 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
290 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
291 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
293 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
294 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
304 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Blends 32-bit pixels using the per-surface alpha in info->src->alpha.
 * When alpha is exactly 128 and the destination R, G and B masks together
 * cover the low 24 bits, the work is handed to
 * BlitRGBtoRGBSurfaceAlpha128MMX.  Otherwise each pixel is expanded to four
 * 16-bit lanes and every colour lane is computed as
 *   dst + (((src - dst) * alpha) >> 8).
 * The multiplier register (mm4) is masked with the destination channel mask
 * so that the lane which is not a colour channel is multiplied by zero, and
 * the destination alpha mask (df->Amask) is ORed into each result.  There is
 * a one-pixel sequence and a two-pixel sequence using the same arithmetic.
 */
305 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
307 SDL_PixelFormat* df = info->dst;
308 unsigned alpha = info->src->alpha;
310 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
311 /* only call a128 version when R,G,B occupy lower bits */
312 BlitRGBtoRGBSurfaceAlpha128MMX(info);
314 int width = info->d_width;
315 int height = info->d_height;
316 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* the >> 2 turns the skips into counts of Uint32 elements, matching the pointer types */
317 int srcskip = info->s_skip >> 2;
318 Uint32 *dstp = (Uint32 *)info->d_pixels;
319 int dstskip = info->d_skip >> 2;
321 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
322 /* form the alpha mult */
323 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
324 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
325 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
/* from here on `alpha` is reused to hold the destination channel mask */
326 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
327 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
328 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
329 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
330 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
331 movd_m2r(df->Amask, mm7); /* dst alpha mask */
332 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
336 /* One Pixel Blend */
337 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
338 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
339 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
340 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
342 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
343 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
344 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
345 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
347 packuswb_r2r(mm5, mm2); /* 0000ARGB -> mm2 (upper half comes from zeroed mm5) */
348 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
349 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
353 /* Two Pixels Blend */
354 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
355 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
356 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
357 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
359 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
360 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
361 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
362 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
364 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
365 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
366 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
367 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
369 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
370 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
371 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
372 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
374 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
375 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
377 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
389 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Blends 32-bit pixels using the alpha carried by each source pixel
 * (*srcp & sf->Amask).  A pixel whose alpha equals the full mask is copied:
 * its colour channels replace the destination's while the destination bits
 * outside the channel mask are kept.  Other pixels are expanded to 16-bit
 * lanes and every colour lane is computed as
 *   dst + (((src - dst) * alpha) >> 8),
 * with the multiplier for the alpha lane masked to zero so that the
 * destination alpha survives the addition.
 * Register roles set up before the loop: mm6 = 0, mm7 = multiplier mask,
 * mm0 = channel mask, mm3 = inverted channel mask, mm5 = alpha shift.
 */
390 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
392 int width = info->d_width;
393 int height = info->d_height;
394 Uint32 *srcp = (Uint32 *)info->s_pixels;
395 int srcskip = info->s_skip >> 2;
396 Uint32 *dstp = (Uint32 *)info->d_pixels;
397 int dstskip = info->d_skip >> 2;
398 SDL_PixelFormat* sf = info->src;
399 Uint32 amask = sf->Amask;
401 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
402 /* form multiplication mask */
403 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
404 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
405 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
406 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
407 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
408 /* form channel masks */
409 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
410 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
411 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
412 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
413 /* get alpha channel shift */
414 __asm__ __volatile__ (
416 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
420 Uint32 alpha = *srcp & amask;
421 /* FIXME: Here we special-case opaque alpha since the
422 compositioning used (>>8 instead of /255) doesn't handle
423 it correctly. Also special-case alpha=0 for speed?
427 } else if(alpha == amask) {
428 /* opaque alpha -- copy RGB, keep dst alpha */
429 /* using MMX here to free up regular registers for other things */
430 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
431 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
432 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
433 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
434 por_r2r(mm1, mm2); /* src | dst -> mm2 */
435 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
437 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
438 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
440 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
441 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
443 __asm__ __volatile__ (
445 : : "r" (alpha) ); /* 0000A000 -> mm4 */
446 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
447 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
448 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
449 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
452 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
453 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
454 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
455 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
457 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
458 movd_r2m(mm2, *dstp);/* mm2 -> dst */
468 /* End GCC_ASMBLIT */
471 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * __m64 intrinsics variant of the 50% blend.  Each colour byte of the result
 * is the average of the source and destination bytes, computed without
 * cross-byte carries:
 *   ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1, plus (s & d & 0x00010101)
 * to restore the low bit when both inputs have it set.  The destination
 * alpha mask (info->dst->Amask) is ORed into the result.  The scalar
 * expression handles one pixel; the loop handles n >> 1 pairs of pixels
 * through 64-bit loads and stores.
 */
472 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
474 int width = info->d_width;
475 int height = info->d_height;
476 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* the >> 2 turns the skips into counts of Uint32 elements, matching the pointer types */
477 int srcskip = info->s_skip >> 2;
478 Uint32 *dstp = (Uint32 *)info->d_pixels;
479 int dstskip = info->d_skip >> 2;
480 Uint32 dalpha = info->dst->Amask;
482 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
484 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
485 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
486 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
/* scalar form of the blend, one pixel */
493 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
494 + (s & d & 0x00010101)) | dalpha;
/* vector form of the same blend, two pixels per iteration */
498 for (n >>= 1; n > 0; --n) {
499 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
500 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
502 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
503 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
505 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
506 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
507 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
508 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
510 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
511 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
512 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
513 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
515 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
526 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * __m64 intrinsics variant of the per-surface-alpha blend for 32-bit pixels.
 * When info->src->alpha is exactly 128 and the destination R, G and B masks
 * together cover the low 24 bits, the work is handed to
 * BlitRGBtoRGBSurfaceAlpha128MMX.  Otherwise each pixel is expanded to four
 * 16-bit lanes and every colour lane is computed as
 *   dst + (((src - dst) * alpha) >> 8).
 * The multiplier (mm_alpha) is built from alpha replicated into all four
 * bytes and masked with the destination channel mask, so the lane that is
 * not a colour channel is multiplied by zero; the destination alpha mask
 * (df->Amask) is ORed into each result.  There is a one-pixel sequence and
 * a loop that handles two pixels per iteration.
 */
527 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
529 SDL_PixelFormat* df = info->dst;
530 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
531 unsigned alpha = info->src->alpha;
533 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
534 /* only call a128 version when R,G,B occupy lower bits */
535 BlitRGBtoRGBSurfaceAlpha128MMX(info);
537 int width = info->d_width;
538 int height = info->d_height;
539 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* the >> 2 turns the skips into counts of Uint32 elements, matching the pointer types */
540 int srcskip = info->s_skip >> 2;
541 Uint32 *dstp = (Uint32 *)info->d_pixels;
542 int dstskip = info->d_skip >> 2;
543 Uint32 dalpha = df->Amask;
546 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
548 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
549 /* form the alpha mult */
550 amult = alpha | (alpha << 8);
551 amult = amult | (amult << 16);
/* chanmask is rebuilt here from the channel shifts, replacing the mask-based value set at its declaration */
552 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
553 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
554 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
555 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
556 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
561 /* One Pixel Blend */
562 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
563 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
565 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
566 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
568 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
569 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
570 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
571 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
573 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
574 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
575 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
583 for (n >>= 1; n > 0; --n) {
584 /* Two Pixels Blend */
585 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
586 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
587 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
588 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
590 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
591 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
592 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
593 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
595 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
596 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
597 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
598 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
600 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
601 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
602 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
603 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
605 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
606 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
608 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
620 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * __m64 intrinsics variant of the per-pixel-alpha blend for 32-bit pixels.
 * The alpha of each pixel is *srcp & sf->Amask.  A pixel whose alpha equals
 * the full mask is copied: its colour channels replace the destination's
 * while the destination bits outside chanmask are kept.  Other pixels are
 * expanded to 16-bit lanes and every colour lane is computed as
 *   dst + (((src - dst) * alpha) >> 8),
 * with the multiplier for the alpha lane forced to zero (dmask) so that the
 * destination alpha survives the addition.
 */
621 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
623 int width = info->d_width;
624 int height = info->d_height;
625 Uint32 *srcp = (Uint32 *)info->s_pixels;
626 int srcskip = info->s_skip >> 2;
627 Uint32 *dstp = (Uint32 *)info->d_pixels;
628 int dstskip = info->d_skip >> 2;
629 SDL_PixelFormat* sf = info->src;
630 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
631 Uint32 amask = sf->Amask;
632 Uint32 ashift = sf->Ashift;
635 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
637 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* once bytes are widened to 16-bit lanes, the byte at bit ashift sits at bit ashift * 2;
   clearing that 16-bit lane gives a multiplier mask that skips the alpha channel */
638 multmask = ~(0xFFFFi64 << (ashift * 2));
639 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
643 Uint32 alpha = *srcp & amask;
646 } else if (alpha == amask) {
647 /* opaque alpha -- copy RGB, keep dst alpha */
648 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
650 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
651 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
653 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
654 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
656 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (still in its mask position) */
657 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
658 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
659 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
660 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
663 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
664 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
665 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
666 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
667 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
669 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
679 /* End MSVC_ASMBLIT */
681 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
683 #if SDL_ALTIVEC_BLITTERS
685 #pragma altivec_model on
/*
 * Vector literal helpers.  VECUINT8_LITERAL takes sixteen byte values and
 * VECUINT16_LITERAL takes eight 16-bit values.  Under the condition below
 * (Mac OS X with a GCC older than version 4) the literal is written with
 * parentheses; the other pair of definitions uses the brace-enclosed form.
 */
692 #if (defined(__MACOSX__) && (__GNUC__ < 4))
693 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
694 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
695 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
696 (vector unsigned short) ( a,b,c,d,e,f,g,h )
698 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
699 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
700 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
701 (vector unsigned short) { a,b,c,d,e,f,g,h }
/* Non-zero when the low four bits of the address are set, i.e. x is not 16-byte aligned. */
704 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug helper: prints msg followed by the vector's contents as four 32-bit hex words. */
705 #define VECPRINT(msg, v) do { \
706 vector unsigned int tmpvec = (vector unsigned int)(v); \
707 unsigned int *vp = (unsigned int *)&tmpvec; \
708 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
711 /* the permutation vector that takes the high bytes out of all the appropriate shorts
712 (vector unsigned char)(
713 0x00, 0x10, 0x02, 0x12,
714 0x04, 0x14, 0x06, 0x16,
715 0x08, 0x18, 0x0A, 0x1A,
716 0x0C, 0x1C, 0x0E, 0x1E );
/*
 * VEC_MERGE_PERMUTE() builds its permute vector at run time by adding a
 * vector of 16-bit 0x000F values (viewed as bytes) to the result of
 * vec_lvsl(0, NULL).
 * VEC_U32_24() is 12 + 12 = 24 in every 32-bit element.
 * VEC_ALPHA_MASK() shifts an all-ones vector left by 24 bits within each
 * 32-bit element, leaving only the top byte of every element set.
 * VEC_ALIGNER(src) chooses a permute vector depending on whether src is
 * 16-byte aligned (UNALIGNED_PTR); it is used together with vec_perm on
 * pairs of loaded vectors by the blitters in this file.
 */
718 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
719 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
720 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
721 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
723 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
/*
 * VEC_MULTIPLY_ALPHA blends vs over vd and leaves the result in vd.
 * Even and odd bytes are multiplied separately into 16-bit products
 * (vec_mule / vec_mulo): source bytes by valpha, destination bytes by the
 * bitwise complement of valpha (255 - alpha).  The two products are added,
 * then each 16-bit sum x is adjusted to (x + 1) + ((x + 1) >> 8), and
 * mergePermute selects bytes from the two adjusted vectors to rebuild a
 * byte vector.  Callers in this file pass vectors of 16-bit 1s and 8s for
 * v1_16 and v8_16.
 */
726 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
727 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
728 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
729 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
730 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
731 /* valpha2 is 255-alpha */ \
732 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
733 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
734 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
735 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
736 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
737 /* add source and dest */ \
738 vtemp1 = vec_add(vtemp1, vtemp3); \
739 vtemp2 = vec_add(vtemp2, vtemp4); \
740 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
741 vtemp1 = vec_add(vtemp1, v1_16); \
742 vtemp3 = vec_sr(vtemp1, v8_16); \
743 vtemp1 = vec_add(vtemp1, vtemp3); \
744 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
745 vtemp2 = vec_add(vtemp2, v1_16); \
746 vtemp4 = vec_sr(vtemp2, v8_16); \
747 vtemp2 = vec_add(vtemp2, vtemp4); \
748 /* (>>8) and get ARGBARGBARGBARGB */ \
749 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
752 /* Calculate the permute vector used for 32->32 swizzling */
/*
 * Builds a byte-permute vector (vswiz) that rearranges the four bytes of
 * each 32-bit pixel from the srcfmt channel layout to the dstfmt layout.
 * For every colour channel, RESHIFT of the source shift gives a byte index,
 * which is then placed at the channel's position in the destination word
 * (<< destination shift).  The combined 32-bit pattern is replicated into
 * every element, and `plus` adds 0, 4, 8 and 12 so each element addresses
 * its own pixel.
 * srcfmt / dstfmt can be replaced by default_pixel_format, whose masks are
 * 0x00FF0000 / 0x0000FF00 / 0x000000FF / 0xFF000000.
 * NOTE(review): presumably this happens when the argument is NULL, since
 * the callers in this file pass NULL for one side -- confirm.
 * For alpha, the index is either derived from the source alpha shift or is
 * 0x10; 0x10101010 masked with the inverse of the destination RGB masks is
 * the alternative assignment.
 * NOTE(review): index 0x10 presumably selects from the second vec_perm
 * operand at the call sites -- confirm.
 */
753 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
754 const SDL_PixelFormat *dstfmt)
757 * We have to assume that the bits that aren't used by other
758 * colors is alpha, and it's one complete byte, since some formats
759 * leave alpha with a zero mask, but we should still swizzle the bits.
762 const static struct SDL_PixelFormat default_pixel_format = {
766 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
769 srcfmt = &default_pixel_format;
772 dstfmt = &default_pixel_format;
774 const vector unsigned char plus = VECUINT8_LITERAL
775 ( 0x00, 0x00, 0x00, 0x00,
776 0x04, 0x04, 0x04, 0x04,
777 0x08, 0x08, 0x08, 0x08,
778 0x0C, 0x0C, 0x0C, 0x0C );
779 vector unsigned char vswiz;
780 vector unsigned int srcvec;
/* RESHIFT turns a bit shift X into 3 - X / 8, a byte index counted from the most significant byte */
781 #define RESHIFT(X) (3 - ((X) >> 3))
782 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
783 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
784 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
786 /* Use zero for alpha if either surface doesn't have alpha */
788 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
790 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
/* store the 32-bit index pattern in element 0, replicate it, then add the per-pixel byte offsets */
793 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
794 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
/*
 * Blends 32-bit source pixels, using their per-pixel alpha, onto a 16-bit
 * destination whose pixels are decoded as 5-6-5 (see the shifts and masks
 * in ONE_PIXEL_BLEND).
 * ONE_PIXEL_BLEND handles single pixels: first while dst is not 16-byte
 * aligned, then for the width % 8 pixels left over after the vector part.
 * The vector part works on eight pixels at a time: two 16-byte source
 * loads (re-aligned with valigner and swizzled with vpermute), one 16-byte
 * destination load that is expanded into two ARGB vectors through the
 * vredalpha / vblue / vgreen permutes, two VEC_MULTIPLY_ALPHA blends, and a
 * repack to 16 bits via vec_packpx plus a separate green merge.
 */
798 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
800 int height = info->d_height;
801 Uint8 *src = (Uint8 *)info->s_pixels;
802 int srcskip = info->s_skip;
803 Uint8 *dst = (Uint8 *)info->d_pixels;
804 int dstskip = info->d_skip;
805 SDL_PixelFormat *srcfmt = info->src;
807 vector unsigned char v0 = vec_splat_u8(0);
808 vector unsigned short v8_16 = vec_splat_u16(8);
809 vector unsigned short v1_16 = vec_splat_u16(1);
810 vector unsigned short v2_16 = vec_splat_u16(2);
811 vector unsigned short v3_16 = vec_splat_u16(3);
812 vector unsigned int v8_32 = vec_splat_u32(8);
813 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
814 vector unsigned short v3f = VECUINT16_LITERAL(
815 0x003f, 0x003f, 0x003f, 0x003f,
816 0x003f, 0x003f, 0x003f, 0x003f);
817 vector unsigned short vfc = VECUINT16_LITERAL(
818 0x00fc, 0x00fc, 0x00fc, 0x00fc,
819 0x00fc, 0x00fc, 0x00fc, 0x00fc);
822 0x10 - 0x1f is the alpha
823 0x00 - 0x0e evens are the red
824 0x01 - 0x0f odds are zero
826 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
827 0x10, 0x00, 0x01, 0x01,
828 0x10, 0x02, 0x01, 0x01,
829 0x10, 0x04, 0x01, 0x01,
830 0x10, 0x06, 0x01, 0x01
832 vector unsigned char vredalpha2 = (vector unsigned char)(
833 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
836 0x00 - 0x0f is ARxx ARxx ARxx ARxx
837 0x11 - 0x0f odds are blue
839 vector unsigned char vblue1 = VECUINT8_LITERAL(
840 0x00, 0x01, 0x02, 0x11,
841 0x04, 0x05, 0x06, 0x13,
842 0x08, 0x09, 0x0a, 0x15,
843 0x0c, 0x0d, 0x0e, 0x17
845 vector unsigned char vblue2 = (vector unsigned char)(
846 vec_add((vector unsigned int)vblue1, v8_32)
849 0x00 - 0x0f is ARxB ARxB ARxB ARxB
850 0x10 - 0x0e evens are green
852 vector unsigned char vgreen1 = VECUINT8_LITERAL(
853 0x00, 0x01, 0x10, 0x03,
854 0x04, 0x05, 0x12, 0x07,
855 0x08, 0x09, 0x14, 0x0b,
856 0x0c, 0x0d, 0x16, 0x0f
858 vector unsigned char vgreen2 = (vector unsigned char)(
859 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
861 vector unsigned char vgmerge = VECUINT8_LITERAL(
862 0x00, 0x02, 0x00, 0x06,
863 0x00, 0x0a, 0x00, 0x0e,
864 0x00, 0x12, 0x00, 0x16,
865 0x00, 0x1a, 0x00, 0x1e);
866 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
867 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
868 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOTE(review): vec_splat_u8(-7) gives 0xF9 in every byte, so after the shift
   each 16-bit element is 0xF900 rather than the 0xF800 the name suggests --
   confirm whether -8 was intended */
870 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
871 vf800 = vec_sl(vf800, vec_splat_u16(8));
875 vector unsigned char valigner;
876 vector unsigned char vsrc;
877 vector unsigned char voverflow;
878 int width = info->d_width;
880 #define ONE_PIXEL_BLEND(condition, widthvar) \
881 while (condition) { \
883 unsigned sR, sG, sB, dR, dG, dB, sA; \
884 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
886 unsigned short dstpixel = *((unsigned short *)dst); \
887 dR = (dstpixel >> 8) & 0xf8; \
888 dG = (dstpixel >> 3) & 0xfc; \
889 dB = (dstpixel << 3) & 0xf8; \
890 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
891 *((unsigned short *)dst) = ( \
892 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
899 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
900 extrawidth = (width % 8);
901 valigner = VEC_ALIGNER(src);
902 vsrc = (vector unsigned char)vec_ld(0, src);
905 vector unsigned char valpha;
906 vector unsigned char vsrc1, vsrc2;
907 vector unsigned char vdst1, vdst2;
908 vector unsigned short vR, vG, vB;
909 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
911 /* Load 8 pixels from src as ARGB */
912 voverflow = (vector unsigned char)vec_ld(15, src);
913 vsrc = vec_perm(vsrc, voverflow, valigner);
914 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
916 vsrc = (vector unsigned char)vec_ld(15, src);
917 voverflow = vec_perm(voverflow, vsrc, valigner);
918 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
921 /* Load 8 pixels from dst as XRGB */
922 voverflow = vec_ld(0, dst);
923 vR = vec_and((vector unsigned short)voverflow, vf800);
924 vB = vec_sl((vector unsigned short)voverflow, v3_16);
925 vG = vec_sl(vB, v2_16);
926 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
927 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
928 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
929 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
930 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
931 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
933 /* Alpha blend 8 pixels as ARGB */
934 valpha = vec_perm(vsrc1, v0, valphaPermute);
935 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
936 valpha = vec_perm(vsrc2, v0, valphaPermute);
937 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
939 /* Convert 8 pixels to 565 */
940 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
941 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
942 vgpixel = vec_and(vgpixel, vfc);
943 vgpixel = vec_sl(vgpixel, v3_16);
944 vrpixel = vec_sl(vpixel, v1_16);
945 vrpixel = vec_and(vrpixel, vf800);
946 vbpixel = vec_and(vpixel, v3f);
947 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
948 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
951 vec_st(vdst1, 0, dst);
956 ONE_PIXEL_BLEND((extrawidth), extrawidth);
957 #undef ONE_PIXEL_BLEND
/*
 * Colour-keyed 32-bit to 32-bit blend using the per-surface alpha in
 * info->src->alpha.
 * ONE_PIXEL_BLEND handles single pixels: first while dstp is not 16-byte
 * aligned, then for the width % 4 pixels left over after the vector part.
 * It blends only when the alpha is non-zero and the pixel differs from the
 * colour key, and writes dA as the destination alpha (SDL_ALPHA_OPAQUE when
 * the destination format has an alpha mask, otherwise 0).
 * The vector part works on four pixels at a time: the source is loaded and
 * re-aligned, vsel marks the elements whose RGB bits (source & rgbmask)
 * equal the colour key, source and destination are swizzled, blended with
 * VEC_MULTIPLY_ALPHA, given a full alpha byte, and finally vec_sel puts the
 * original destination pixels back wherever vsel is set.
 */
963 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
965 unsigned alpha = info->src->alpha;
966 int height = info->d_height;
967 Uint32 *srcp = (Uint32 *)info->s_pixels;
968 int srcskip = info->s_skip >> 2;
969 Uint32 *dstp = (Uint32 *)info->d_pixels;
970 int dstskip = info->d_skip >> 2;
971 SDL_PixelFormat *srcfmt = info->src;
972 SDL_PixelFormat *dstfmt = info->dst;
973 unsigned sA = srcfmt->alpha;
974 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
975 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
976 Uint32 ckey = info->src->colorkey;
977 vector unsigned char mergePermute;
978 vector unsigned char vsrcPermute;
979 vector unsigned char vdstPermute;
980 vector unsigned char vsdstPermute;
981 vector unsigned char valpha;
982 vector unsigned char valphamask;
983 vector unsigned char vbits;
984 vector unsigned char v0;
985 vector unsigned short v1;
986 vector unsigned short v8;
987 vector unsigned int vckey;
988 vector unsigned int vrgbmask;
990 mergePermute = VEC_MERGE_PERMUTE();
991 v0 = vec_splat_u8(0);
992 v1 = vec_splat_u16(1);
993 v8 = vec_splat_u16(8);
995 /* set the alpha to 255 on the destination surf */
996 valphamask = VEC_ALPHA_MASK();
998 vsrcPermute = calc_swizzle32(srcfmt, NULL);
999 vdstPermute = calc_swizzle32(NULL, dstfmt);
1000 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1002 /* set a vector full of alpha and 255-alpha */
1003 ((unsigned char *)&valpha)[0] = alpha;
1004 valpha = vec_splat(valpha, 0);
1005 vbits = (vector unsigned char)vec_splat_s8(-1);
/* replicate the colour key and the RGB mask into every 32-bit element */
1008 ((unsigned int *)(char*)&vckey)[0] = ckey;
1009 vckey = vec_splat(vckey, 0);
1010 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1011 vrgbmask = vec_splat(vrgbmask, 0);
1014 int width = info->d_width;
1015 #define ONE_PIXEL_BLEND(condition, widthvar) \
1016 while (condition) { \
1018 unsigned sR, sG, sB, dR, dG, dB; \
1019 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1020 if(sA && Pixel != ckey) { \
1021 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1022 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1023 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1024 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1030 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1032 int extrawidth = (width % 4);
1033 vector unsigned char valigner = VEC_ALIGNER(srcp);
1034 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1035 width -= extrawidth;
1037 vector unsigned char vsel;
1038 vector unsigned char voverflow;
1039 vector unsigned char vd;
1040 vector unsigned char vd_orig;
1043 voverflow = (vector unsigned char)vec_ld(15, srcp);
1044 vs = vec_perm(vs, voverflow, valigner);
1046 /* vsel is set for items that match the key */
1047 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1048 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1050 /* permute to source format */
1051 vs = vec_perm(vs, valpha, vsrcPermute);
1054 vd = (vector unsigned char)vec_ld(0, dstp);
1055 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1057 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1059 /* set the alpha channel to full on */
1060 vd = vec_or(vd, valphamask);
1062 /* mask out color key */
1063 vd = vec_sel(vd, vd_orig, vsel);
1065 /* permute to dest format */
1066 vd = vec_perm(vd, vbits, vdstPermute);
1069 vec_st((vector unsigned int)vd, 0, dstp);
1076 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1078 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: blends 32-bit source pixels onto a 32-bit destination
 * using the alpha value carried by each source pixel.
 *
 * Each row is processed in three phases: ONE_PIXEL_BLEND (scalar) while
 * UNALIGNED_PTR(dstp) holds, then a vector phase that loads, blends and
 * stores whole vectors, then ONE_PIXEL_BLEND again for the width % 4
 * leftover pixels.  In the vector phase the destination's alpha bytes are
 * saved before the blend and OR-ed back afterwards.
 *
 * info supplies the pixel pointers, the blit size, the row skips and the
 * source/destination pixel formats.
 *
 * NOTE(review): this listing does not show every line of the function
 * (braces, loop headers and pointer increments are among the gaps); the
 * comments describe only the statements that are visible.
 */
1086 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1088 int width = info->d_width;
1089 int height = info->d_height;
1090 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are shifted down by 2 to match the Uint32 pointers (presumably bytes -> pixels) */
1091 int srcskip = info->s_skip >> 2;
1092 Uint32 *dstp = (Uint32 *)info->d_pixels;
1093 int dstskip = info->d_skip >> 2;
1094 SDL_PixelFormat *srcfmt = info->src;
1095 SDL_PixelFormat *dstfmt = info->dst;
1096 vector unsigned char mergePermute;
1097 vector unsigned char valphaPermute;
1098 vector unsigned char vsrcPermute;
1099 vector unsigned char vdstPermute;
1100 vector unsigned char vsdstPermute;
1101 vector unsigned char valphamask;
1102 vector unsigned char vpixelmask;
1103 vector unsigned char v0;
1104 vector unsigned short v1;
1105 vector unsigned short v8;
/* constants: an all-zero byte vector, and the shorts 1 and 8 that are handed to VEC_MULTIPLY_ALPHA */
1107 v0 = vec_splat_u8(0);
1108 v1 = vec_splat_u16(1);
1109 v8 = vec_splat_u16(8);
1110 mergePermute = VEC_MERGE_PERMUTE();
1111 valphamask = VEC_ALPHA_MASK();
/* lvsl(0, NULL) & 0xC: a permute pattern that repeats one byte index per 32-bit lane;
   it is used below to spread each pixel's alpha byte over that pixel */
1112 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* nor with zero == bitwise NOT: every byte that is not covered by valphamask */
1113 vpixelmask = vec_nor(valphamask, v0);
/* vsrcPermute is applied to loaded source pixels, vsdstPermute to loaded
   destination pixels, and vdstPermute to the result just before it is stored */
1114 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1115 vdstPermute = calc_swizzle32(NULL, dstfmt);
1116 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1118 while ( height-- ) {
1119 width = info->d_width;
1120 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1122 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1123 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1125 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1126 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1127 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1133 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* leftover pixels that do not fill a whole vector are blended by the scalar path at the end */
1137 int extrawidth = (width % 4);
1138 vector unsigned char valigner = VEC_ALIGNER(srcp);
1139 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1140 width -= extrawidth;
1142 vector unsigned char voverflow;
1143 vector unsigned char vd;
1144 vector unsigned char valpha;
1145 vector unsigned char vdstalpha;
/* second source load, merged with the first through valigner
   (presumably to cope with an unaligned srcp -- confirm against VEC_ALIGNER) */
1147 voverflow = (vector unsigned char)vec_ld(15, srcp);
1148 vs = vec_perm(vs, voverflow, valigner);
1149 vs = vec_perm(vs, v0, vsrcPermute);
1151 valpha = vec_perm(vs, v0, valphaPermute);
1154 vd = (vector unsigned char)vec_ld(0, dstp);
1155 vd = vec_perm(vd, v0, vsdstPermute);
/* keep the destination's alpha bytes so they can be put back after the blend */
1156 vdstalpha = vec_and(vd, valphamask);
1158 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1160 /* set the alpha to the dest alpha */
1161 vd = vec_and(vd, vpixelmask);
1162 vd = vec_or(vd, vdstalpha);
1163 vd = vec_perm(vd, v0, vdstPermute);
1166 vec_st((vector unsigned int)vd, 0, dstp);
1174 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1178 #undef ONE_PIXEL_BLEND
/*
 * AltiVec version of the per-pixel-alpha blend for the fixed layout named
 * in the comment below; unlike Blit32to32PixelAlphaAltivec no swizzle
 * vectors are built, pixels are used as loaded.
 *
 * Scalar path (ONE_PIXEL_BLEND): alpha is taken from the top byte of the
 * source pixel (s >> 24).  A fully opaque source pixel replaces the low 24
 * bits of the destination and keeps the destination's top byte; otherwise
 * the 0xff00ff channels and the 0xff00 channel are blended separately with
 * a >> 8 scale and the destination's top byte is preserved.
 * Vector path: same idea, with the destination alpha saved in vdstalpha
 * and OR-ed back after VEC_MULTIPLY_ALPHA.
 *
 * NOTE(review): this listing does not show every line of the function
 * (braces, loop headers, some declarations and pointer increments are
 * missing); the comments describe only the visible statements.
 */
1182 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1183 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1185 int width = info->d_width;
1186 int height = info->d_height;
1187 Uint32 *srcp = (Uint32 *)info->s_pixels;
1188 int srcskip = info->s_skip >> 2;
1189 Uint32 *dstp = (Uint32 *)info->d_pixels;
1190 int dstskip = info->d_skip >> 2;
1191 vector unsigned char mergePermute;
1192 vector unsigned char valphaPermute;
1193 vector unsigned char valphamask;
1194 vector unsigned char vpixelmask;
1195 vector unsigned char v0;
1196 vector unsigned short v1;
1197 vector unsigned short v8;
1198 v0 = vec_splat_u8(0);
1199 v1 = vec_splat_u16(1);
1200 v8 = vec_splat_u16(8);
1201 mergePermute = VEC_MERGE_PERMUTE();
1202 valphamask = VEC_ALPHA_MASK();
/* permute pattern that repeats one byte index per 32-bit lane; used to
   spread each source pixel's alpha byte over the whole pixel */
1203 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* nor with zero == bitwise NOT of the alpha mask */
1206 vpixelmask = vec_nor(valphamask, v0);
1208 width = info->d_width;
1209 #define ONE_PIXEL_BLEND(condition, widthvar) \
1210 while ((condition)) { \
1216 Uint32 alpha = s >> 24; \
1218 if(alpha == SDL_ALPHA_OPAQUE) { \
1219 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1222 dalpha = d & 0xff000000; \
1223 s1 = s & 0xff00ff; \
1224 d1 = d & 0xff00ff; \
1225 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1228 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1229 *dstp = d1 | d | dalpha; \
1236 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole vector are left for the trailing scalar pass */
1238 int extrawidth = (width % 4);
1239 vector unsigned char valigner = VEC_ALIGNER(srcp);
1240 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1241 width -= extrawidth;
1243 vector unsigned char voverflow;
1244 vector unsigned char vd;
1245 vector unsigned char valpha;
1246 vector unsigned char vdstalpha;
/* second source load, merged with the first through valigner
   (presumably to cope with an unaligned srcp -- confirm against VEC_ALIGNER) */
1248 voverflow = (vector unsigned char)vec_ld(15, srcp);
1249 vs = vec_perm(vs, voverflow, valigner);
1251 valpha = vec_perm(vs, v0, valphaPermute);
1254 vd = (vector unsigned char)vec_ld(0, dstp);
/* keep the destination's alpha bytes so they can be put back after the blend */
1255 vdstalpha = vec_and(vd, valphamask);
1257 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1259 /* set the alpha to the dest alpha */
1260 vd = vec_and(vd, vpixelmask);
1261 vd = vec_or(vd, vdstalpha);
1264 vec_st((vector unsigned int)vd, 0, dstp);
1271 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1276 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter: blends 32-bit source pixels onto a 32-bit destination
 * with one alpha value for the whole surface (info->src->alpha), for
 * arbitrary channel layouts (pixels go through permute vectors obtained
 * from calc_swizzle32()).
 *
 * The scalar path (ONE_PIXEL_BLEND) writes dA as the destination alpha:
 * SDL_ALPHA_OPAQUE when the destination format has an alpha mask, else 0.
 * The vector path ORs valphamask into the result before storing it.
 *
 * NOTE(review): this listing does not show every line of the function
 * (braces, loop headers and pointer increments are among the gaps); the
 * comments describe only the visible statements.
 */
1279 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1282 unsigned alpha = info->src->alpha;
1283 int height = info->d_height;
1284 Uint32 *srcp = (Uint32 *)info->s_pixels;
1285 int srcskip = info->s_skip >> 2;
1286 Uint32 *dstp = (Uint32 *)info->d_pixels;
1287 int dstskip = info->d_skip >> 2;
1288 SDL_PixelFormat *srcfmt = info->src;
1289 SDL_PixelFormat *dstfmt = info->dst;
/* sA is the same surface alpha as `alpha` above; it is the name the scalar macro uses */
1290 unsigned sA = srcfmt->alpha;
1291 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1292 vector unsigned char mergePermute;
1293 vector unsigned char vsrcPermute;
1294 vector unsigned char vdstPermute;
1295 vector unsigned char vsdstPermute;
1296 vector unsigned char valpha;
1297 vector unsigned char valphamask;
1298 vector unsigned char vbits;
1299 vector unsigned short v1;
1300 vector unsigned short v8;
1302 mergePermute = VEC_MERGE_PERMUTE();
1303 v1 = vec_splat_u16(1);
1304 v8 = vec_splat_u16(8);
1306 /* set the alpha to 255 on the destination surf */
1307 valphamask = VEC_ALPHA_MASK();
/* vsrcPermute is applied to loaded source pixels, vsdstPermute to loaded
   destination pixels, and vdstPermute to the result just before it is stored */
1309 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1310 vdstPermute = calc_swizzle32(NULL, dstfmt);
1311 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1313 /* set a vector full of alpha and 255-alpha */
/* the visible code writes alpha into byte 0 and splats that byte across the vector */
1314 ((unsigned char *)&valpha)[0] = alpha;
1315 valpha = vec_splat(valpha, 0);
/* all-ones vector, used as the second operand of the final permute */
1316 vbits = (vector unsigned char)vec_splat_s8(-1);
1319 int width = info->d_width;
1320 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1322 unsigned sR, sG, sB, dR, dG, dB; \
1323 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1324 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1325 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1326 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1331 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole vector are left for the trailing scalar pass */
1333 int extrawidth = (width % 4);
1334 vector unsigned char valigner = VEC_ALIGNER(srcp);
1335 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1336 width -= extrawidth;
1338 vector unsigned char voverflow;
1339 vector unsigned char vd;
/* second source load, merged with the first through valigner
   (presumably to cope with an unaligned srcp -- confirm against VEC_ALIGNER) */
1342 voverflow = (vector unsigned char)vec_ld(15, srcp);
1343 vs = vec_perm(vs, voverflow, valigner);
1344 vs = vec_perm(vs, valpha, vsrcPermute);
1347 vd = (vector unsigned char)vec_ld(0, dstp);
1348 vd = vec_perm(vd, vd, vsdstPermute);
1350 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1352 /* set the alpha channel to full on */
1353 vd = vec_or(vd, valphamask);
1354 vd = vec_perm(vd, vbits, vdstPermute);
1357 vec_st((vector unsigned int)vd, 0, dstp);
1364 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1366 #undef ONE_PIXEL_BLEND
/*
 * AltiVec version of the per-surface-alpha blend for the fixed layout
 * named in the comment below; no swizzle vectors are built.
 *
 * The surface alpha (info->src->alpha) is splatted into valpha.  The
 * scalar path blends the 0xff00ff channels and the 0xff00 channel
 * separately with a >> 8 scale and forces the top byte of the result to
 * 0xff; the vector path ORs valphamask into the blended vector.
 *
 * NOTE(review): this listing does not show every line of the function
 * (braces, loop headers, some declarations, the tail of one macro line and
 * the pointer increments are missing); the comments describe only the
 * visible statements.
 */
1375 /* fast RGB888->(A)RGB888 blending */
1376 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1378 unsigned alpha = info->src->alpha;
1379 int height = info->d_height;
1380 Uint32 *srcp = (Uint32 *)info->s_pixels;
1381 int srcskip = info->s_skip >> 2;
1382 Uint32 *dstp = (Uint32 *)info->d_pixels;
1383 int dstskip = info->d_skip >> 2;
1384 vector unsigned char mergePermute;
1385 vector unsigned char valpha;
1386 vector unsigned char valphamask;
1387 vector unsigned short v1;
1388 vector unsigned short v8;
1390 mergePermute = VEC_MERGE_PERMUTE();
1391 v1 = vec_splat_u16(1);
1392 v8 = vec_splat_u16(8);
1394 /* set the alpha to 255 on the destination surf */
1395 valphamask = VEC_ALPHA_MASK();
1397 /* set a vector full of alpha and 255-alpha */
/* the visible code writes alpha into byte 0 and splats that byte across the vector */
1398 ((unsigned char *)&valpha)[0] = alpha;
1399 valpha = vec_splat(valpha, 0);
1402 int width = info->d_width;
1403 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1406 Uint32 s1 = s & 0xff00ff; \
1407 Uint32 d1 = d & 0xff00ff; \
1408 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1412 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1413 *dstp = d1 | d | 0xff000000; \
1418 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole vector are left for the trailing scalar pass */
1420 int extrawidth = (width % 4);
1421 vector unsigned char valigner = VEC_ALIGNER(srcp);
1422 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1423 width -= extrawidth;
1425 vector unsigned char voverflow;
1426 vector unsigned char vd;
/* second source load, merged with the first through valigner
   (presumably to cope with an unaligned srcp -- confirm against VEC_ALIGNER) */
1429 voverflow = (vector unsigned char)vec_ld(15, srcp);
1430 vs = vec_perm(vs, voverflow, valigner);
1433 vd = (vector unsigned char)vec_ld(0, dstp);
1435 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1437 /* set the alpha channel to full on */
1438 vd = vec_or(vd, valphamask);
1441 vec_st((vector unsigned int)vd, 0, dstp);
1448 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1450 #undef ONE_PIXEL_BLEND
1457 #pragma altivec_model off
1459 #endif /* SDL_ALTIVEC_BLITTERS */
/*
 * Portable C blitter for the half-transparent case: each destination pixel
 * becomes the per-channel average of source and destination, with the top
 * byte forced to 0xff.
 *
 * NOTE(review): the loop around the blend expression and the declarations
 * of s and d are not visible in this listing; s and d are presumably the
 * current source and destination pixels.
 */
1461 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1462 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1464 int width = info->d_width;
1465 int height = info->d_height;
1466 Uint32 *srcp = (Uint32 *)info->s_pixels;
1467 int srcskip = info->s_skip >> 2;
1468 Uint32 *dstp = (Uint32 *)info->d_pixels;
1469 int dstskip = info->d_skip >> 2;
/* 0x00fefefe drops the low bit of each 8-bit channel so the sum can be
   halved with one shift without bits crossing into the neighbouring
   channel; (s & d & 0x00010101) adds back the 1 that is lost when both
   low bits were set */
1475 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1476 + (s & d & 0x00010101)) | 0xff000000;
/*
 * Portable C blitter for per-surface alpha (info->src->alpha) on the
 * layout named in the comment below.
 *
 * The visible code hands the blit to BlitRGBtoRGBSurfaceAlpha128() on one
 * branch (the condition is on a line not shown here -- presumably
 * alpha == 128, TODO confirm) and otherwise runs DUFFS_LOOP_DOUBLE2 with a
 * one-pixel blend and a two-pixel blend.  Every store ORs 0xff000000 into
 * the result.
 *
 * NOTE(review): many lines of this function are missing from the listing
 * (braces, declarations of s, d, s1, d1, several loads and pointer
 * increments); the comments describe only the visible statements.
 */
1483 /* fast RGB888->(A)RGB888 blending with surface alpha */
1484 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1486 unsigned alpha = info->src->alpha;
1488 BlitRGBtoRGBSurfaceAlpha128(info);
1490 int width = info->d_width;
1491 int height = info->d_height;
1492 Uint32 *srcp = (Uint32 *)info->s_pixels;
1493 int srcskip = info->s_skip >> 2;
1494 Uint32 *dstp = (Uint32 *)info->d_pixels;
1495 int dstskip = info->d_skip >> 2;
1502 DUFFS_LOOP_DOUBLE2({
1503 /* One Pixel Blend */
1508 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1512 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1513 *dstp = d1 | d | 0xff000000;
1517 /* Two Pixels Blend */
1522 d1 += (s1 - d1) * alpha >> 8;
/* pack the 0xff00 channel of this pixel and of the next pixel (srcp[1],
   dstp[1]) into one word so both are blended by a single multiply */
1525 s = ((s & 0xff00) >> 8) |
1526 ((srcp[1] & 0xff00) << 8);
1527 d = ((d & 0xff00) >> 8) |
1528 ((dstp[1] & 0xff00) << 8);
1529 d += (s - d) * alpha >> 8;
1532 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1539 d1 += (s1 - d1) * alpha >> 8;
1542 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
/*
 * Portable C blitter for per-pixel alpha on the layout named in the
 * comment below.  Alpha is the top byte of the source pixel (s >> 24).
 * A fully opaque source pixel overwrites the low 24 bits of the
 * destination and keeps the destination's top byte; otherwise the
 * channels are blended with a >> 8 scale and the destination's top byte
 * (dalpha) is written back unchanged.
 *
 * NOTE(review): this listing does not show every line of the function
 * (loop, braces, declarations of s, d, s1, d1, dalpha and the pointer
 * increments are missing); the comments describe only the visible
 * statements.
 */
1552 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1553 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1555 int width = info->d_width;
1556 int height = info->d_height;
1557 Uint32 *srcp = (Uint32 *)info->s_pixels;
1558 int srcskip = info->s_skip >> 2;
1559 Uint32 *dstp = (Uint32 *)info->d_pixels;
1560 int dstskip = info->d_skip >> 2;
1569 Uint32 alpha = s >> 24;
1570 /* FIXME: Here we special-case opaque alpha since the
1571 compositing used (>>8 instead of /255) doesn't handle
1572 it correctly. Also special-case alpha=0 for speed?
1575 if(alpha == SDL_ALPHA_OPAQUE) {
1576 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1579 * take out the middle component (green), and process
1580 * the other two in parallel. One multiply less.
1583 dalpha = d & 0xff000000;
1586 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1589 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1590 *dstp = d1 | d | dalpha;
/*
 * GCC inline-assembly variant of the per-pixel-alpha blend.
 *
 * Register roles set up by the first asm statement and relied on by the
 * later ones: mm6 = zero, mm7 = multiplier mask that zeroes the alpha
 * word, mm4 = colour-channel mask, mm3 = inverse of the channel mask,
 * mm5 = alpha shift (sf->Ashift).
 *
 * Per pixel the source alpha is isolated with `*srcp & amask`.  When it
 * equals amask (fully opaque) the colour channels are copied and the
 * destination alpha kept; the general case computes
 * dst + (((src - dst) * alpha) >> 8) on 16-bit lanes.  The `else if`
 * implies an earlier branch that is not visible here (presumably the
 * alpha == 0 skip -- TODO confirm).
 *
 * NOTE(review): this listing does not show every line of the function
 * (loop, braces, `__asm__ volatile (` openers, the prefetch instructions
 * and the pointer increments are missing); the comments describe only the
 * visible statements.
 */
1602 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1603 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1605 int width = info->d_width;
1606 int height = info->d_height;
1607 Uint32 *srcp = (Uint32 *)info->s_pixels;
1608 int srcskip = info->s_skip >> 2;
1609 Uint32 *dstp = (Uint32 *)info->d_pixels;
1610 int dstskip = info->d_skip >> 2;
1611 SDL_PixelFormat* sf = info->src;
1612 Uint32 amask = sf->Amask;
1615 /* make mm6 all zeros. */
1616 "pxor %%mm6, %%mm6\n"
1618 /* Make a mask to preserve the alpha. */
1619 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1620 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1621 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1622 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1623 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1625 /* form channel masks */
1626 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1627 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1628 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1629 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1631 /* get alpha channel shift */
1632 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1634 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1644 : : "r" (srcp), "r" (dstp) );
1646 alpha = *srcp & amask;
1647 /* FIXME: Here we special-case opaque alpha since the
1648 compositing used (>>8 instead of /255) doesn't handle
1649 it correctly. Also special-case alpha=0 for speed?
1654 else if(alpha == amask) {
1655 /* opaque alpha -- copy RGB, keep dst alpha */
1656 /* using MMX here to free up regular registers for other things */
1658 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1659 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1660 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1661 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1662 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1663 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1665 : : "r" (srcp), "r" (dstp) );
1670 /* load in the source, and dst. */
1671 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1672 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1674 /* Move the src alpha into mm2 */
1676 /* if supporting pshufw */
1677 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1678 /*"psrlw $8, %%mm2\n" */
1682 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1683 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1684 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1685 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1687 /* move the colors into words. */
1688 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1689 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1692 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1695 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1696 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1697 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1699 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1701 "movd %%mm0, (%1)\n" /* result in mm0 */
1703 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1717 /* End GCC_ASMBLIT*/
/*
 * MSVC-intrinsics variant of the per-pixel-alpha blend.
 *
 * chanmask is the union of the source format's R, G and B masks; amask and
 * ashift describe its alpha channel.  dmask clears the 16-bit lane that
 * holds alpha after unpacking, so the multiply leaves the destination
 * alpha untouched.
 *
 * Per pixel: srcp + 16 and dstp + 16 are prefetched and the source alpha
 * is isolated with `*srcp & amask`.  When it equals amask the colour
 * channels are copied and the destination alpha kept; the general case
 * computes dst + (((src - dst) * alpha) >> 8) on 16-bit lanes.  The branch
 * before `else if` is not visible here (presumably the alpha == 0 skip --
 * TODO confirm).
 *
 * NOTE(review): this listing does not show every line of the function
 * (loop, braces, declarations of alpha and multmask, pointer increments
 * and the final state reset are missing); the comments describe only the
 * visible statements.
 */
1720 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1721 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1723 int width = info->d_width;
1724 int height = info->d_height;
1725 Uint32 *srcp = (Uint32 *)info->s_pixels;
1726 int srcskip = info->s_skip >> 2;
1727 Uint32 *dstp = (Uint32 *)info->d_pixels;
1728 int dstskip = info->d_skip >> 2;
1729 SDL_PixelFormat* sf = info->src;
1730 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1731 Uint32 amask = sf->Amask;
1732 Uint32 ashift = sf->Ashift;
1735 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1737 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* after unpacking bytes to 16-bit lanes a channel at bit offset ashift sits
   at bit offset ashift * 2, hence the doubled shift for the lane mask */
1738 multmask = ~(0xFFFFi64 << (ashift * 2));
1739 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1745 _m_prefetch(srcp + 16);
1746 _m_prefetch(dstp + 16);
1748 alpha = *srcp & amask;
1751 } else if (alpha == amask) {
1752 /* copy RGB, keep dst alpha */
1753 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1755 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1756 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1758 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1759 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1761 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1762 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1763 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1764 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1765 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1768 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1769 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1770 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1771 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1772 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1774 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1784 /* End MSVC_ASMBLIT */
1786 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/*
 * Blend a single 16 bit pixel at 50%.
 *
 * d, s  - destination and source pixel (16-bit values).
 * mask  - 16-bit mask with the lowest bit of every colour channel cleared
 *         (the callers in this file pass 0xf7de for RGB565 and 0xfbde for
 *         RGB555).
 *
 * Masking off each channel's low bit lets the sum be halved with a single
 * shift without bits leaking into the neighbouring channel; the final term
 * adds back the 1 that is lost when both low bits were set.
 * Every argument is parenthesised so expressions can be passed safely;
 * arguments are still evaluated more than once, so they must be free of
 * side effects.
 */
#define BLEND16_50(d, s, mask) \
	(((((s) & (mask)) + ((d) & (mask))) >> 1) + \
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * The 16-bit channel mask replicated into both halves of a 32-bit word.
 * The mask is widened to Uint32 before shifting: a Uint16 promotes to
 * signed int, and shifting a value such as 0xf7de left by 16 would overflow
 * that int (undefined behaviour).
 */
#define BLEND2x16_MASK(mask) \
	((Uint32)(mask) | ((Uint32)(mask) << 16))

/*
 * Blend two 16 bit pixels at 50%.
 *
 * d, s  - 32-bit words, each holding two packed 16-bit pixels.
 * mask  - the same 16-bit mask that BLEND16_50 takes.
 *
 * Each operand is halved before the addition so the sum cannot overflow
 * the 32-bit word.  Arguments are evaluated more than once and must be
 * free of side effects.
 */
#define BLEND2x16_50(d, s, mask) \
	((((s) & BLEND2x16_MASK(mask)) >> 1) + \
	 (((d) & BLEND2x16_MASK(mask)) >> 1) + \
	 ((s) & (d) & ~BLEND2x16_MASK(mask)))
/*
 * Blends 16-bit source pixels onto a 16-bit destination at 50%, using
 * BLEND16_50 for single pixels and BLEND2x16_50 for two pixels at once.
 *
 * info - pixel pointers, blit size and row skips.
 * mask - 16-bit channel mask forwarded unchanged to the BLEND macros.
 *
 * Two strategies are chosen by comparing bit 1 of the source and
 * destination addresses: when they differ, source words are read aligned
 * and the two halves needed for each destination word are stitched
 * together from the previous and current source word; when they agree,
 * both sides are read and written as whole 32-bit words, with single-pixel
 * handling at the odd ends.
 *
 * NOTE(review): this listing does not show every line of the function
 * (loops, braces, several declarations, width bookkeeping and pointer
 * increments are missing); the comments describe only the visible
 * statements.
 */
1799 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1801 int width = info->d_width;
1802 int height = info->d_height;
1803 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are shifted down by 1 to match the Uint16 pointers (presumably bytes -> pixels) */
1804 int srcskip = info->s_skip >> 1;
1805 Uint16 *dstp = (Uint16 *)info->d_pixels;
1806 int dstskip = info->d_skip >> 1;
/* true when exactly one of the two pointers sits on an odd 16-bit boundary */
1809 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1811 * Source and destination not aligned, pipeline it.
1812 * This is mostly a win for big blits but no loss for
1818 /* handle odd destination */
1819 if((uintptr_t)dstp & 2) {
1820 Uint16 d = *dstp, s = *srcp;
1821 *dstp = BLEND16_50(d, s, mask);
1826 srcp++; /* srcp is now 32-bit aligned */
1828 /* bootstrap pipeline with first halfword */
1829 prev_sw = ((Uint32 *)srcp)[-1];
1833 sw = *(Uint32 *)srcp;
1834 dw = *(Uint32 *)dstp;
/* join the trailing half of the previous source word with the leading half
   of the current one; which half is which depends on byte order */
1835 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1836 s = (prev_sw << 16) + (sw >> 16);
1838 s = (prev_sw >> 16) + (sw << 16);
1841 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1847 /* final pixel if any */
1849 Uint16 d = *dstp, s;
1850 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1851 s = (Uint16)prev_sw;
1853 s = (Uint16)(prev_sw >> 16);
1855 *dstp = BLEND16_50(d, s, mask);
1859 srcp += srcskip - 1;
1862 /* source and destination are aligned */
1865 /* first odd pixel? */
1866 if((uintptr_t)srcp & 2) {
1867 Uint16 d = *dstp, s = *srcp;
1868 *dstp = BLEND16_50(d, s, mask);
1873 /* srcp and dstp are now 32-bit aligned */
1876 Uint32 sw = *(Uint32 *)srcp;
1877 Uint32 dw = *(Uint32 *)dstp;
1878 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1884 /* last odd pixel? */
1886 Uint16 d = *dstp, s = *srcp;
1887 *dstp = BLEND16_50(d, s, mask);
/*
 * GCC MMX-macro variant: blends RGB565 pixels with one alpha value for the
 * whole surface (info->src->alpha).
 *
 * The visible code hands the blit to Blit16to16SurfaceAlpha128() with mask
 * 0xf7de on one branch (the condition is on a line not shown here --
 * presumably alpha == 128, TODO confirm).  Otherwise alpha has its low
 * three bits cleared and is reduced to 5 bits; single pixels are blended
 * in C by spreading green into the high half of a 32-bit word
 * (mask 0x07e0f81f), and groups of four pixels are blended with MMX, one
 * colour channel at a time, accumulating the result in mm1.
 *
 * MMX register roles in the visible code: mm0 = replicated alpha,
 * mm4 = green mask, mm7 = blue mask, mm2/mm3 = four source/destination
 * pixels, mm5/mm6 = per-channel scratch.
 *
 * NOTE(review): this listing does not show every line of the function
 * (loops, braces, declarations of load, s and d, the alpha shift inside
 * mm0, pointer increments and the final emms are missing); the comments
 * describe only the visible statements.
 */
1898 /* fast RGB565->RGB565 blending with surface alpha */
1899 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1901 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
1903 Blit16to16SurfaceAlpha128(info, 0xf7de);
1905 int width = info->d_width;
1906 int height = info->d_height;
1907 Uint16 *srcp = (Uint16 *)info->s_pixels;
1908 int srcskip = info->s_skip >> 1;
1909 Uint16 *dstp = (Uint16 *)info->d_pixels;
1910 int dstskip = info->d_skip >> 1;
1914 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1916 alpha >>= 3; /* downscale alpha to 5 bits */
1918 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1919 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1920 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1921 /* position alpha to allow for mullo and mulhi on diff channels
1922 to reduce the number of operations */
1925 /* Setup the 565 color channel masks */
1926 load = 0x07E007E007E007E0ULL;
1927 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1928 load = 0x001F001F001F001FULL;
1929 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1936 * shift out the middle component (green) to
1937 * the high 16 bits, and process all three RGB
1938 * components at the same time.
1940 s = (s | s << 16) & 0x07e0f81f;
1941 d = (d | d << 16) & 0x07e0f81f;
1942 d += (s - d) * alpha >> 5;
1944 *dstp++ = d | d >> 16;
1949 * shift out the middle component (green) to
1950 * the high 16 bits, and process all three RGB
1951 * components at the same time.
1953 s = (s | s << 16) & 0x07e0f81f;
1954 d = (d | d << 16) & 0x07e0f81f;
1955 d += (s - d) * alpha >> 5;
1957 *dstp++ = d | d >> 16;
1961 * shift out the middle component (green) to
1962 * the high 16 bits, and process all three RGB
1963 * components at the same time.
1965 s = (s | s << 16) & 0x07e0f81f;
1966 d = (d | d << 16) & 0x07e0f81f;
1967 d += (s - d) * alpha >> 5;
1969 *dstp++ = d | d >> 16;
1971 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1972 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1974 /* red -- does not need a mask since the right shift clears
1975 the uninteresting bits */
1976 movq_r2r(mm2, mm5); /* src -> mm5 */
1977 movq_r2r(mm3, mm6); /* dst -> mm6 */
1978 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1979 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1982 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1983 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1984 /* alpha used is actually 11 bits
1985 11 + 5 = 16 bits, so the sign bits are lost */
1986 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1987 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1988 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1990 movq_r2r(mm6, mm1); /* save new reds in dsts */
1992 /* green -- process the bits in place */
1993 movq_r2r(mm2, mm5); /* src -> mm5 */
1994 movq_r2r(mm3, mm6); /* dst -> mm6 */
1995 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1996 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1999 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2000 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2001 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2002 bits are gone and the sign bits present */
2003 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2004 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2006 por_r2r(mm6, mm1); /* save new greens in dsts */
2009 movq_r2r(mm2, mm5); /* src -> mm5 */
2010 movq_r2r(mm3, mm6); /* dst -> mm6 */
2011 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2012 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2015 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2016 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2017 /* 11 + 5 = 16 bits, so the sign bits are lost and
2018 the interesting bits will need to be MASKed */
2019 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2020 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2021 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2023 por_r2r(mm6, mm1); /* save new blues in dsts */
2025 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
/*
 * GCC MMX-macro variant: blends RGB555 pixels with one alpha value for the
 * whole surface (info->src->alpha).
 *
 * The visible code hands the blit to Blit16to16SurfaceAlpha128() with mask
 * 0xfbde on one branch (the condition is on a line not shown here --
 * presumably alpha == 128, TODO confirm).  Otherwise alpha has its low
 * three bits cleared and is reduced to 5 bits; single pixels are blended
 * in C by spreading green into the high half of a 32-bit word
 * (mask 0x03e07c1f), and groups of four pixels are blended with MMX, one
 * colour channel at a time, accumulating the result in mm1.
 *
 * MMX register roles in the visible code: mm0 = replicated alpha,
 * mm4 = green mask (temporarily shifted left by 5 to serve as the red
 * mask, then shifted back), mm7 = blue mask, mm2/mm3 = four
 * source/destination pixels, mm5/mm6 = per-channel scratch.
 *
 * NOTE(review): this listing does not show every line of the function
 * (loops, braces, declarations of load, s and d, the alpha shift inside
 * mm0, pointer increments and the final emms are missing); the comments
 * describe only the visible statements.
 */
2037 /* fast RGB555->RGB555 blending with surface alpha */
2038 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2040 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
2042 Blit16to16SurfaceAlpha128(info, 0xfbde);
2044 int width = info->d_width;
2045 int height = info->d_height;
2046 Uint16 *srcp = (Uint16 *)info->s_pixels;
2047 int srcskip = info->s_skip >> 1;
2048 Uint16 *dstp = (Uint16 *)info->d_pixels;
2049 int dstskip = info->d_skip >> 1;
2053 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2055 alpha >>= 3; /* downscale alpha to 5 bits */
2057 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2058 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2059 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2060 /* position alpha to allow for mullo and mulhi on diff channels
2061 to reduce the number of operations */
2064 /* Setup the 555 color channel masks */
2065 load = 0x03E003E003E003E0ULL;
2066 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2067 load = 0x001F001F001F001FULL;
2068 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2075 * shift out the middle component (green) to
2076 * the high 16 bits, and process all three RGB
2077 * components at the same time.
2079 s = (s | s << 16) & 0x03e07c1f;
2080 d = (d | d << 16) & 0x03e07c1f;
2081 d += (s - d) * alpha >> 5;
2083 *dstp++ = d | d >> 16;
2088 * shift out the middle component (green) to
2089 * the high 16 bits, and process all three RGB
2090 * components at the same time.
2092 s = (s | s << 16) & 0x03e07c1f;
2093 d = (d | d << 16) & 0x03e07c1f;
2094 d += (s - d) * alpha >> 5;
2096 *dstp++ = d | d >> 16;
2100 * shift out the middle component (green) to
2101 * the high 16 bits, and process all three RGB
2102 * components at the same time.
2104 s = (s | s << 16) & 0x03e07c1f;
2105 d = (d | d << 16) & 0x03e07c1f;
2106 d += (s - d) * alpha >> 5;
2108 *dstp++ = d | d >> 16;
2110 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2111 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2113 /* red -- process the bits in place */
2114 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2115 /* by reusing the GREEN mask we free up another mmx
2116 register to accumulate the result */
2118 movq_r2r(mm2, mm5); /* src -> mm5 */
2119 movq_r2r(mm3, mm6); /* dst -> mm6 */
2120 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2121 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2124 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2125 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2126 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2127 cleared by a MASK below */
2128 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2129 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2130 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2132 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2134 movq_r2r(mm6, mm1); /* save new reds in dsts */
2136 /* green -- process the bits in place */
2137 movq_r2r(mm2, mm5); /* src -> mm5 */
2138 movq_r2r(mm3, mm6); /* dst -> mm6 */
2139 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2140 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2143 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2144 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2145 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2146 bits are gone and the sign bits present */
2147 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2148 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2150 por_r2r(mm6, mm1); /* save new greens in dsts */
2153 movq_r2r(mm2, mm5); /* src -> mm5 */
2154 movq_r2r(mm3, mm6); /* dst -> mm6 */
2155 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2156 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2159 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2160 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2161 /* 11 + 5 = 16 bits, so the sign bits are lost and
2162 the interesting bits will need to be MASKed */
2163 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2164 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2165 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2167 por_r2r(mm6, mm1); /* save new blues in dsts */
2169 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2180 /* End GCC_ASMBLIT */
/*
 * MSVC-intrinsics variant: blends RGB565 pixels with one alpha value for
 * the whole surface (info->src->alpha).
 *
 * The visible code hands the blit to Blit16to16SurfaceAlpha128() with mask
 * 0xf7de on one branch (the condition is on a line not shown here --
 * presumably alpha == 128, TODO confirm).  Otherwise alpha has its low
 * three bits cleared; mm_alpha is loaded from that value *before* the
 * scalar copy is reduced to 5 bits, replicated into all four 16-bit lanes
 * and shifted left by 3.  Single pixels are blended in C by spreading
 * green into the high half of a 32-bit word (mask 0x07e0f81f); groups of
 * four pixels are blended one colour channel at a time and OR-ed together
 * in mm_res.
 *
 * NOTE(review): this listing does not show every line of the function
 * (loops, braces, declarations of s and d, the copies of src1/dst1 into
 * src2/dst2 before each channel, pointer increments and the final state
 * reset are missing); the comments describe only the visible statements.
 */
2183 /* fast RGB565->RGB565 blending with surface alpha */
2184 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2186 unsigned alpha = info->src->alpha;
2188 Blit16to16SurfaceAlpha128(info, 0xf7de);
2190 int width = info->d_width;
2191 int height = info->d_height;
2192 Uint16 *srcp = (Uint16 *)info->s_pixels;
2193 int srcskip = info->s_skip >> 1;
2194 Uint16 *dstp = (Uint16 *)info->d_pixels;
2195 int dstskip = info->d_skip >> 1;
2198 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2200 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2201 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2202 alpha >>= 3; /* downscale alpha to 5 bits */
2204 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2205 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2206 /* position alpha to allow for mullo and mulhi on diff channels
2207 to reduce the number of operations */
2208 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2210 /* Setup the 565 color channel masks */
2211 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2212 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2220 * shift out the middle component (green) to
2221 * the high 16 bits, and process all three RGB
2222 * components at the same time.
2224 s = (s | s << 16) & 0x07e0f81f;
2225 d = (d | d << 16) & 0x07e0f81f;
2226 d += (s - d) * alpha >> 5;
2228 *dstp++ = (Uint16)(d | d >> 16);
2233 * shift out the middle component (green) to
2234 * the high 16 bits, and process all three RGB
2235 * components at the same time.
2237 s = (s | s << 16) & 0x07e0f81f;
2238 d = (d | d << 16) & 0x07e0f81f;
2239 d += (s - d) * alpha >> 5;
2241 *dstp++ = (Uint16)(d | d >> 16);
2245 * shift out the middle component (green) to
2246 * the high 16 bits, and process all three RGB
2247 * components at the same time.
2249 s = (s | s << 16) & 0x07e0f81f;
2250 d = (d | d << 16) & 0x07e0f81f;
2251 d += (s - d) * alpha >> 5;
2253 *dstp++ = (Uint16)(d | d >> 16);
2255 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2256 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
/* red: shifted down to the low bits, blended, then shifted back up by 11 */
2260 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2263 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2266 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2267 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2268 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2269 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2270 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2272 mm_res = dst2; /* RED -> mm_res */
2274 /* green -- process the bits in place */
2276 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2279 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2282 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2283 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2284 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2285 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2287 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* blue: already in the low bits, so only a mask is needed before and after the blend */
2291 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2294 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2297 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2298 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2299 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2300 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2301 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2303 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2305 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2317 /* fast RGB555->RGB555 blending with surface alpha
 *
 * MMX-intrinsics variant: blends the source pixels described by info
 * onto the destination using the per-surface alpha in info->src->alpha.
 * The visible code contains a scalar one-pixel blend (the same sequence
 * appears three times) and a four-pixels-at-a-time __m64 blend.
 * NOTE(review): the braces, the condition that selects the
 * Blit16to16SurfaceAlpha128 call and the loop construct are not visible
 * in this listing; the comments below cover only the visible statements.
 */
2318 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2320 unsigned alpha = info->src->alpha;
/* 0xfbde is 0xffff with bits 0, 5 and 10 cleared, i.e. without the lowest
 * bit of each 5-bit channel selected by the 555 masks set up below.
 * NOTE(review): presumably the dedicated path for alpha == 128 -- the
 * guarding condition is not visible here, confirm. */
2322 Blit16to16SurfaceAlpha128(info, 0xfbde);
/* Pixels are accessed through Uint16 pointers, so both skips are halved.
 * NOTE(review): assumes s_skip and d_skip are byte counts -- confirm. */
2324 int width = info->d_width;
2325 int height = info->d_height;
2326 Uint16 *srcp = (Uint16 *)info->s_pixels;
2327 int srcskip = info->s_skip >> 1;
2328 Uint16 *dstp = (Uint16 *)info->d_pixels;
2329 int dstskip = info->d_skip >> 1;
2332 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
/* The low three bits are cleared first, then mm_alpha is loaded from the
 * still 8-bit value; only afterwards is the scalar copy reduced to 5 bits
 * for the one-pixel blends. */
2334 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2335 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2336 alpha >>= 3; /* downscale alpha to 5 bits */
2338 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2339 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2340 /* position alpha to allow for mullo and mulhi on diff channels
2341 to reduce the number of operations */
2342 mm_alpha = _mm_slli_si64(mm_alpha, 3);
/* Each 16-bit lane now holds the 5-bit alpha shifted left by 6, so that
 * mullo followed by >> 11 (blue) and mulhi followed by << 5 (red, green,
 * which stay at their bit position) each leave roughly
 * (src - dst) * alpha5 / 32 at the bit position of the channel. */
2344 /* Setup the 555 color channel masks */
2345 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2346 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2347 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
/* Scalar blend of one pixel with the 5-bit alpha.  The mask 0x03e07c1f
 * keeps green in the upper half-word and red/blue in the lower one, and
 * d | d >> 16 folds the result back into 16 bits.
 * NOTE(review): the loads of s and d and the enclosing loop construct are
 * not visible in this listing. */
2355 * shift out the middle component (green) to
2356 * the high 16 bits, and process all three RGB
2357 * components at the same time.
2359 s = (s | s << 16) & 0x03e07c1f;
2360 d = (d | d << 16) & 0x03e07c1f;
2361 d += (s - d) * alpha >> 5;
2363 *dstp++ = (Uint16)(d | d >> 16);
2368 * shift out the middle component (green) to
2369 * the high 16 bits, and process all three RGB
2370 * components at the same time.
2372 s = (s | s << 16) & 0x03e07c1f;
2373 d = (d | d << 16) & 0x03e07c1f;
2374 d += (s - d) * alpha >> 5;
2376 *dstp++ = (Uint16)(d | d >> 16);
2380 * shift out the middle component (green) to
2381 * the high 16 bits, and process all three RGB
2382 * components at the same time.
2384 s = (s | s << 16) & 0x03e07c1f;
2385 d = (d | d << 16) & 0x03e07c1f;
2386 d += (s - d) * alpha >> 5;
2388 *dstp++ = (Uint16)(d | d >> 16);
/* MMX path: four 16-bit pixels are loaded, blended channel by channel and
 * stored back as one 64-bit value. */
2390 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2391 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2393 /* red -- process the bits in place */
/* NOTE(review): for red and green, src2 and dst2 are presumably reloaded
 * from src1 and dst1 on lines not visible here, as is visible for blue. */
2395 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2398 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2401 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2402 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2403 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2404 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2405 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2407 mm_res = dst2; /* RED -> mm_res */
2409 /* green -- process the bits in place */
2411 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2414 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2417 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2418 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2419 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2420 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2422 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* blue -- lowest five bits, so the low half of the product is used */
2425 src2 = src1; /* src -> src2 */
2426 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2428 dst2 = dst1; /* dst -> dst2 */
2429 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2432 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2433 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2434 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2435 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2436 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2438 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2440 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2451 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2453 /* fast RGB565->RGB565 blending with surface alpha
 *
 * Integer-only version (no MMX intrinsics): blends the source pixels
 * described by info onto the destination using the per-surface alpha in
 * info->src->alpha, reduced to 5 bits.
 * NOTE(review): the braces, the condition that selects the
 * Blit16to16SurfaceAlpha128 call, the loop construct and the loads of
 * s and d are not visible in this listing.
 */
2454 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2456 unsigned alpha = info->src->alpha;
/* 0xf7de is 0xffff with bits 0, 5 and 11 cleared, i.e. without the lowest
 * bit of each channel of a 5-6-5 pixel.
 * NOTE(review): presumably the dedicated path for alpha == 128 -- the
 * guarding condition is not visible here, confirm. */
2458 Blit16to16SurfaceAlpha128(info, 0xf7de);
/* Pixels are accessed through Uint16 pointers, so both skips are halved.
 * NOTE(review): assumes s_skip and d_skip are byte counts -- confirm. */
2460 int width = info->d_width;
2461 int height = info->d_height;
2462 Uint16 *srcp = (Uint16 *)info->s_pixels;
2463 int srcskip = info->s_skip >> 1;
2464 Uint16 *dstp = (Uint16 *)info->d_pixels;
2465 int dstskip = info->d_skip >> 1;
2466 alpha >>= 3; /* downscale alpha to 5 bits */
/* One-pixel blend: the mask 0x07e0f81f keeps green in the upper half-word
 * and red/blue in the lower one, leaving zero bits between the channels;
 * the multiply binds tighter than the shift, so the update is
 * d += ((s - d) * alpha) >> 5.  d | d >> 16 folds the result back into
 * 16 bits before it is stored. */
2473 * shift out the middle component (green) to
2474 * the high 16 bits, and process all three RGB
2475 * components at the same time.
2477 s = (s | s << 16) & 0x07e0f81f;
2478 d = (d | d << 16) & 0x07e0f81f;
2479 d += (s - d) * alpha >> 5;
2481 *dstp++ = (Uint16)(d | d >> 16);
2489 /* fast RGB555->RGB555 blending with surface alpha
 *
 * Integer-only version (no MMX intrinsics): blends the source pixels
 * described by info onto the destination using the per-surface alpha in
 * info->src->alpha, reduced to 5 bits.
 * NOTE(review): the braces, the condition that selects the
 * Blit16to16SurfaceAlpha128 call, the loop construct and the loads of
 * s and d are not visible in this listing.
 */
2490 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2492 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further below */
/* 0xfbde is 0xffff with bits 0, 5 and 10 cleared, i.e. without the lowest
 * bit of each channel of a 5-5-5 pixel.
 * NOTE(review): presumably the dedicated path for alpha == 128 -- the
 * guarding condition is not visible here, confirm. */
2494 Blit16to16SurfaceAlpha128(info, 0xfbde);
/* Pixels are accessed through Uint16 pointers, so both skips are halved.
 * NOTE(review): assumes s_skip and d_skip are byte counts -- confirm. */
2496 int width = info->d_width;
2497 int height = info->d_height;
2498 Uint16 *srcp = (Uint16 *)info->s_pixels;
2499 int srcskip = info->s_skip >> 1;
2500 Uint16 *dstp = (Uint16 *)info->d_pixels;
2501 int dstskip = info->d_skip >> 1;
2502 alpha >>= 3; /* downscale alpha to 5 bits */
/* One-pixel blend: the mask 0x03e07c1f keeps green in the upper half-word
 * and red/blue in the lower one, leaving zero bits between the channels;
 * the multiply binds tighter than the shift, so the update is
 * d += ((s - d) * alpha) >> 5.  d | d >> 16 folds the result back into
 * 16 bits before it is stored. */
2509 * shift out the middle component (green) to
2510 * the high 16 bits, and process all three RGB
2511 * components at the same time.
2513 s = (s | s << 16) & 0x03e07c1f;
2514 d = (d | d << 16) & 0x03e07c1f;
2515 d += (s - d) * alpha >> 5;
2517 *dstp++ = (Uint16)(d | d >> 16);
2525 /* fast ARGB8888->RGB565 blending with pixel alpha
 *
 * Source pixels are read through a Uint32 pointer and destination pixels
 * through a Uint16 pointer; the skips are scaled accordingly (>> 2 and
 * >> 1).  NOTE(review): assumes s_skip and d_skip are byte counts.
 * For each pixel the alpha is the top five bits of the source value
 * (s >> 27).  When it equals SDL_ALPHA_OPAQUE >> 3 the source value is
 * converted directly to 5-6-5 and stored.  In the blending path the
 * destination is spread into the 0x07e0f81f layout (green in the upper
 * half-word), updated with d += ((s - d) * alpha) >> 5 and folded back
 * with d | d >> 16.
 * NOTE(review): the loop construct, the loads of s and d, the else part
 * and the tail of the source conversion expression are not visible in
 * this listing.
 */
2526 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2528 int width = info->d_width;
2529 int height = info->d_height;
2530 Uint32 *srcp = (Uint32 *)info->s_pixels;
2531 int srcskip = info->s_skip >> 2;
2532 Uint16 *dstp = (Uint16 *)info->d_pixels;
2533 int dstskip = info->d_skip >> 1;
2538 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2539 /* FIXME: Here we special-case opaque alpha since the
2540 compositioning used (>>8 instead of /255) doesn't handle
2541 it correctly. Also special-case alpha=0 for speed?
2544 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2545 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2549 * convert source and destination to G0RAB65565
2550 * and blend all components at the same time
2552 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2554 d = (d | d << 16) & 0x07e0f81f;
2555 d += (s - d) * alpha >> 5;
2557 *dstp = (Uint16)(d | d >> 16);
2568 /* fast ARGB8888->RGB555 blending with pixel alpha
 *
 * Source pixels are read through a Uint32 pointer and destination pixels
 * through a Uint16 pointer; the skips are scaled accordingly (>> 2 and
 * >> 1).  NOTE(review): assumes s_skip and d_skip are byte counts.
 * For each pixel the alpha is the top five bits of the source value
 * (s >> 27).  When it equals SDL_ALPHA_OPAQUE >> 3 the source value is
 * converted directly to 5-5-5 and stored.  In the blending path the
 * destination is spread into the 0x03e07c1f layout (green in the upper
 * half-word), updated with d += ((s - d) * alpha) >> 5 and folded back
 * with d | d >> 16.
 * NOTE(review): the in-body comment names a 65565 layout although the
 * masks used here are the 5-5-5 ones -- looks copied from the 565
 * variant, confirm.  The loop construct, the loads of s and d, the else
 * part and the tail of the source conversion expression are not visible
 * in this listing.
 */
2569 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2571 int width = info->d_width;
2572 int height = info->d_height;
2573 Uint32 *srcp = (Uint32 *)info->s_pixels;
2574 int srcskip = info->s_skip >> 2;
2575 Uint16 *dstp = (Uint16 *)info->d_pixels;
2576 int dstskip = info->d_skip >> 1;
2582 alpha = s >> 27; /* downscale alpha to 5 bits */
2583 /* FIXME: Here we special-case opaque alpha since the
2584 compositioning used (>>8 instead of /255) doesn't handle
2585 it correctly. Also special-case alpha=0 for speed?
2588 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2589 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2593 * convert source and destination to G0RAB65565
2594 * and blend all components at the same time
2596 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2598 d = (d | d << 16) & 0x03e07c1f;
2599 d += (s - d) * alpha >> 5;
2601 *dstp = (Uint16)(d | d >> 16);
2612 /* General (slow) N->N blending with per-surface alpha
 *
 * Format-independent path: source and destination are walked as byte
 * pointers and each pixel is taken apart, blended and put back through
 * the DISEMBLE_RGB / ALPHA_BLEND / ASSEMBLE_RGBA macros, using the
 * BytesPerPixel and format of each side.
 * NOTE(review): the braces, the per-row loop construct, the pointer
 * advances and the declarations of Pixel and the colour components are
 * not visible in this listing.
 */
2613 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2615 int width = info->d_width;
2616 int height = info->d_height;
2617 Uint8 *src = info->s_pixels;
2618 int srcskip = info->s_skip;
2619 Uint8 *dst = info->d_pixels;
2620 int dstskip = info->d_skip;
2621 SDL_PixelFormat *srcfmt = info->src;
2622 SDL_PixelFormat *dstfmt = info->dst;
2623 int srcbpp = srcfmt->BytesPerPixel;
2624 int dstbpp = dstfmt->BytesPerPixel;
/* sA is the per-surface alpha of the source format.  dA is the alpha
 * value handed to ASSEMBLE_RGBA: SDL_ALPHA_OPAQUE when the destination
 * format has an alpha mask, 0 otherwise. */
2625 unsigned sA = srcfmt->alpha;
2626 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2629 while ( height-- ) {
/* read source and destination colour, blend with sA, write back */
2639 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2640 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2641 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2642 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2653 /* General (slow) colorkeyed N->N blending with per-surface alpha
 *
 * Blends source pixels onto the destination with the per-surface alpha
 * of the source format, leaving the destination untouched where the
 * source pixel equals the colorkey or the alpha is zero.  Contains a
 * dedicated 16-bit path for the case where both formats are 2 bytes per
 * pixel with green mask 0x7e0, and a generic macro-based path.
 * NOTE(review): the braces, the per-row loop constructs, the loads of
 * s and d, the pointer advances and the else parts are not visible in
 * this listing.
 */
2654 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2656 int width = info->d_width;
2657 int height = info->d_height;
2658 Uint8 *src = info->s_pixels;
2659 int srcskip = info->s_skip;
2660 Uint8 *dst = info->d_pixels;
2661 int dstskip = info->d_skip;
2662 SDL_PixelFormat *srcfmt = info->src;
2663 SDL_PixelFormat *dstfmt = info->dst;
2664 Uint32 ckey = srcfmt->colorkey;
2665 int srcbpp = srcfmt->BytesPerPixel;
2666 int dstbpp = dstfmt->BytesPerPixel;
/* dA is the alpha value handed to ASSEMBLE_RGBA: SDL_ALPHA_OPAQUE when
 * the destination format has an alpha mask, 0 otherwise. */
2667 unsigned sA = srcfmt->alpha;
2668 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
/* 16-bit path: both sides are 2 bytes per pixel with green mask 0x7e0 */
2670 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2671 Uint16 *src16 = (Uint16 *)src;
2672 Uint16 *dst16 = (Uint16 *)dst;
2673 sA >>= 3; /* downscale alpha to 5 bits */
2674 while ( height-- ) {
/* after the shift above, an original alpha below 8 is zero and skips
 * the blend, as does a source pixel equal to the colorkey */
2680 if(sA && s != ckey) {
/* same packed blend as the 565 surface-alpha blitter: green in the upper
 * half-word, d += ((s - d) * sA) >> 5, then fold back to 16 bits */
2682 s = (s | s << 16) & 0x07e0f81f;
2683 d = (d | d << 16) & 0x07e0f81f;
2684 d += (s - d) * sA >> 5;
2686 *dst16 = (Uint16)(d | d >> 16);
/* the skips are divided by two because the pointers step in 16-bit
 * pixels.  NOTE(review): assumes the skips are byte counts -- confirm. */
2692 src16 += srcskip / 2;
2693 dst16 += dstskip / 2;
/* generic path for every other format combination */
2698 while ( height-- ) {
2708 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2709 if(sA && Pixel != ckey) {
2710 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2711 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2712 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2713 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2724 /* General (slow) N->N blending with pixel alpha
 *
 * Format-independent path: each source pixel is taken apart into colour
 * and alpha with DISEMBLE_RGBA, blended onto the destination colour with
 * ALPHA_BLEND using that per-pixel alpha, and written back with
 * ASSEMBLE_RGBA.  The alpha written back is the dA read from the
 * destination pixel; dA is not an argument of ALPHA_BLEND.
 * NOTE(review): the braces, the declarations of srcbpp, dstbpp, Pixel
 * and the colour components, the per-row loop construct and the pointer
 * advances are not visible in this listing.
 */
2725 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2727 int width = info->d_width;
2728 int height = info->d_height;
2729 Uint8 *src = info->s_pixels;
2730 int srcskip = info->s_skip;
2731 Uint8 *dst = info->d_pixels;
2732 int dstskip = info->d_skip;
2733 SDL_PixelFormat *srcfmt = info->src;
2734 SDL_PixelFormat *dstfmt = info->dst;
2739 /* Set up some basic variables */
2740 srcbpp = srcfmt->BytesPerPixel;
2741 dstbpp = dstfmt->BytesPerPixel;
2743 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2744 quite right. for <8bpp source alpha, it gets them very wrong
2746 It is unclear whether there is a good general solution that doesn't
2747 need a branch (or a divide). */
2748 while ( height-- ) {
/* read source colour and alpha, read destination, blend, write back */
2760 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2762 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2763 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2764 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/*
 * Select the alpha-blending blit routine for a surface.
 *
 * sf is the format of the surface itself and df the format of the
 * surface it is mapped to (surface->map->dst).  The choice depends on
 * whether the source format has an alpha mask, on the SDL_SRCCOLORKEY
 * flag, on the destination BytesPerPixel and on the channel masks and
 * shifts of both formats.  Every visible path returns a blit function;
 * the generic BlitNtoN* routines are the fallbacks.
 * blit_index is not referenced in the visible lines.
 * NOTE(review): the case labels, the braces, the matching #endif lines
 * and the guards around the MMX, 3DNow! and NEON returns are not visible
 * in this listing, so which return is reached under which build or CPU
 * condition cannot be read off here.
 */
2776 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2778 SDL_PixelFormat *sf = surface->format;
2779 SDL_PixelFormat *df = surface->map->dst->format;
/* source format without an alpha mask: per-surface alpha, optionally
 * combined with a colorkey */
2781 if(sf->Amask == 0) {
2782 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2783 if(df->BytesPerPixel == 1)
2784 return BlitNto1SurfaceAlphaKey;
2786 #if SDL_ALTIVEC_BLITTERS
/* AltiVec is only chosen for 32-bit to 32-bit blits onto a destination
 * that does not have the SDL_HWSURFACE flag */
2787 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2788 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2789 return Blit32to32SurfaceAlphaKeyAltivec;
2792 return BlitNtoNSurfaceAlphaKey;
2794 /* Per-surface alpha blits */
2795 switch(df->BytesPerPixel) {
2797 return BlitNto1SurfaceAlpha;
/* with surface->map->identity set, the dedicated 16-bit blitters are
 * chosen by the destination green mask: 0x7e0 -> 565, 0x3e0 -> 555 */
2800 if(surface->map->identity) {
2801 if(df->Gmask == 0x7e0)
2805 return Blit565to565SurfaceAlphaMMX;
2808 return Blit565to565SurfaceAlpha;
2810 else if(df->Gmask == 0x3e0)
2814 return Blit555to555SurfaceAlphaMMX;
2817 return Blit555to555SurfaceAlpha;
2820 return BlitNtoNSurfaceAlpha;
/* 4-byte source whose colour masks equal those of the destination */
2823 if(sf->Rmask == df->Rmask
2824 && sf->Gmask == df->Gmask
2825 && sf->Bmask == df->Bmask
2826 && sf->BytesPerPixel == 4)
/* the MMX blitter additionally requires byte-aligned channel shifts */
2829 if(sf->Rshift % 8 == 0
2830 && sf->Gshift % 8 == 0
2831 && sf->Bshift % 8 == 0
2833 return BlitRGBtoRGBSurfaceAlphaMMX;
/* the three colour masks together cover exactly the low 24 bits */
2835 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2837 #if SDL_ALTIVEC_BLITTERS
2838 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2839 && SDL_HasAltiVec())
2840 return BlitRGBtoRGBSurfaceAlphaAltivec;
2842 return BlitRGBtoRGBSurfaceAlpha;
2845 #if SDL_ALTIVEC_BLITTERS
2846 if((sf->BytesPerPixel == 4) &&
2847 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2848 return Blit32to32SurfaceAlphaAltivec;
2851 return BlitNtoNSurfaceAlpha;
2855 return BlitNtoNSurfaceAlpha;
2859 /* Per-pixel alpha blits */
2860 switch(df->BytesPerPixel) {
2862 return BlitNto1PixelAlpha;
2865 #if SDL_ALTIVEC_BLITTERS
2866 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2867 df->Gmask == 0x7e0 &&
2868 df->Bmask == 0x1f && SDL_HasAltiVec())
2869 return Blit32to565PixelAlphaAltivec;
/* 4-byte source with alpha in the top byte and green in bits 8-15, where
 * either red or blue sits in the low byte of the source and in the low
 * five bits of the destination; the destination green mask then picks
 * the 565 or the 555 blitter */
2872 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2873 && sf->Gmask == 0xff00
2874 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2875 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2876 if(df->Gmask == 0x7e0)
2877 return BlitARGBto565PixelAlpha;
2878 else if(df->Gmask == 0x3e0)
2879 return BlitARGBto555PixelAlpha;
2881 return BlitNtoNPixelAlpha;
/* 4-byte source whose colour masks equal those of the destination */
2884 if(sf->Rmask == df->Rmask
2885 && sf->Gmask == df->Gmask
2886 && sf->Bmask == df->Bmask
2887 && sf->BytesPerPixel == 4)
/* the MMX variants additionally require byte-aligned channel shifts,
 * including the alpha shift */
2890 if(sf->Rshift % 8 == 0
2891 && sf->Gshift % 8 == 0
2892 && sf->Bshift % 8 == 0
2893 && sf->Ashift % 8 == 0
2897 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2899 return BlitRGBtoRGBPixelAlphaMMX;
/* alpha stored in the top byte of the source pixel */
2902 if(sf->Amask == 0xff000000)
2904 #if SDL_ALTIVEC_BLITTERS
2905 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2906 && SDL_HasAltiVec())
2907 return BlitRGBtoRGBPixelAlphaAltivec;
2910 return BlitARGBtoXRGBalpha_neon;
2912 return BlitRGBtoRGBPixelAlpha;
/* same green mask and alpha in the top byte, with the red and blue masks
 * swapped between source and destination (in either direction) */
2916 if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 &&
2917 ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) ||
2918 (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000)))
2920 return BlitABGRtoXRGBalpha_neon;
2923 #if SDL_ALTIVEC_BLITTERS
2924 if (sf->Amask && sf->BytesPerPixel == 4 &&
2925 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2926 return Blit32to32PixelAlphaAltivec;
2929 return BlitNtoNPixelAlpha;
2933 return BlitNtoNPixelAlpha;