2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "SDL_config.h"
24 #include "SDL_video.h"
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
/*
 * Compile-time selection of the MMX blitter flavour, active only when
 * SDL_ASSEMBLY_ROUTINES is enabled:
 *  - GCC targeting i386 or x86-64 defines MMX_ASMBLIT and GCC_ASMBLIT;
 *  - MSVC targeting x86 (_M_IX86) records HAVE_MMINTRIN_H (for VC6 and
 *    older only when _mm_free is defined) and defines MMX_ASMBLIT and
 *    MSVC_ASMBLIT.
 * NOTE(review): several preprocessor lines are not visible in this excerpt,
 * so the exact guard around the MSVC MMX_ASMBLIT / MSVC_ASMBLIT defines
 * should be confirmed against the full file.
 */
33 #if SDL_ASSEMBLY_ROUTINES
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
37 # elif defined(_MSC_VER) && defined(_M_IX86)
38 # if (_MSC_VER <= 1200)
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
51 #endif /* SDL_ASSEMBLY_ROUTINES */
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
62 /* Functions to perform alpha blended blitting */
66 /* NEON optimized blitter callers */
/*
 * make_neon_caller(name, neon_name)
 *
 * Declares the external row routine neon_name(dst, src, count) and defines
 * a static SDL blitter called name that invokes it once per row with the
 * row width as the pixel count. The next destination row is prefetched
 * before each call.
 * NOTE(review): the statements that advance src and dst between rows are
 * not visible in this excerpt; presumably they use srcstride / dststride.
 */
67 #define make_neon_caller(name, neon_name) \
68 extern void neon_name(void *dst, const void *src, int count); \
69 static void name(SDL_BlitInfo *info) \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
75 int dstBpp = info->dst->BytesPerPixel; \
76 int srcstride = width * 4 + info->s_skip; /* source rows are sized as 4 bytes per pixel */ \
77 int dststride = width * dstBpp + info->d_skip; /* destination rows use the destination pixel size */ \
79 while ( height-- ) { \
80 __builtin_prefetch(dst + dststride); /* hint: touch the row that follows the current one */ \
81 neon_name(dst, src, width); \
/*
 * make_neon_callerS(name, neon_name)
 *
 * Variant of the NEON wrapper generator for per-surface alpha: the external
 * row routine takes a fourth argument, which is filled from
 * info->src->alpha. Both source and destination pointers are advanced by
 * width * 4 plus the respective skip, i.e. both sides are stepped as
 * 4-byte pixels.
 */
87 #define make_neon_callerS(name, neon_name) \
88 extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
89 static void name(SDL_BlitInfo *info) \
91 int width = info->d_width; \
92 int height = info->d_height; \
93 Uint8 *src = info->s_pixels; \
94 Uint8 *dst = info->d_pixels; \
95 int srcskip = info->s_skip; \
96 int dstskip = info->d_skip; \
97 unsigned alpha = info->src->alpha;\
99 while ( height-- ) { \
100 neon_name(dst, src, width, alpha); \
101 src += width * 4 + srcskip; /* step to the start of the next source row */ \
102 dst += width * 4 + dstskip; /* step to the start of the next destination row */ \
/*
 * Instantiate the NEON wrappers. The first four use the three-argument
 * form (no surface alpha argument); the last two use the S form, which
 * forwards info->src->alpha to the row routine.
 */
106 make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
107 make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
108 make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
109 make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
110 make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
111 make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
113 #endif /* __ARM_NEON__ */
115 /* N->1 blending with per-surface alpha */
/*
 * Blend a source surface onto an 8-bit palettized destination using the
 * single per-surface alpha value (srcfmt->alpha).
 *
 * Per pixel, as far as this excerpt shows: the source colour is unpacked
 * with DISEMBLE_RGB, the destination colour is looked up in the destination
 * palette using the index currently stored at *dst, both are combined by
 * ALPHA_BLEND, and the result is packed back into an 8-bit value whose top
 * three bits come from red. When info->table is non-NULL the packed value
 * is used as an index into that table instead of being stored directly.
 * NOTE(review): the loop framing and the green/blue packing terms are not
 * visible here.
 */
116 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
118 int width = info->d_width;
119 int height = info->d_height;
120 Uint8 *src = info->s_pixels;
121 int srcskip = info->s_skip;
122 Uint8 *dst = info->d_pixels;
123 int dstskip = info->d_skip;
124 Uint8 *palmap = info->table;
125 SDL_PixelFormat *srcfmt = info->src;
126 SDL_PixelFormat *dstfmt = info->dst;
127 int srcbpp = srcfmt->BytesPerPixel;
/* One alpha value for the whole surface. */
129 const unsigned A = srcfmt->alpha;
141 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* *dst is a palette index; fetch the colour it currently refers to. */
142 dR = dstfmt->palette->colors[*dst].r;
143 dG = dstfmt->palette->colors[*dst].g;
144 dB = dstfmt->palette->colors[*dst].b;
145 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
149 /* Pack RGB into 8bit pixel */
150 if ( palmap == NULL ) {
151 *dst =((dR>>5)<<(3+2))|
155 *dst = palmap[((dR>>5)<<(3+2))|
168 /* N->1 blending with pixel alpha */
/*
 * Blend a source surface onto an 8-bit palettized destination using the
 * alpha value carried by each source pixel.
 *
 * Same shape as the per-surface variant, except that DISEMBLE_RGBA yields
 * a per-pixel sA which is then handed to ALPHA_BLEND. The blended colour
 * is packed to 8 bits (top three bits from red are visible here) and, if
 * info->table is non-NULL, mapped through that table.
 * NOTE(review): the loop framing and the green/blue packing terms are not
 * visible in this excerpt.
 */
169 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
171 int width = info->d_width;
172 int height = info->d_height;
173 Uint8 *src = info->s_pixels;
174 int srcskip = info->s_skip;
175 Uint8 *dst = info->d_pixels;
176 int dstskip = info->d_skip;
177 Uint8 *palmap = info->table;
178 SDL_PixelFormat *srcfmt = info->src;
179 SDL_PixelFormat *dstfmt = info->dst;
180 int srcbpp = srcfmt->BytesPerPixel;
182 /* FIXME: fix alpha bit field expansion here too? */
194 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
/* *dst is a palette index; fetch the colour it currently refers to. */
195 dR = dstfmt->palette->colors[*dst].r;
196 dG = dstfmt->palette->colors[*dst].g;
197 dB = dstfmt->palette->colors[*dst].b;
198 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
202 /* Pack RGB into 8bit pixel */
203 if ( palmap == NULL ) {
204 *dst =((dR>>5)<<(3+2))|
208 *dst = palmap[((dR>>5)<<(3+2))|
221 /* colorkeyed N->1 blending with per-surface alpha */
/*
 * Colour-keyed blend of a source surface onto an 8-bit palettized
 * destination using the per-surface alpha value.
 *
 * Identical in structure to BlitNto1SurfaceAlpha, but the blend and the
 * repacking happen only for source pixels whose raw value differs from
 * srcfmt->colorkey.
 * NOTE(review): A is declared as int here while BlitNto1SurfaceAlpha uses
 * unsigned for the same field; the loop framing and the green/blue packing
 * terms are not visible in this excerpt.
 */
222 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
224 int width = info->d_width;
225 int height = info->d_height;
226 Uint8 *src = info->s_pixels;
227 int srcskip = info->s_skip;
228 Uint8 *dst = info->d_pixels;
229 int dstskip = info->d_skip;
230 Uint8 *palmap = info->table;
231 SDL_PixelFormat *srcfmt = info->src;
232 SDL_PixelFormat *dstfmt = info->dst;
233 int srcbpp = srcfmt->BytesPerPixel;
234 Uint32 ckey = srcfmt->colorkey;
236 const int A = srcfmt->alpha;
248 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* Pixels equal to the colour key do not enter the blend branch. */
249 if ( Pixel != ckey ) {
250 dR = dstfmt->palette->colors[*dst].r;
251 dG = dstfmt->palette->colors[*dst].g;
252 dB = dstfmt->palette->colors[*dst].b;
253 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
257 /* Pack RGB into 8bit pixel */
258 if ( palmap == NULL ) {
259 *dst =((dR>>5)<<(3+2))|
263 *dst = palmap[((dR>>5)<<(3+2))|
278 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * GCC inline-asm MMX version of the 50 percent blend for 32-bit pixels.
 *
 * Each colour byte of the result is the average of source and destination:
 *   ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1  +  (s & d & 0x00010101)
 * and the destination alpha mask is ORed in afterwards. Clearing bit 0 of
 * every channel before the add keeps the carry of one channel from
 * colliding with the channel above it; the second term puts back the unit
 * that is lost when both low bits were set. A scalar form of the formula
 * handles a single pixel, the MMX sequence handles two pixels per 64-bit
 * register.
 * The skips are converted from bytes to Uint32 units by the >> 2.
 * NOTE(review): loop framing is not visible in this excerpt.
 */
279 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
281 int width = info->d_width;
282 int height = info->d_height;
283 Uint32 *srcp = (Uint32 *)info->s_pixels;
284 int srcskip = info->s_skip >> 2;
285 Uint32 *dstp = (Uint32 *)info->d_pixels;
286 int dstskip = info->d_skip >> 2;
287 Uint32 dalpha = info->dst->Amask;
/* Constants are staged through the variable load and then moved into MMX registers. */
290 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
291 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
292 load = 0x0001010100010101ULL;/* !alpha128 mask */
293 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
294 movd_m2r(dalpha, mm7); /* dst alpha mask */
295 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
/* Scalar single-pixel form of the same average. */
301 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
302 + (s & d & 0x00010101)) | dalpha;
/* Two pixels at a time. */
304 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
305 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
307 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
308 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
310 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
311 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
312 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
313 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
314 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
315 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
316 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
318 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
319 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
329 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * GCC inline-asm MMX blend of 32-bit pixels with a per-surface alpha.
 *
 * When alpha is exactly 128 and the destination R, G and B masks together
 * cover the low 24 bits, the work is delegated to the alpha=128 special
 * case. Otherwise each channel is computed on 16-bit lanes as
 *   dst + (((src - dst) * alpha) >> 8)
 * with the multiplier lane zeroed for any byte that is not part of R, G or
 * B, so that byte keeps its destination value through the add. The
 * destination alpha mask is ORed into the result before it is stored.
 * One-pixel and two-pixel sequences are both present.
 * NOTE(review): loop framing is not visible in this excerpt.
 */
330 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
332 SDL_PixelFormat* df = info->dst;
333 unsigned alpha = info->src->alpha;
335 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
336 /* only call a128 version when R,G,B occupy lower bits */
337 BlitRGBtoRGBSurfaceAlpha128MMX(info);
339 int width = info->d_width;
340 int height = info->d_height;
341 Uint32 *srcp = (Uint32 *)info->s_pixels;
342 int srcskip = info->s_skip >> 2;
343 Uint32 *dstp = (Uint32 *)info->d_pixels;
344 int dstskip = info->d_skip >> 2;
346 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
347 /* form the alpha mult */
348 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
349 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
350 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
/* The variable alpha is reused from here on as scratch for the channel mask. */
351 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
352 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
353 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
354 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
355 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
356 movd_m2r(df->Amask, mm7); /* dst alpha mask */
357 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
361 /* One Pixel Blend */
362 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
363 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
364 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
365 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
367 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
368 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
369 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
370 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
372 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
373 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
374 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
378 /* Two Pixels Blend */
379 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
380 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
381 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
382 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
384 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
385 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
386 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
387 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
389 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
390 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
391 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
392 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
394 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
395 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
396 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
397 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
399 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
400 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
402 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
414 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC inline-asm MMX blend of 32-bit pixels using the alpha stored in each
 * source pixel.
 *
 * Setup derives three masks from the source alpha mask: a 16-bit-lane
 * multiplier mask with the alpha lane cleared (mm7), the packed colour
 * channel mask (mm0) and its complement (mm3). The alpha shift is loaded
 * into mm5.
 * Per pixel, the source alpha bits are isolated with amask. A fully opaque
 * pixel (alpha == amask) copies the source colour channels and keeps the
 * destination bits outside the channel mask. Otherwise the pixel is
 * blended on 16-bit lanes as dst + (((src - dst) * alpha) >> 8), with the
 * alpha lane multiplier zeroed so the destination alpha byte survives.
 * NOTE(review): the branch that precedes the opaque case, the asm template
 * strings and the loop framing are not visible in this excerpt.
 */
415 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
417 int width = info->d_width;
418 int height = info->d_height;
419 Uint32 *srcp = (Uint32 *)info->s_pixels;
420 int srcskip = info->s_skip >> 2;
421 Uint32 *dstp = (Uint32 *)info->d_pixels;
422 int dstskip = info->d_skip >> 2;
423 SDL_PixelFormat* sf = info->src;
424 Uint32 amask = sf->Amask;
426 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
427 /* form multiplication mask */
428 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
429 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
430 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
431 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
432 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
433 /* form channel masks */
434 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
435 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
436 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
437 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
438 /* get alpha channel shift */
439 __asm__ __volatile__ (
441 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
/* Alpha bits of the current source pixel, still in their original bit position. */
445 Uint32 alpha = *srcp & amask;
446 /* FIXME: Here we special-case opaque alpha since the
447 compositioning used (>>8 instead of /255) doesn't handle
448 it correctly. Also special-case alpha=0 for speed?
452 } else if(alpha == amask) {
453 /* opaque alpha -- copy RGB, keep dst alpha */
454 /* using MMX here to free up regular registers for other things */
455 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
456 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
457 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
458 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
459 por_r2r(mm1, mm2); /* src | dst -> mm2 */
460 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
/* General case: widen both pixels to 16-bit lanes and blend. */
462 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
463 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
465 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
466 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
468 __asm__ __volatile__ (
470 : : "r" (alpha) ); /* 0000A000 -> mm4 */
471 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
472 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
473 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
474 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
477 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
478 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
479 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
480 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
482 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
483 movd_r2m(mm2, *dstp);/* mm2 -> dst */
493 /* End GCC_ASMBLIT */
496 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * MMX-intrinsics version of the 50 percent blend for 32-bit pixels.
 *
 * Each colour byte of the result is the average of source and destination:
 *   ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1  +  (s & d & 0x00010101)
 * followed by an OR with the destination alpha mask. hmask clears bit 0 of
 * every channel so the per-channel sums do not collide after the shift;
 * lmask selects the low bits whose common unit is added back. A scalar
 * form handles a single pixel, the intrinsic loop handles two pixels per
 * __m64 and runs n / 2 times.
 * NOTE(review): the declaration of n and the outer loop framing are not
 * visible in this excerpt.
 */
497 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
499 int width = info->d_width;
500 int height = info->d_height;
501 Uint32 *srcp = (Uint32 *)info->s_pixels;
502 int srcskip = info->s_skip >> 2;
503 Uint32 *dstp = (Uint32 *)info->d_pixels;
504 int dstskip = info->d_skip >> 2;
505 Uint32 dalpha = info->dst->Amask;
507 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
509 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
510 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
511 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
/* Scalar single-pixel form of the same average. */
518 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
519 + (s & d & 0x00010101)) | dalpha;
/* Remaining pixels are processed in pairs. */
523 for (n >>= 1; n > 0; --n) {
524 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
525 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
527 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
528 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
530 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
531 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
532 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
533 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
535 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
536 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
537 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
538 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
540 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
551 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * MMX-intrinsics blend of 32-bit pixels with a per-surface alpha.
 *
 * When alpha is exactly 128 and the destination R, G and B masks together
 * cover the low 24 bits, the work is delegated to the alpha=128 special
 * case. Otherwise each channel is computed on 16-bit lanes as
 *   dst + (((src - dst) * alpha) >> 8)
 * where the multiplier is zero for any byte outside the R, G and B
 * positions, so that byte keeps its destination value through the add.
 * The destination alpha mask (dsta) is ORed into every stored result.
 * A one-pixel sequence and a two-pixel loop (n / 2 iterations) are present.
 * NOTE(review): the declarations of n and amult and the outer loop framing
 * are not visible in this excerpt.
 */
552 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
554 SDL_PixelFormat* df = info->dst;
555 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
556 unsigned alpha = info->src->alpha;
558 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
559 /* only call a128 version when R,G,B occupy lower bits */
560 BlitRGBtoRGBSurfaceAlpha128MMX(info);
562 int width = info->d_width;
563 int height = info->d_height;
564 Uint32 *srcp = (Uint32 *)info->s_pixels;
565 int srcskip = info->s_skip >> 2;
566 Uint32 *dstp = (Uint32 *)info->d_pixels;
567 int dstskip = info->d_skip >> 2;
568 Uint32 dalpha = df->Amask;
571 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
573 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
574 /* form the alpha mult */
/* Replicate alpha into every byte of a 32-bit word (assumes alpha fits in 8 bits). */
575 amult = alpha | (alpha << 8);
576 amult = amult | (amult << 16);
/* chanmask is recomputed from the channel shifts here, replacing its mask-based initial value. */
577 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
578 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
579 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
580 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
581 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
586 /* One Pixel Blend */
587 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
588 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
590 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
591 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
593 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
594 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
595 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
596 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
598 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
599 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
600 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
608 for (n >>= 1; n > 0; --n) {
609 /* Two Pixels Blend */
610 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
611 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
612 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
613 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
615 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
616 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
617 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
618 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
620 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
621 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
622 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
623 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
625 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
626 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
627 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
628 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
630 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
631 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
633 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
645 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MMX-intrinsics blend of 32-bit pixels using the alpha stored in each
 * source pixel.
 *
 * Per pixel, the source alpha bits are isolated with amask. A fully opaque
 * pixel (alpha == amask) copies the source colour channels and keeps the
 * destination bits outside chanmask. Otherwise both pixels are widened to
 * 16-bit lanes and blended as dst + (((src - dst) * alpha) >> 8). dmask
 * zeroes the multiplier in the lane that holds the alpha byte, so the
 * destination alpha is carried through unchanged.
 * NOTE(review): the declaration of multmask, the branch that precedes the
 * opaque case and the loop framing are not visible in this excerpt.
 */
646 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
648 int width = info->d_width;
649 int height = info->d_height;
650 Uint32 *srcp = (Uint32 *)info->s_pixels;
651 int srcskip = info->s_skip >> 2;
652 Uint32 *dstp = (Uint32 *)info->d_pixels;
653 int dstskip = info->d_skip >> 2;
654 SDL_PixelFormat* sf = info->src;
655 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
656 Uint32 amask = sf->Amask;
657 Uint32 ashift = sf->Ashift;
660 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
662 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* After widening, each byte occupies 16 bits, so the alpha lane starts at bit ashift * 2. */
663 multmask = ~(0xFFFFi64 << (ashift * 2));
664 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
/* Alpha bits of the current source pixel, still in their original bit position. */
668 Uint32 alpha = *srcp & amask;
671 } else if (alpha == amask) {
672 /* opaque alpha -- copy RGB, keep dst alpha */
673 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
/* General case: widen both pixels to 16-bit lanes and blend. */
675 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
676 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
678 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
679 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
681 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
682 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
683 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
684 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
685 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
688 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
689 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
690 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
691 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
692 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
694 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
704 /* End MSVC_ASMBLIT */
706 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
708 #if SDL_ALTIVEC_BLITTERS
710 #pragma altivec_model on
/*
 * Helper macros for the Altivec blitters.
 *
 * VECUINT8_LITERAL / VECUINT16_LITERAL build vector constants. Two
 * spellings are provided: a parenthesised element list when building for
 * Mac OS X with GCC older than 4, and a brace-enclosed list in the second
 * pair of definitions (presumably the else branch; the directive itself is
 * not visible in this excerpt).
 */
717 #if (defined(__MACOSX__) && (__GNUC__ < 4))
718 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
719 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
720 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
721 (vector unsigned short) ( a,b,c,d,e,f,g,h )
723 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
724 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
725 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
726 (vector unsigned short) { a,b,c,d,e,f,g,h }
/* Non-zero when the low four address bits are set, i.e. x is not 16-byte aligned. */
729 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug aid: prints msg followed by the vector viewed as four 32-bit words in hex. */
730 #define VECPRINT(msg, v) do { \
731 vector unsigned int tmpvec = (vector unsigned int)(v); \
732 unsigned int *vp = (unsigned int *)&tmpvec; \
733 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
736 /* the permutation vector that takes the high bytes out of all the appropriate shorts
737 (vector unsigned char)(
738 0x00, 0x10, 0x02, 0x12,
739 0x04, 0x14, 0x06, 0x16,
740 0x08, 0x18, 0x0A, 0x1A,
741 0x0C, 0x1C, 0x0E, 0x1E );
743 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
744 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
745 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
746 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
748 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
751 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
752 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ /* macro overview: vd is replaced by the per-byte mix of vs weighted by valpha and vd weighted by 255-valpha; even and odd bytes are multiplied separately into 16-bit products */ \
753 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
754 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
755 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
756 /* valpha2 is 255-alpha: for 8-bit values the bitwise complement (vec_nor of valpha with itself) equals 255 minus the value */ \
757 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
758 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
759 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
760 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
761 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
762 /* add source and dest */ \
763 vtemp1 = vec_add(vtemp1, vtemp3); \
764 vtemp2 = vec_add(vtemp2, vtemp4); \
765 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8); combined with the final >>8 this stands in for a divide by 255 */ \
766 vtemp1 = vec_add(vtemp1, v1_16); \
767 vtemp3 = vec_sr(vtemp1, v8_16); \
768 vtemp1 = vec_add(vtemp1, vtemp3); \
769 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8); same rounding step for the odd bytes */ \
770 vtemp2 = vec_add(vtemp2, v1_16); \
771 vtemp4 = vec_sr(vtemp2, v8_16); \
772 vtemp2 = vec_add(vtemp2, vtemp4); \
773 /* (>>8) and get ARGBARGBARGBARGB: the permute picks bytes from both 16-bit product vectors and interleaves them back into pixel order */ \
774 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
777 /* Calculate the permute vector used for 32->32 swizzling */
/*
 * Build the vec_perm control vector that reorders the bytes of 32-bit
 * pixels from the srcfmt channel layout to the dstfmt channel layout.
 *
 * Either format may be substituted by default_pixel_format, whose masks
 * are 0x00FF0000, 0x0000FF00, 0x000000FF and 0xFF000000 (presumably when
 * the corresponding argument is NULL, which is how callers in this file
 * request it; the conditions are not visible in this excerpt).
 * RESHIFT turns a channel bit shift into a byte index counted from the
 * most significant byte (shift 24 gives 0, shift 0 gives 3). Each source
 * byte index is then placed at the bit position of the matching
 * destination channel.
 * For alpha, the selector 0x10 is used when the source has no alpha mask;
 * in vec_perm terms an index of 0x10 and above selects from the second
 * input vector.
 */
778 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
779 const SDL_PixelFormat *dstfmt)
782 * We have to assume that the bits that aren't used by other
783 * colors is alpha, and it's one complete byte, since some formats
784 * leave alpha with a zero mask, but we should still swizzle the bits.
787 const static struct SDL_PixelFormat default_pixel_format = {
791 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
794 srcfmt = &default_pixel_format;
797 dstfmt = &default_pixel_format;
799 const vector unsigned char plus = VECUINT8_LITERAL
800 ( 0x00, 0x00, 0x00, 0x00,
801 0x04, 0x04, 0x04, 0x04,
802 0x08, 0x08, 0x08, 0x08,
803 0x0C, 0x0C, 0x0C, 0x0C );
804 vector unsigned char vswiz;
805 vector unsigned int srcvec;
806 #define RESHIFT(X) (3 - ((X) >> 3))
807 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
808 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
809 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
811 /* Use zero for alpha if either surface doesn't have alpha */
813 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
815 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
/* Pack the four byte selectors into one word, replicate it to all four pixels, */
/* then add 0, 4, 8 and 12 so each pixel indexes its own four bytes. */
818 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
819 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
/*
 * Altivec blend of 32-bit source pixels, using the alpha in each source
 * pixel, onto a 16-bit 5-6-5 destination.
 *
 * Work is split three ways:
 *  - the scalar ONE_PIXEL_BLEND macro runs while dst is not 16-byte
 *    aligned and width is non-zero;
 *  - the vector section handles eight pixels per pass: two source vectors
 *    are realigned and swizzled, the eight destination pixels are expanded
 *    into two vectors of four 32-bit pixels using the red/alpha, blue and
 *    green permutes, both halves are blended with VEC_MULTIPLY_ALPHA, and
 *    the result is repacked to 5-6-5 and stored;
 *  - ONE_PIXEL_BLEND runs again for the width % 8 leftover.
 * The scalar path expands a destination pixel as red = bits 11..15,
 * green = bits 5..10, blue = bits 0..4, each left-aligned in a byte.
 * NOTE(review): loop framing, pointer advances and some closing lines of
 * the vector constants are not visible in this excerpt.
 */
823 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
825 int height = info->d_height;
826 Uint8 *src = (Uint8 *)info->s_pixels;
827 int srcskip = info->s_skip;
828 Uint8 *dst = (Uint8 *)info->d_pixels;
829 int dstskip = info->d_skip;
830 SDL_PixelFormat *srcfmt = info->src;
832 vector unsigned char v0 = vec_splat_u8(0);
833 vector unsigned short v8_16 = vec_splat_u16(8);
834 vector unsigned short v1_16 = vec_splat_u16(1);
835 vector unsigned short v2_16 = vec_splat_u16(2);
836 vector unsigned short v3_16 = vec_splat_u16(3);
837 vector unsigned int v8_32 = vec_splat_u32(8);
838 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
839 vector unsigned short v3f = VECUINT16_LITERAL(
840 0x003f, 0x003f, 0x003f, 0x003f,
841 0x003f, 0x003f, 0x003f, 0x003f);
842 vector unsigned short vfc = VECUINT16_LITERAL(
843 0x00fc, 0x00fc, 0x00fc, 0x00fc,
844 0x00fc, 0x00fc, 0x00fc, 0x00fc);
847 0x10 - 0x1f is the alpha
848 0x00 - 0x0e evens are the red
849 0x01 - 0x0f odds are zero
851 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
852 0x10, 0x00, 0x01, 0x01,
853 0x10, 0x02, 0x01, 0x01,
854 0x10, 0x04, 0x01, 0x01,
855 0x10, 0x06, 0x01, 0x01
857 vector unsigned char vredalpha2 = (vector unsigned char)(
858 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
861 0x00 - 0x0f is ARxx ARxx ARxx ARxx
862 0x11 - 0x0f odds are blue
864 vector unsigned char vblue1 = VECUINT8_LITERAL(
865 0x00, 0x01, 0x02, 0x11,
866 0x04, 0x05, 0x06, 0x13,
867 0x08, 0x09, 0x0a, 0x15,
868 0x0c, 0x0d, 0x0e, 0x17
870 vector unsigned char vblue2 = (vector unsigned char)(
871 vec_add((vector unsigned int)vblue1, v8_32)
874 0x00 - 0x0f is ARxB ARxB ARxB ARxB
875 0x10 - 0x0e evens are green
877 vector unsigned char vgreen1 = VECUINT8_LITERAL(
878 0x00, 0x01, 0x10, 0x03,
879 0x04, 0x05, 0x12, 0x07,
880 0x08, 0x09, 0x14, 0x0b,
881 0x0c, 0x0d, 0x16, 0x0f
883 vector unsigned char vgreen2 = (vector unsigned char)(
884 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
886 vector unsigned char vgmerge = VECUINT8_LITERAL(
887 0x00, 0x02, 0x00, 0x06,
888 0x00, 0x0a, 0x00, 0x0e,
889 0x00, 0x12, 0x00, 0x16,
890 0x00, 0x1a, 0x00, 0x1e);
891 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
892 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
893 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* vf800: splat the byte value -7 (0xF9), then shift each 16-bit element left by 8. */
/* NOTE(review): the name suggests a 0xF800 red mask; confirm the resulting constant. */
895 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
896 vf800 = vec_sl(vf800, vec_splat_u16(8));
900 vector unsigned char valigner;
901 vector unsigned char vsrc;
902 vector unsigned char voverflow;
903 int width = info->d_width;
/* Scalar fallback used for the unaligned head and the leftover tail of a row. */
905 #define ONE_PIXEL_BLEND(condition, widthvar) \
906 while (condition) { \
908 unsigned sR, sG, sB, dR, dG, dB, sA; \
909 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
911 unsigned short dstpixel = *((unsigned short *)dst); \
912 dR = (dstpixel >> 8) & 0xf8; \
913 dG = (dstpixel >> 3) & 0xfc; \
914 dB = (dstpixel << 3) & 0xf8; \
915 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
916 *((unsigned short *)dst) = ( \
917 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
924 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
/* Pixels that do not fill a whole group of eight are left for the scalar tail. */
925 extrawidth = (width % 8);
926 valigner = VEC_ALIGNER(src);
927 vsrc = (vector unsigned char)vec_ld(0, src);
930 vector unsigned char valpha;
931 vector unsigned char vsrc1, vsrc2;
932 vector unsigned char vdst1, vdst2;
933 vector unsigned short vR, vG, vB;
934 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
936 /* Load 8 pixels from src as ARGB */
937 voverflow = (vector unsigned char)vec_ld(15, src);
938 vsrc = vec_perm(vsrc, voverflow, valigner);
939 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
941 vsrc = (vector unsigned char)vec_ld(15, src);
942 voverflow = vec_perm(voverflow, vsrc, valigner);
943 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
946 /* Load 8 pixels from dst as XRGB */
947 voverflow = vec_ld(0, dst);
948 vR = vec_and((vector unsigned short)voverflow, vf800);
949 vB = vec_sl((vector unsigned short)voverflow, v3_16);
950 vG = vec_sl(vB, v2_16);
/* First four destination pixels go to vdst1, the other four to vdst2. */
951 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
952 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
953 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
954 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
955 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
956 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
958 /* Alpha blend 8 pixels as ARGB */
959 valpha = vec_perm(vsrc1, v0, valphaPermute);
960 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
961 valpha = vec_perm(vsrc2, v0, valphaPermute);
962 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
964 /* Convert 8 pixels to 565 */
965 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
966 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
967 vgpixel = vec_and(vgpixel, vfc);
968 vgpixel = vec_sl(vgpixel, v3_16);
969 vrpixel = vec_sl(vpixel, v1_16);
970 vrpixel = vec_and(vrpixel, vf800);
971 vbpixel = vec_and(vpixel, v3f);
972 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
973 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
976 vec_st(vdst1, 0, dst);
981 ONE_PIXEL_BLEND((extrawidth), extrawidth);
982 #undef ONE_PIXEL_BLEND
/*
 * Altivec colour-keyed blend of 32-bit pixels with a per-surface alpha.
 *
 * Work is split three ways:
 *  - the scalar ONE_PIXEL_BLEND macro runs while dstp is not 16-byte
 *    aligned and width is non-zero;
 *  - the vector section handles four pixels per pass: the source vector is
 *    realigned, compared against the colour key (after masking with the
 *    source RGB mask) to build vsel, swizzled, blended into the swizzled
 *    destination with VEC_MULTIPLY_ALPHA, given full alpha, and finally
 *    vec_sel puts back the original destination pixel wherever the key
 *    matched before the result is swizzled to the destination layout and
 *    stored;
 *  - ONE_PIXEL_BLEND runs again for the width % 4 leftover.
 * In the scalar path a pixel is blended only when sA is non-zero and the
 * pixel differs from the colour key; the written alpha is dA, which is
 * SDL_ALPHA_OPAQUE when the destination has an alpha mask and 0 otherwise.
 * Note that alpha and sA are both read from the source format alpha field.
 * NOTE(review): loop framing and pointer advances are not visible in this
 * excerpt.
 */
988 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
990 unsigned alpha = info->src->alpha;
991 int height = info->d_height;
992 Uint32 *srcp = (Uint32 *)info->s_pixels;
993 int srcskip = info->s_skip >> 2;
994 Uint32 *dstp = (Uint32 *)info->d_pixels;
995 int dstskip = info->d_skip >> 2;
996 SDL_PixelFormat *srcfmt = info->src;
997 SDL_PixelFormat *dstfmt = info->dst;
998 unsigned sA = srcfmt->alpha;
999 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1000 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1001 Uint32 ckey = info->src->colorkey;
1002 vector unsigned char mergePermute;
1003 vector unsigned char vsrcPermute;
1004 vector unsigned char vdstPermute;
1005 vector unsigned char vsdstPermute;
1006 vector unsigned char valpha;
1007 vector unsigned char valphamask;
1008 vector unsigned char vbits;
1009 vector unsigned char v0;
1010 vector unsigned short v1;
1011 vector unsigned short v8;
1012 vector unsigned int vckey;
1013 vector unsigned int vrgbmask;
1015 mergePermute = VEC_MERGE_PERMUTE();
1016 v0 = vec_splat_u8(0);
1017 v1 = vec_splat_u16(1);
1018 v8 = vec_splat_u16(8);
1020 /* set the alpha to 255 on the destination surf */
1021 valphamask = VEC_ALPHA_MASK();
/* Three swizzles: source to default layout, default to destination, destination to default. */
1023 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1024 vdstPermute = calc_swizzle32(NULL, dstfmt);
1025 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1027 /* set a vector full of alpha and 255-alpha */
1028 ((unsigned char *)&valpha)[0] = alpha;
1029 valpha = vec_splat(valpha, 0);
1030 vbits = (vector unsigned char)vec_splat_s8(-1);
/* Replicate the colour key and the RGB mask into all four 32-bit elements. */
1033 ((unsigned int *)(char*)&vckey)[0] = ckey;
1034 vckey = vec_splat(vckey, 0);
1035 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1036 vrgbmask = vec_splat(vrgbmask, 0);
1039 int width = info->d_width;
1040 #define ONE_PIXEL_BLEND(condition, widthvar) \
1041 while (condition) { \
1043 unsigned sR, sG, sB, dR, dG, dB; \
1044 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1045 if(sA && Pixel != ckey) { \
1046 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1047 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1048 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1049 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1055 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* Pixels that do not fill a whole group of four are left for the scalar tail. */
1057 int extrawidth = (width % 4);
1058 vector unsigned char valigner = VEC_ALIGNER(srcp);
1059 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1060 width -= extrawidth;
1062 vector unsigned char vsel;
1063 vector unsigned char voverflow;
1064 vector unsigned char vd;
1065 vector unsigned char vd_orig;
1068 voverflow = (vector unsigned char)vec_ld(15, srcp);
1069 vs = vec_perm(vs, voverflow, valigner);
1071 /* vsel is set for items that match the key */
1072 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1073 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1075 /* permute to source format */
1076 vs = vec_perm(vs, valpha, vsrcPermute);
/* Keep an untouched copy of the swizzled destination for the colour-key restore below. */
1079 vd = (vector unsigned char)vec_ld(0, dstp);
1080 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1082 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1084 /* set the alpha channel to full on */
1085 vd = vec_or(vd, valphamask);
1087 /* mask out color key */
1088 vd = vec_sel(vd, vd_orig, vsel);
1090 /* permute to dest format */
1091 vd = vec_perm(vd, vbits, vdstPermute);
1094 vec_st((vector unsigned int)vd, 0, dstp);
1101 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1103 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter for 32-bit -> 32-bit surfaces using the per-pixel alpha
 * of the source.
 *
 * info supplies the pixel pointers, skips, dimensions and both pixel formats.
 * Pixels are blended one at a time with ONE_PIXEL_BLEND while dstp is
 * unaligned (UNALIGNED_PTR) and again for the trailing width % 4 pixels;
 * the statements in between form the vector path built on
 * VEC_MULTIPLY_ALPHA.  Both paths write the destination alpha back
 * unchanged: the scalar path reassembles with dA read from the destination,
 * the vector path saves it in vdstalpha and ORs it back in.
 */
1111 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1113 int width = info->d_width;
1114 int height = info->d_height;
1115 Uint32 *srcp = (Uint32 *)info->s_pixels;
1116 int srcskip = info->s_skip >> 2;
1117 Uint32 *dstp = (Uint32 *)info->d_pixels;
1118 int dstskip = info->d_skip >> 2;
1119 SDL_PixelFormat *srcfmt = info->src;
1120 SDL_PixelFormat *dstfmt = info->dst;
1121 vector unsigned char mergePermute;
1122 vector unsigned char valphaPermute;
1123 vector unsigned char vsrcPermute;
1124 vector unsigned char vdstPermute;
1125 vector unsigned char vsdstPermute;
1126 vector unsigned char valphamask;
1127 vector unsigned char vpixelmask;
1128 vector unsigned char v0;
1129 vector unsigned short v1;
1130 vector unsigned short v8;
/* constant vectors: all-zero bytes, and 16-bit lanes holding 1 and 8 */
1132 v0 = vec_splat_u8(0);
1133 v1 = vec_splat_u16(1);
1134 v8 = vec_splat_u16(8);
1135 mergePermute = VEC_MERGE_PERMUTE();
1136 valphamask = VEC_ALPHA_MASK();
/* byte indices with the low two bits cleared (0,0,0,0,4,4,4,4,...), used
   below to pick one byte per 32-bit pixel as that pixel's alpha */
1137 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* complement of the alpha mask */
1138 vpixelmask = vec_nor(valphamask, v0);
/* permute vectors built from the source and destination formats
   (calc_swizzle32 is defined elsewhere in the file) */
1139 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1140 vdstPermute = calc_swizzle32(NULL, dstfmt);
1141 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1143 while ( height-- ) {
1144 width = info->d_width;
1145 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1147 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1148 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1150 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1151 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1152 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1158 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are left for the scalar tail */
1162 int extrawidth = (width % 4);
1163 vector unsigned char valigner = VEC_ALIGNER(srcp);
1164 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1165 width -= extrawidth;
1167 vector unsigned char voverflow;
1168 vector unsigned char vd;
1169 vector unsigned char valpha;
1170 vector unsigned char vdstalpha;
/* combine two loads through valigner (presumably because srcp need not be
   16-byte aligned), then swizzle through the source-format permute */
1172 voverflow = (vector unsigned char)vec_ld(15, srcp);
1173 vs = vec_perm(vs, voverflow, valigner);
1174 vs = vec_perm(vs, v0, vsrcPermute);
1176 valpha = vec_perm(vs, v0, valphaPermute);
1179 vd = (vector unsigned char)vec_ld(0, dstp);
1180 vd = vec_perm(vd, v0, vsdstPermute);
/* keep the destination alpha bytes so they can be restored after blending */
1181 vdstalpha = vec_and(vd, valphamask);
1183 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1185 /* set the alpha to the dest alpha */
1186 vd = vec_and(vd, vpixelmask);
1187 vd = vec_or(vd, vdstalpha);
1188 vd = vec_perm(vd, v0, vdstPermute);
1191 vec_st((vector unsigned int)vd, 0, dstp);
1199 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1203 #undef ONE_PIXEL_BLEND
/*
 * AltiVec version of the per-pixel-alpha blit for fixed 32-bit layouts.
 *
 * No format permutes are used here.  The scalar ONE_PIXEL_BLEND path takes
 * alpha from bits 24..31 of the source pixel, copies the low 24 bits straight
 * through when alpha is SDL_ALPHA_OPAQUE, and otherwise blends the 0xff00ff
 * and 0xff00 channel groups separately.  In every case the top byte of the
 * destination (dalpha, or the vdstalpha vector) is written back unchanged.
 * The scalar path runs while dstp is unaligned and for the trailing
 * width % 4 pixels.
 */
1207 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1208 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1210 int width = info->d_width;
1211 int height = info->d_height;
1212 Uint32 *srcp = (Uint32 *)info->s_pixels;
1213 int srcskip = info->s_skip >> 2;
1214 Uint32 *dstp = (Uint32 *)info->d_pixels;
1215 int dstskip = info->d_skip >> 2;
1216 vector unsigned char mergePermute;
1217 vector unsigned char valphaPermute;
1218 vector unsigned char valphamask;
1219 vector unsigned char vpixelmask;
1220 vector unsigned char v0;
1221 vector unsigned short v1;
1222 vector unsigned short v8;
1223 v0 = vec_splat_u8(0);
1224 v1 = vec_splat_u16(1);
1225 v8 = vec_splat_u16(8);
1226 mergePermute = VEC_MERGE_PERMUTE();
1227 valphamask = VEC_ALPHA_MASK();
/* byte indices with the low two bits cleared: selects one byte per 32-bit
   pixel to act as that pixel's alpha */
1228 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* complement of the alpha mask */
1231 vpixelmask = vec_nor(valphamask, v0);
1233 width = info->d_width;
1234 #define ONE_PIXEL_BLEND(condition, widthvar) \
1235 while ((condition)) { \
1241 Uint32 alpha = s >> 24; \
1243 if(alpha == SDL_ALPHA_OPAQUE) { \
1244 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1247 dalpha = d & 0xff000000; \
1248 s1 = s & 0xff00ff; \
1249 d1 = d & 0xff00ff; \
1250 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1253 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1254 *dstp = d1 | d | dalpha; \
1261 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are left for the scalar tail */
1263 int extrawidth = (width % 4);
1264 vector unsigned char valigner = VEC_ALIGNER(srcp);
1265 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1266 width -= extrawidth;
1268 vector unsigned char voverflow;
1269 vector unsigned char vd;
1270 vector unsigned char valpha;
1271 vector unsigned char vdstalpha;
/* combine two loads through valigner to form the source vector */
1273 voverflow = (vector unsigned char)vec_ld(15, srcp);
1274 vs = vec_perm(vs, voverflow, valigner);
1276 valpha = vec_perm(vs, v0, valphaPermute);
1279 vd = (vector unsigned char)vec_ld(0, dstp);
/* keep the destination alpha bytes so they can be restored after blending */
1280 vdstalpha = vec_and(vd, valphamask);
1282 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1284 /* set the alpha to the dest alpha */
1285 vd = vec_and(vd, vpixelmask);
1286 vd = vec_or(vd, vdstalpha);
1289 vec_st((vector unsigned int)vd, 0, dstp);
1296 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1301 #undef ONE_PIXEL_BLEND
/*
 * AltiVec blitter for 32-bit -> 32-bit surfaces using the per-surface alpha
 * stored in info->src->alpha.
 *
 * Source and destination are run through permutes from calc_swizzle32.
 * The scalar ONE_PIXEL_BLEND path (used while dstp is unaligned and for the
 * trailing width % 4 pixels) writes dA as destination alpha, which is
 * SDL_ALPHA_OPAQUE when the destination format has an alpha mask and 0
 * otherwise.  The vector path ORs valphamask into the blended result.
 *
 * NOTE(review): alpha and sA are both read from info->src->alpha
 * (srcfmt is info->src), so they hold the same value.
 */
1304 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1307 unsigned alpha = info->src->alpha;
1308 int height = info->d_height;
1309 Uint32 *srcp = (Uint32 *)info->s_pixels;
1310 int srcskip = info->s_skip >> 2;
1311 Uint32 *dstp = (Uint32 *)info->d_pixels;
1312 int dstskip = info->d_skip >> 2;
1313 SDL_PixelFormat *srcfmt = info->src;
1314 SDL_PixelFormat *dstfmt = info->dst;
1315 unsigned sA = srcfmt->alpha;
1316 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1317 vector unsigned char mergePermute;
1318 vector unsigned char vsrcPermute;
1319 vector unsigned char vdstPermute;
1320 vector unsigned char vsdstPermute;
1321 vector unsigned char valpha;
1322 vector unsigned char valphamask;
1323 vector unsigned char vbits;
1324 vector unsigned short v1;
1325 vector unsigned short v8;
1327 mergePermute = VEC_MERGE_PERMUTE();
1328 v1 = vec_splat_u16(1);
1329 v8 = vec_splat_u16(8);
1331 /* mask that is ORed into the result to set the alpha to 255 on the destination surf */
1332 valphamask = VEC_ALPHA_MASK();
/* permute vectors built from the source and destination formats */
1334 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1335 vdstPermute = calc_swizzle32(NULL, dstfmt);
1336 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1338 /* store alpha in byte 0 of valpha, then replicate it into every byte */
1339 ((unsigned char *)&valpha)[0] = alpha;
1340 valpha = vec_splat(valpha, 0);
/* all bits set; second input of the final destination permute */
1341 vbits = (vector unsigned char)vec_splat_s8(-1);
1344 int width = info->d_width;
1345 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1347 unsigned sR, sG, sB, dR, dG, dB; \
1348 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1349 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1350 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1351 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1356 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are left for the scalar tail */
1358 int extrawidth = (width % 4);
1359 vector unsigned char valigner = VEC_ALIGNER(srcp);
1360 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1361 width -= extrawidth;
1363 vector unsigned char voverflow;
1364 vector unsigned char vd;
/* combine two loads through valigner, then swizzle the source */
1367 voverflow = (vector unsigned char)vec_ld(15, srcp);
1368 vs = vec_perm(vs, voverflow, valigner);
1369 vs = vec_perm(vs, valpha, vsrcPermute);
1372 vd = (vector unsigned char)vec_ld(0, dstp);
1373 vd = vec_perm(vd, vd, vsdstPermute);
1375 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1377 /* set the alpha channel to full on */
1378 vd = vec_or(vd, valphamask);
1379 vd = vec_perm(vd, vbits, vdstPermute);
1382 vec_st((vector unsigned int)vd, 0, dstp);
1389 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1391 #undef ONE_PIXEL_BLEND
/*
 * AltiVec version of the per-surface-alpha blit for fixed 32-bit layouts.
 *
 * The blend factor is info->src->alpha.  No format permutes are used.
 * The scalar ONE_PIXEL_BLEND path (used while dstp is unaligned and for the
 * trailing width % 4 pixels) blends the 0xff00ff and 0xff00 channel groups
 * separately and stores the result with the top byte forced to 0xff.
 * The vector path ORs valphamask into the blended result.
 */
1400 /* fast RGB888->(A)RGB888 blending */
1401 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1403 unsigned alpha = info->src->alpha;
1404 int height = info->d_height;
1405 Uint32 *srcp = (Uint32 *)info->s_pixels;
1406 int srcskip = info->s_skip >> 2;
1407 Uint32 *dstp = (Uint32 *)info->d_pixels;
1408 int dstskip = info->d_skip >> 2;
1409 vector unsigned char mergePermute;
1410 vector unsigned char valpha;
1411 vector unsigned char valphamask;
1412 vector unsigned short v1;
1413 vector unsigned short v8;
1415 mergePermute = VEC_MERGE_PERMUTE();
1416 v1 = vec_splat_u16(1);
1417 v8 = vec_splat_u16(8);
1419 /* mask that is ORed into the result to set the alpha to 255 on the destination surf */
1420 valphamask = VEC_ALPHA_MASK();
1422 /* store alpha in byte 0 of valpha, then replicate it into every byte */
1423 ((unsigned char *)&valpha)[0] = alpha;
1424 valpha = vec_splat(valpha, 0);
1427 int width = info->d_width;
1428 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1431 Uint32 s1 = s & 0xff00ff; \
1432 Uint32 d1 = d & 0xff00ff; \
1433 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1437 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1438 *dstp = d1 | d | 0xff000000; \
1443 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a group of four are left for the scalar tail */
1445 int extrawidth = (width % 4);
1446 vector unsigned char valigner = VEC_ALIGNER(srcp);
1447 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1448 width -= extrawidth;
1450 vector unsigned char voverflow;
1451 vector unsigned char vd;
/* combine two loads through valigner to form the source vector */
1454 voverflow = (vector unsigned char)vec_ld(15, srcp);
1455 vs = vec_perm(vs, voverflow, valigner);
1458 vd = (vector unsigned char)vec_ld(0, dstp);
1460 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1462 /* set the alpha channel to full on */
1463 vd = vec_or(vd, valphamask);
1466 vec_st((vector unsigned int)vd, 0, dstp);
1473 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1475 #undef ONE_PIXEL_BLEND
1482 #pragma altivec_model off
1484 #endif /* SDL_ALTIVEC_BLITTERS */
/*
 * 50% blend of two 32-bit pixels without unpacking the channels.
 *
 * The low bit of each of the three low bytes is masked off (0x00fefefe)
 * before the add and shift so that no channel carries into its neighbour;
 * a bit that was set in both inputs (s & d & 0x00010101) is then added
 * back.  The top byte of the stored pixel is forced to 0xff.
 * s and d are presumably the current source and destination pixels; their
 * declarations are not part of the visible lines.
 */
1486 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1487 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1489 int width = info->d_width;
1490 int height = info->d_height;
1491 Uint32 *srcp = (Uint32 *)info->s_pixels;
1492 int srcskip = info->s_skip >> 2;
1493 Uint32 *dstp = (Uint32 *)info->d_pixels;
1494 int dstskip = info->d_skip >> 2;
1500 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1501 + (s & d & 0x00010101)) | 0xff000000;
/*
 * Per-surface-alpha blit for fixed 32-bit layouts, blend factor taken from
 * info->src->alpha.
 *
 * One case is delegated to BlitRGBtoRGBSurfaceAlpha128 (the guarding
 * condition is not among the visible lines; presumably alpha == 128).
 * Otherwise DUFFS_LOOP_DOUBLE2 is given a one-pixel body and a two-pixel
 * body.  Every store forces the top byte of the destination to 0xff.
 */
1508 /* fast RGB888->(A)RGB888 blending with surface alpha */
1509 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1511 unsigned alpha = info->src->alpha;
1513 BlitRGBtoRGBSurfaceAlpha128(info);
1515 int width = info->d_width;
1516 int height = info->d_height;
1517 Uint32 *srcp = (Uint32 *)info->s_pixels;
1518 int srcskip = info->s_skip >> 2;
1519 Uint32 *dstp = (Uint32 *)info->d_pixels;
1520 int dstskip = info->d_skip >> 2;
1527 DUFFS_LOOP_DOUBLE2({
1528 /* One Pixel Blend */
1533 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1537 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1538 *dstp = d1 | d | 0xff000000;
1542 /* Two Pixels Blend */
1547 d1 += (s1 - d1) * alpha >> 8;
/* pack the 0xff00 channel of this pixel and of the next one (srcp[1],
   dstp[1]) into one word so both are blended by a single multiply */
1550 s = ((s & 0xff00) >> 8) |
1551 ((srcp[1] & 0xff00) << 8);
1552 d = ((d & 0xff00) >> 8) |
1553 ((dstp[1] & 0xff00) << 8);
1554 d += (s - d) * alpha >> 8;
/* first pixel of the pair: low half of the packed result, moved back up */
1557 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1564 d1 += (s1 - d1) * alpha >> 8;
/* second pixel of the pair: high half of the packed result, moved back down */
1567 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
/*
 * Plain C per-pixel-alpha blit for fixed 32-bit layouts.
 *
 * Alpha is taken from bits 24..31 of the source pixel.  When it equals
 * SDL_ALPHA_OPAQUE the low 24 bits of the source are copied and the top
 * byte of the destination is kept.  Otherwise the 0xff00ff and 0xff00
 * channel groups are blended separately with (s - d) * alpha >> 8 and the
 * original top byte of the destination (dalpha) is ORed back in.
 */
1577 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1578 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1580 int width = info->d_width;
1581 int height = info->d_height;
1582 Uint32 *srcp = (Uint32 *)info->s_pixels;
1583 int srcskip = info->s_skip >> 2;
1584 Uint32 *dstp = (Uint32 *)info->d_pixels;
1585 int dstskip = info->d_skip >> 2;
1594 Uint32 alpha = s >> 24;
1595 /* FIXME: Here we special-case opaque alpha since the
1596 compositioning used (>>8 instead of /255) doesn't handle
1597 it correctly. Also special-case alpha=0 for speed?
1600 if(alpha == SDL_ALPHA_OPAQUE) {
1601 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1604 * take out the middle component (green), and process
1605 * the other two in parallel. One multiply less.
1608 dalpha = d & 0xff000000;
1611 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1614 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1615 *dstp = d1 | d | dalpha;
/*
 * GCC inline-assembly variant of the per-pixel-alpha 32-bit blit.
 *
 * Register roles set up by the first asm statement, as stated by the
 * per-instruction comments below:
 *   mm6 = zero, mm7 = multiply mask that zeroes the alpha lane,
 *   mm4 = colour channel mask, mm3 = its complement, mm5 = alpha shift.
 * Per pixel, alpha is *srcp & amask.  When alpha equals amask the colour
 * channels are copied and the destination alpha is kept; the general case
 * computes dst + ((src - dst) * alpha >> 8) on unpacked 16-bit lanes.
 * The source alpha mask and shift come from info->src.
 */
1627 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1628 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1630 int width = info->d_width;
1631 int height = info->d_height;
1632 Uint32 *srcp = (Uint32 *)info->s_pixels;
1633 int srcskip = info->s_skip >> 2;
1634 Uint32 *dstp = (Uint32 *)info->d_pixels;
1635 int dstskip = info->d_skip >> 2;
1636 SDL_PixelFormat* sf = info->src;
1637 Uint32 amask = sf->Amask;
1640 /* make mm6 all zeros. */
1641 "pxor %%mm6, %%mm6\n"
1643 /* Make a mask to preserve the alpha. */
1644 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1645 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1646 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1647 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1648 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1650 /* form channel masks */
1651 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1652 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1653 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1654 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1656 /* get alpha channel shift */
1657 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1659 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1669 : : "r" (srcp), "r" (dstp) );
1671 alpha = *srcp & amask;
1672 /* FIXME: Here we special-case opaque alpha since the
1673 compositioning used (>>8 instead of /255) doesn't handle
1674 it correctly. Also special-case alpha=0 for speed?
1679 else if(alpha == amask) {
1680 /* opaque alpha -- copy RGB, keep dst alpha */
1681 /* using MMX here to free up regular registers for other things */
1683 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1684 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1685 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1686 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1687 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1688 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1690 : : "r" (srcp), "r" (dstp) );
1695 /* load in the source, and dst. */
1696 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1697 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1699 /* Move the src alpha into mm2 */
1701 /* if supporting pshufw */
1702 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1703 /*"psrlw $8, %%mm2\n" */
1707 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1708 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1709 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1710 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1712 /* move the colors into words. */
1713 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1714 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1717 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1720 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1721 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1722 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1724 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1726 "movd %%mm0, (%1)\n" /* result in mm0 */
1728 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1742 /* End GCC_ASMBLIT*/
/*
 * MSVC intrinsics variant of the per-pixel-alpha 32-bit blit.
 *
 * The colour channel mask, alpha mask and alpha shift are read from the
 * source format (info->src).  Per pixel, alpha is *srcp & amask.  When alpha
 * equals amask the colour channels are copied and the remaining destination
 * bits are kept.  The general case unpacks both pixels to 16-bit lanes and
 * computes dst + ((src - dst) * alpha >> 8), with the alpha lane of the
 * multiplier zeroed through dmask so the destination alpha is unchanged.
 * Source and destination are prefetched 16 pixels ahead.
 */
1745 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1746 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1748 int width = info->d_width;
1749 int height = info->d_height;
1750 Uint32 *srcp = (Uint32 *)info->s_pixels;
1751 int srcskip = info->s_skip >> 2;
1752 Uint32 *dstp = (Uint32 *)info->d_pixels;
1753 int dstskip = info->d_skip >> 2;
1754 SDL_PixelFormat* sf = info->src;
1755 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1756 Uint32 amask = sf->Amask;
1757 Uint32 ashift = sf->Ashift;
1760 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1762 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* after unpacking, each 8-bit channel occupies a 16-bit lane, so a channel
   at bit offset ashift sits at bit offset ashift * 2; clear that lane */
1763 multmask = ~(0xFFFFi64 << (ashift * 2));
1764 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1770 _m_prefetch(srcp + 16);
1771 _m_prefetch(dstp + 16);
1773 alpha = *srcp & amask;
1776 } else if (alpha == amask) {
1777 /* copy RGB, keep dst alpha */
1778 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1780 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1781 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1783 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1784 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1786 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1787 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1788 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1789 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1790 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1793 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1794 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1795 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1796 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1797 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1799 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1809 /* End MSVC_ASMBLIT */
1811 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1813 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
/*
 * blend a single 16 bit pixel at 50%
 *
 * d, s  - destination and source pixel (16 bit values)
 * mask  - the pixel format's channel bits with the lowest bit of every
 *         channel cleared (0xf7de for 565, 0xfbde for 555)
 *
 * The masked halves are summed and shifted, so no channel carries into its
 * neighbour; a low bit set in both inputs is added back afterwards.
 * Every argument is parenthesised so expressions may be passed safely.
 * Arguments are evaluated more than once: do not pass side effects.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * replicate a 16 bit channel mask into both halves of a 32 bit word
 *
 * The mask is widened to Uint32 before shifting: shifting a promoted
 * (signed int) value such as 0xf7de left by 16 is undefined behaviour.
 */
#define BLEND2x16_MASK(mask)						\
	((Uint32)(mask) | ((Uint32)(mask) << 16))

/*
 * blend two 16 bit pixels at 50%
 *
 * d, s  - 32 bit words each holding two packed 16 bit pixels
 * mask  - same 16 bit mask as for BLEND16_50
 *
 * Each operand is halved before the add, so the sum cannot overflow the
 * 32 bit word.  Arguments are evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)					\
	((((s) & BLEND2x16_MASK(mask)) >> 1) +				\
	 (((d) & BLEND2x16_MASK(mask)) >> 1) +				\
	 ((s) & (d) & ~BLEND2x16_MASK(mask)))
/*
 * 50% blend of a 16-bit surface onto another 16-bit surface.
 *
 * info  - blit description (pixel pointers, skips in bytes, dimensions)
 * mask  - 16-bit mask handed on to BLEND16_50 / BLEND2x16_50; the callers
 *         in this file pass 0xf7de (565) and 0xfbde (555)
 *
 * Two strategies are chosen from bit 1 of the source and destination
 * addresses.  If the bit differs, the destination is brought to a 32-bit
 * boundary and the source is read as aligned 32-bit words; each output pair
 * is assembled from the previous and the current source word (prev_sw, sw),
 * with the shift direction depending on SDL_BYTEORDER.  If the bit is equal,
 * an optional leading pixel is blended alone and the rest goes two pixels
 * at a time, followed by an optional trailing pixel.
 */
1824 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1826 int width = info->d_width;
1827 int height = info->d_height;
1828 Uint16 *srcp = (Uint16 *)info->s_pixels;
1829 int srcskip = info->s_skip >> 1;
1830 Uint16 *dstp = (Uint16 *)info->d_pixels;
1831 int dstskip = info->d_skip >> 1;
/* true when exactly one of the two pointers is on a 32-bit boundary */
1834 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1836 * Source and destination not aligned, pipeline it.
1837 * This is mostly a win for big blits but no loss for
1843 /* handle odd destination */
1844 if((uintptr_t)dstp & 2) {
1845 Uint16 d = *dstp, s = *srcp;
1846 *dstp = BLEND16_50(d, s, mask);
1851 srcp++; /* srcp is now 32-bit aligned */
1853 /* bootstrap pipeline with first halfword */
1854 prev_sw = ((Uint32 *)srcp)[-1];
1858 sw = *(Uint32 *)srcp;
1859 dw = *(Uint32 *)dstp;
1860 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1861 s = (prev_sw << 16) + (sw >> 16);
1863 s = (prev_sw >> 16) + (sw << 16);
1866 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1872 /* final pixel if any */
1874 Uint16 d = *dstp, s;
1875 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1876 s = (Uint16)prev_sw;
1878 s = (Uint16)(prev_sw >> 16);
1880 *dstp = BLEND16_50(d, s, mask);
1884 srcp += srcskip - 1;
1887 /* source and destination are aligned */
1890 /* first odd pixel? */
1891 if((uintptr_t)srcp & 2) {
1892 Uint16 d = *dstp, s = *srcp;
1893 *dstp = BLEND16_50(d, s, mask);
1898 /* srcp and dstp are now 32-bit aligned */
1901 Uint32 sw = *(Uint32 *)srcp;
1902 Uint32 dw = *(Uint32 *)dstp;
1903 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1909 /* last odd pixel? */
1911 Uint16 d = *dstp, s = *srcp;
1912 *dstp = BLEND16_50(d, s, mask);
/*
 * RGB565 per-surface-alpha blit, GCC MMX macro variant.
 *
 * One case is delegated to Blit16to16SurfaceAlpha128 with mask 0xf7de (the
 * guarding condition is not among the visible lines; presumably
 * alpha == 128).  Otherwise the low three bits of alpha are dropped and
 * alpha is reduced to 5 bits for the scalar one-pixel blends, which spread
 * a pixel over 32 bits with mask 0x07e0f81f and blend all channels with one
 * multiply.  The MMX section loads four pixels at a time and blends red,
 * green and blue separately using mm0 (alpha), mm4 (green mask) and mm7
 * (blue mask), accumulating the result in mm1.
 */
1923 /* fast RGB565->RGB565 blending with surface alpha */
1924 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1926 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
1928 Blit16to16SurfaceAlpha128(info, 0xf7de);
1930 int width = info->d_width;
1931 int height = info->d_height;
1932 Uint16 *srcp = (Uint16 *)info->s_pixels;
1933 int srcskip = info->s_skip >> 1;
1934 Uint16 *dstp = (Uint16 *)info->d_pixels;
1935 int dstskip = info->d_skip >> 1;
1939 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1941 alpha >>= 3; /* downscale alpha to 5 bits */
1943 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1944 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1945 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1946 /* position alpha to allow for mullo and mulhi on diff channels
1947 to reduce the number of operations */
1950 /* Setup the 565 color channel masks */
1951 load = 0x07E007E007E007E0ULL;
1952 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1953 load = 0x001F001F001F001FULL;
1954 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1961 * shift out the middle component (green) to
1962 * the high 16 bits, and process all three RGB
1963 * components at the same time.
1965 s = (s | s << 16) & 0x07e0f81f;
1966 d = (d | d << 16) & 0x07e0f81f;
1967 d += (s - d) * alpha >> 5;
1969 *dstp++ = d | d >> 16;
1974 * shift out the middle component (green) to
1975 * the high 16 bits, and process all three RGB
1976 * components at the same time.
1978 s = (s | s << 16) & 0x07e0f81f;
1979 d = (d | d << 16) & 0x07e0f81f;
1980 d += (s - d) * alpha >> 5;
1982 *dstp++ = d | d >> 16;
1986 * shift out the middle component (green) to
1987 * the high 16 bits, and process all three RGB
1988 * components at the same time.
1990 s = (s | s << 16) & 0x07e0f81f;
1991 d = (d | d << 16) & 0x07e0f81f;
1992 d += (s - d) * alpha >> 5;
1994 *dstp++ = d | d >> 16;
/* MMX section: four 16-bit pixels per 64-bit load */
1996 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1997 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1999 /* red -- does not need a mask since the right shift clears
2000 the uninteresting bits */
2001 movq_r2r(mm2, mm5); /* src -> mm5 */
2002 movq_r2r(mm3, mm6); /* dst -> mm6 */
2003 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2004 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2007 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2008 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2009 /* alpha used is actually 11 bits
2010 11 + 5 = 16 bits, so the sign bits are lost */
2011 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2012 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2013 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2015 movq_r2r(mm6, mm1); /* save new reds in dsts */
2017 /* green -- process the bits in place */
2018 movq_r2r(mm2, mm5); /* src -> mm5 */
2019 movq_r2r(mm3, mm6); /* dst -> mm6 */
2020 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2021 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2024 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2025 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2026 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2027 bits are gone and the sign bits present */
2028 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2029 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2031 por_r2r(mm6, mm1); /* save new greens in dsts */
2034 movq_r2r(mm2, mm5); /* src -> mm5 */
2035 movq_r2r(mm3, mm6); /* dst -> mm6 */
2036 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2037 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2040 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2041 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2042 /* 11 + 5 = 16 bits, so the sign bits are lost and
2043 the interesting bits will need to be MASKed */
2044 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2045 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2046 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2048 por_r2r(mm6, mm1); /* save new blues in dsts */
2050 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
/*
 * RGB555 per-surface-alpha blit, GCC MMX macro variant.
 *
 * One case is delegated to Blit16to16SurfaceAlpha128 with mask 0xfbde (the
 * guarding condition is not among the visible lines; presumably
 * alpha == 128).  Otherwise the low three bits of alpha are dropped and
 * alpha is reduced to 5 bits for the scalar one-pixel blends, which spread
 * a pixel over 32 bits with mask 0x03e07c1f.  The MMX section loads four
 * pixels at a time; the green mask in mm4 is temporarily shifted left by 5
 * to serve as the red mask and shifted back afterwards, so that mm1 stays
 * free to accumulate the result.
 */
2062 /* fast RGB555->RGB555 blending with surface alpha */
2063 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2065 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
2067 Blit16to16SurfaceAlpha128(info, 0xfbde);
2069 int width = info->d_width;
2070 int height = info->d_height;
2071 Uint16 *srcp = (Uint16 *)info->s_pixels;
2072 int srcskip = info->s_skip >> 1;
2073 Uint16 *dstp = (Uint16 *)info->d_pixels;
2074 int dstskip = info->d_skip >> 1;
2078 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2080 alpha >>= 3; /* downscale alpha to 5 bits */
2082 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2083 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2084 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2085 /* position alpha to allow for mullo and mulhi on diff channels
2086 to reduce the number of operations */
2089 /* Setup the 555 color channel masks */
2090 load = 0x03E003E003E003E0ULL;
2091 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2092 load = 0x001F001F001F001FULL;
2093 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2100 * shift out the middle component (green) to
2101 * the high 16 bits, and process all three RGB
2102 * components at the same time.
2104 s = (s | s << 16) & 0x03e07c1f;
2105 d = (d | d << 16) & 0x03e07c1f;
2106 d += (s - d) * alpha >> 5;
2108 *dstp++ = d | d >> 16;
2113 * shift out the middle component (green) to
2114 * the high 16 bits, and process all three RGB
2115 * components at the same time.
2117 s = (s | s << 16) & 0x03e07c1f;
2118 d = (d | d << 16) & 0x03e07c1f;
2119 d += (s - d) * alpha >> 5;
2121 *dstp++ = d | d >> 16;
2125 * shift out the middle component (green) to
2126 * the high 16 bits, and process all three RGB
2127 * components at the same time.
2129 s = (s | s << 16) & 0x03e07c1f;
2130 d = (d | d << 16) & 0x03e07c1f;
2131 d += (s - d) * alpha >> 5;
2133 *dstp++ = d | d >> 16;
/* MMX section: four 16-bit pixels per 64-bit load */
2135 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2136 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2138 /* red -- process the bits in place */
2139 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2140 /* by reusing the GREEN mask we free up another mmx
2141 register to accumulate the result */
2143 movq_r2r(mm2, mm5); /* src -> mm5 */
2144 movq_r2r(mm3, mm6); /* dst -> mm6 */
2145 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2146 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2149 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2150 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2151 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2152 cleared by a MASK below */
2153 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2154 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2155 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2157 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2159 movq_r2r(mm6, mm1); /* save new reds in dsts */
2161 /* green -- process the bits in place */
2162 movq_r2r(mm2, mm5); /* src -> mm5 */
2163 movq_r2r(mm3, mm6); /* dst -> mm6 */
2164 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2165 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2168 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2169 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2170 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2171 bits are gone and the sign bits present */
2172 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2173 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2175 por_r2r(mm6, mm1); /* save new greens in dsts */
2178 movq_r2r(mm2, mm5); /* src -> mm5 */
2179 movq_r2r(mm3, mm6); /* dst -> mm6 */
2180 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2181 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2184 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2185 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2186 /* 11 + 5 = 16 bits, so the sign bits are lost and
2187 the interesting bits will need to be MASKed */
2188 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2189 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2190 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2192 por_r2r(mm6, mm1); /* save new blues in dsts */
2194 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2205 /* End GCC_ASMBLIT */
/*
 * RGB565 per-surface-alpha blit, MSVC intrinsics variant.
 *
 * One case is delegated to Blit16to16SurfaceAlpha128 with mask 0xf7de (the
 * guarding condition is not among the visible lines; presumably
 * alpha == 128).  Otherwise the low three bits of alpha are dropped.
 * mm_alpha is loaded from that 8-bit value before the scalar alpha is
 * reduced to 5 bits, replicated into all four 16-bit lanes and shifted left
 * by 3.  The scalar one-pixel blends use the 5-bit alpha with the
 * 0x07e0f81f spread mask; the MMX section handles four pixels per 64-bit
 * load and combines the red, green and blue results in mm_res.
 */
2208 /* fast RGB565->RGB565 blending with surface alpha */
2209 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2211 unsigned alpha = info->src->alpha;
2213 Blit16to16SurfaceAlpha128(info, 0xf7de);
2215 int width = info->d_width;
2216 int height = info->d_height;
2217 Uint16 *srcp = (Uint16 *)info->s_pixels;
2218 int srcskip = info->s_skip >> 1;
2219 Uint16 *dstp = (Uint16 *)info->d_pixels;
2220 int dstskip = info->d_skip >> 1;
2223 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2225 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2226 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2227 alpha >>= 3; /* downscale alpha to 5 bits */
2229 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2230 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2231 /* position alpha to allow for mullo and mulhi on diff channels
2232 to reduce the number of operations */
2233 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2235 /* Setup the 565 color channel masks */
2236 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2237 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2245 * shift out the middle component (green) to
2246 * the high 16 bits, and process all three RGB
2247 * components at the same time.
2249 s = (s | s << 16) & 0x07e0f81f;
2250 d = (d | d << 16) & 0x07e0f81f;
2251 d += (s - d) * alpha >> 5;
2253 *dstp++ = (Uint16)(d | d >> 16);
2258 * shift out the middle component (green) to
2259 * the high 16 bits, and process all three RGB
2260 * components at the same time.
2262 s = (s | s << 16) & 0x07e0f81f;
2263 d = (d | d << 16) & 0x07e0f81f;
2264 d += (s - d) * alpha >> 5;
2266 *dstp++ = (Uint16)(d | d >> 16);
2270 * shift out the middle component (green) to
2271 * the high 16 bits, and process all three RGB
2272 * components at the same time.
2274 s = (s | s << 16) & 0x07e0f81f;
2275 d = (d | d << 16) & 0x07e0f81f;
2276 d += (s - d) * alpha >> 5;
2278 *dstp++ = (Uint16)(d | d >> 16);
/* MMX section: four 16-bit pixels per 64-bit load */
2280 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2281 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
/* red: shifting right by 11 isolates the top five bits of each pixel */
2285 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2288 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2291 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2292 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2293 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2294 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2295 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2297 mm_res = dst2; /* RED -> mm_res */
2299 /* green -- process the bits in place */
2301 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2304 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2307 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2308 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2309 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2310 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2312 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2316 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2319 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2322 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2323 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2324 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2325 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2326 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2328 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2330 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2342 /* fast RGB555->RGB555 blending with surface alpha */
2343 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2345 unsigned alpha = info->src->alpha;
2347 Blit16to16SurfaceAlpha128(info, 0xfbde);
2349 int width = info->d_width;
2350 int height = info->d_height;
2351 Uint16 *srcp = (Uint16 *)info->s_pixels;
2352 int srcskip = info->s_skip >> 1;
2353 Uint16 *dstp = (Uint16 *)info->d_pixels;
2354 int dstskip = info->d_skip >> 1;
2357 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2359 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2360 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2361 alpha >>= 3; /* downscale alpha to 5 bits */
2363 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2364 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2365 /* position alpha to allow for mullo and mulhi on diff channels
2366 to reduce the number of operations */
2367 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2369 /* Setup the 555 color channel masks */
2370 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2371 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2372 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2380 * shift out the middle component (green) to
2381 * the high 16 bits, and process all three RGB
2382 * components at the same time.
2384 s = (s | s << 16) & 0x03e07c1f;
2385 d = (d | d << 16) & 0x03e07c1f;
2386 d += (s - d) * alpha >> 5;
2388 *dstp++ = (Uint16)(d | d >> 16);
2393 * shift out the middle component (green) to
2394 * the high 16 bits, and process all three RGB
2395 * components at the same time.
2397 s = (s | s << 16) & 0x03e07c1f;
2398 d = (d | d << 16) & 0x03e07c1f;
2399 d += (s - d) * alpha >> 5;
2401 *dstp++ = (Uint16)(d | d >> 16);
2405 * shift out the middle component (green) to
2406 * the high 16 bits, and process all three RGB
2407 * components at the same time.
2409 s = (s | s << 16) & 0x03e07c1f;
2410 d = (d | d << 16) & 0x03e07c1f;
2411 d += (s - d) * alpha >> 5;
2413 *dstp++ = (Uint16)(d | d >> 16);
2415 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2416 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2418 /* red -- process the bits in place */
2420 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2423 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2426 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2427 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2428 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2429 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2430 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2432 mm_res = dst2; /* RED -> mm_res */
2434 /* green -- process the bits in place */
2436 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2439 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2442 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2443 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2444 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2445 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2447 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2450 src2 = src1; /* src -> src2 */
2451 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2453 dst2 = dst1; /* dst -> dst2 */
2454 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2457 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2458 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2459 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2460 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2461 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2463 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2465 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2476 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2478 /* fast RGB565->RGB565 blending with surface alpha */
2479 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2481 unsigned alpha = info->src->alpha;
2483 Blit16to16SurfaceAlpha128(info, 0xf7de);
2485 int width = info->d_width;
2486 int height = info->d_height;
2487 Uint16 *srcp = (Uint16 *)info->s_pixels;
2488 int srcskip = info->s_skip >> 1;
2489 Uint16 *dstp = (Uint16 *)info->d_pixels;
2490 int dstskip = info->d_skip >> 1;
2491 alpha >>= 3; /* downscale alpha to 5 bits */
2498 * shift out the middle component (green) to
2499 * the high 16 bits, and process all three RGB
2500 * components at the same time.
2502 s = (s | s << 16) & 0x07e0f81f;
2503 d = (d | d << 16) & 0x07e0f81f;
2504 d += (s - d) * alpha >> 5;
2506 *dstp++ = (Uint16)(d | d >> 16);
2514 /* fast RGB555->RGB555 blending with surface alpha */
2515 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2517 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2519 Blit16to16SurfaceAlpha128(info, 0xfbde);
2521 int width = info->d_width;
2522 int height = info->d_height;
2523 Uint16 *srcp = (Uint16 *)info->s_pixels;
2524 int srcskip = info->s_skip >> 1;
2525 Uint16 *dstp = (Uint16 *)info->d_pixels;
2526 int dstskip = info->d_skip >> 1;
2527 alpha >>= 3; /* downscale alpha to 5 bits */
2534 * shift out the middle component (green) to
2535 * the high 16 bits, and process all three RGB
2536 * components at the same time.
2538 s = (s | s << 16) & 0x03e07c1f;
2539 d = (d | d << 16) & 0x03e07c1f;
2540 d += (s - d) * alpha >> 5;
2542 *dstp++ = (Uint16)(d | d >> 16);
2550 /* fast ARGB8888->RGB565 blending with pixel alpha */
2551 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2553 int width = info->d_width;
2554 int height = info->d_height;
2555 Uint32 *srcp = (Uint32 *)info->s_pixels;
2556 int srcskip = info->s_skip >> 2;
2557 Uint16 *dstp = (Uint16 *)info->d_pixels;
2558 int dstskip = info->d_skip >> 1;
2563 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2564 /* FIXME: Here we special-case opaque alpha since the
2565 compositioning used (>>8 instead of /255) doesn't handle
2566 it correctly. Also special-case alpha=0 for speed?
2569 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2570 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2574 * convert source and destination to G0RAB65565
2575 * and blend all components at the same time
2577 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2579 d = (d | d << 16) & 0x07e0f81f;
2580 d += (s - d) * alpha >> 5;
2582 *dstp = (Uint16)(d | d >> 16);
2593 /* fast ARGB8888->RGB555 blending with pixel alpha */
2594 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2596 int width = info->d_width;
2597 int height = info->d_height;
2598 Uint32 *srcp = (Uint32 *)info->s_pixels;
2599 int srcskip = info->s_skip >> 2;
2600 Uint16 *dstp = (Uint16 *)info->d_pixels;
2601 int dstskip = info->d_skip >> 1;
2607 alpha = s >> 27; /* downscale alpha to 5 bits */
2608 /* FIXME: Here we special-case opaque alpha since the
2609 compositioning used (>>8 instead of /255) doesn't handle
2610 it correctly. Also special-case alpha=0 for speed?
2613 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2614 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2618 * convert source and destination to G0RAB65565
2619 * and blend all components at the same time
2621 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2623 d = (d | d << 16) & 0x03e07c1f;
2624 d += (s - d) * alpha >> 5;
2626 *dstp = (Uint16)(d | d >> 16);
2637 /* General (slow) N->N blending with per-surface alpha */
2638 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2640 int width = info->d_width;
2641 int height = info->d_height;
2642 Uint8 *src = info->s_pixels;
2643 int srcskip = info->s_skip;
2644 Uint8 *dst = info->d_pixels;
2645 int dstskip = info->d_skip;
2646 SDL_PixelFormat *srcfmt = info->src;
2647 SDL_PixelFormat *dstfmt = info->dst;
2648 int srcbpp = srcfmt->BytesPerPixel;
2649 int dstbpp = dstfmt->BytesPerPixel;
2650 unsigned sA = srcfmt->alpha;
2651 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2654 while ( height-- ) {
2664 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2665 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2666 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2667 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2678 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2679 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2681 int width = info->d_width;
2682 int height = info->d_height;
2683 Uint8 *src = info->s_pixels;
2684 int srcskip = info->s_skip;
2685 Uint8 *dst = info->d_pixels;
2686 int dstskip = info->d_skip;
2687 SDL_PixelFormat *srcfmt = info->src;
2688 SDL_PixelFormat *dstfmt = info->dst;
2689 Uint32 ckey = srcfmt->colorkey;
2690 int srcbpp = srcfmt->BytesPerPixel;
2691 int dstbpp = dstfmt->BytesPerPixel;
2692 unsigned sA = srcfmt->alpha;
2693 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2695 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2696 Uint16 *src16 = (Uint16 *)src;
2697 Uint16 *dst16 = (Uint16 *)dst;
2698 sA >>= 3; /* downscale alpha to 5 bits */
2699 while ( height-- ) {
2705 if(sA && s != ckey) {
2707 s = (s | s << 16) & 0x07e0f81f;
2708 d = (d | d << 16) & 0x07e0f81f;
2709 d += (s - d) * sA >> 5;
2711 *dst16 = (Uint16)(d | d >> 16);
2717 src16 += srcskip / 2;
2718 dst16 += dstskip / 2;
2723 while ( height-- ) {
2733 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2734 if(sA && Pixel != ckey) {
2735 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2736 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2737 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2738 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2749 /* General (slow) N->N blending with pixel alpha */
2750 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2752 int width = info->d_width;
2753 int height = info->d_height;
2754 Uint8 *src = info->s_pixels;
2755 int srcskip = info->s_skip;
2756 Uint8 *dst = info->d_pixels;
2757 int dstskip = info->d_skip;
2758 SDL_PixelFormat *srcfmt = info->src;
2759 SDL_PixelFormat *dstfmt = info->dst;
2764 /* Set up some basic variables */
2765 srcbpp = srcfmt->BytesPerPixel;
2766 dstbpp = dstfmt->BytesPerPixel;
2768 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2769 quite right. for <8bpp source alpha, it gets them very wrong
2771 It is unclear whether there is a good general solution that doesn't
2772 need a branch (or a divide). */
2773 while ( height-- ) {
2785 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2787 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2788 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2789 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/*
 * Pick the alpha blit routine for `surface`.
 *
 * The choice is driven by the source format (surface->format) and the
 * destination format (surface->map->dst->format):
 *   - source without an alpha mask: per-surface alpha, colorkeyed when
 *     SDL_SRCCOLORKEY is set on the surface;
 *   - source with an alpha mask: per-pixel alpha.
 * Within each group the destination BytesPerPixel and the channel masks
 * and shifts decide between the specialised routines and the general
 * BlitNto1* / BlitNtoN* fallbacks.
 *
 * blit_index is not referenced by any line visible here.
 *
 * NOTE(review): several adjacent return statements below are
 * alternatives selected by preprocessor conditionals and CPU feature
 * checks; not all of those guard lines are present in this listing, so
 * confirm the guards before changing the order of any candidate.
 */
2801 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2803 SDL_PixelFormat *sf = surface->format;
2804 SDL_PixelFormat *df = surface->map->dst->format;
/* No alpha mask in the source format: the blit uses per-surface alpha */
2806 if(sf->Amask == 0) {
/* Colorkeyed per-surface alpha blits */
2807 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2808 if(df->BytesPerPixel == 1)
2809 return BlitNto1SurfaceAlphaKey;
2811 #if SDL_ALTIVEC_BLITTERS
/* AltiVec candidate: 32-bit to 32-bit, destination not a hardware surface */
2812 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2813 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2814 return Blit32to32SurfaceAlphaKeyAltivec;
2817 return BlitNtoNSurfaceAlphaKey;
2819 /* Per-surface alpha blits */
2820 switch(df->BytesPerPixel) {
2822 return BlitNto1SurfaceAlpha;
/* 16-bit destination: fast paths need an identity mapping and a 565 or 555 green mask */
2825 if(surface->map->identity) {
2826 if(df->Gmask == 0x7e0)
2830 return Blit565to565SurfaceAlphaMMX;
2833 return Blit565to565SurfaceAlpha;
2835 else if(df->Gmask == 0x3e0)
2839 return Blit555to555SurfaceAlphaMMX;
2842 return Blit555to555SurfaceAlpha;
2845 return BlitNtoNSurfaceAlpha;
/* 32-bit source whose R, G and B masks equal the destination's */
2848 if(sf->Rmask == df->Rmask
2849 && sf->Gmask == df->Gmask
2850 && sf->Bmask == df->Bmask
2851 && sf->BytesPerPixel == 4)
2854 if(sf->Rshift % 8 == 0
2855 && sf->Gshift % 8 == 0
2856 && sf->Bshift % 8 == 0
2858 return BlitRGBtoRGBSurfaceAlphaMMX;
2861 if(sf->Rshift % 8 == 0
2862 && sf->Gshift % 8 == 0
2863 && sf->Bshift % 8 == 0)
2865 return BlitARGBtoXRGBalphaS_neon;
2868 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2870 #if SDL_ALTIVEC_BLITTERS
2871 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2872 && SDL_HasAltiVec())
2873 return BlitRGBtoRGBSurfaceAlphaAltivec;
2875 return BlitRGBtoRGBSurfaceAlpha;
/* Same green mask, red and blue masks swapped between source and destination */
2879 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2880 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2882 return BlitABGRtoXRGBalphaS_neon;
2885 #if SDL_ALTIVEC_BLITTERS
2886 if((sf->BytesPerPixel == 4) &&
2887 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2888 return Blit32to32SurfaceAlphaAltivec;
2891 return BlitNtoNSurfaceAlpha;
2895 return BlitNtoNSurfaceAlpha;
2899 /* Per-pixel alpha blits */
2900 switch(df->BytesPerPixel) {
2902 return BlitNto1PixelAlpha;
2905 #if SDL_ALTIVEC_BLITTERS
2906 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2907 df->Gmask == 0x7e0 &&
2908 df->Bmask == 0x1f && SDL_HasAltiVec())
2909 return Blit32to565PixelAlphaAltivec;
/* 32-bit source with alpha in the top byte onto a 565 destination:
   the red/blue mask comparison picks the same-order or swapped-order routine */
2913 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2914 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2915 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2916 return BlitARGBtoRGB565alpha_neon;
2918 return BlitABGRtoRGB565alpha_neon;
/* Portable 32-bit -> 16-bit routines: the channel in the low source byte
   must also be the low 5-bit channel of the destination */
2922 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2923 && sf->Gmask == 0xff00
2924 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2925 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2926 if(df->Gmask == 0x7e0)
2927 return BlitARGBto565PixelAlpha;
2928 else if(df->Gmask == 0x3e0)
2929 return BlitARGBto555PixelAlpha;
2931 return BlitNtoNPixelAlpha;
/* 32-bit source whose R, G and B masks equal the destination's */
2934 if(sf->Rmask == df->Rmask
2935 && sf->Gmask == df->Gmask
2936 && sf->Bmask == df->Bmask
2937 && sf->BytesPerPixel == 4)
2940 if(sf->Rshift % 8 == 0
2941 && sf->Gshift % 8 == 0
2942 && sf->Bshift % 8 == 0
2943 && sf->Ashift % 8 == 0
2947 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2949 return BlitRGBtoRGBPixelAlphaMMX;
2953 if(sf->Rshift % 8 == 0
2954 && sf->Gshift % 8 == 0
2955 && sf->Bshift % 8 == 0
2956 && sf->Ashift % 8 == 0)
2958 return BlitARGBtoXRGBalpha_neon;
2961 if(sf->Amask == 0xff000000)
2963 #if SDL_ALTIVEC_BLITTERS
2964 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2965 && SDL_HasAltiVec())
2966 return BlitRGBtoRGBPixelAlphaAltivec;
2968 return BlitRGBtoRGBPixelAlpha;
/* Same green mask, red and blue masks swapped, alpha in the top byte */
2972 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2973 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2974 && sf->Amask == 0xff000000)
2976 return BlitABGRtoXRGBalpha_neon;
2979 #if SDL_ALTIVEC_BLITTERS
2980 if (sf->Amask && sf->BytesPerPixel == 4 &&
2981 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2982 return Blit32to32PixelAlphaAltivec;
2985 return BlitNtoNPixelAlpha;
2989 return BlitNtoNPixelAlpha;