2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "SDL_config.h"
24 #include "SDL_video.h"
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
33 #if SDL_ASSEMBLY_ROUTINES
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
37 # elif defined(_MSC_VER) && defined(_M_IX86)
38 # if (_MSC_VER <= 1200)
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
51 #endif /* SDL_ASSEMBLY_ROUTINES */
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
62 /* Functions to perform alpha blended blitting */
/*
 * BlitNto1SurfaceAlpha: blends a source surface of srcfmt->BytesPerPixel
 * bytes per pixel onto an 8-bit palettized destination, using the single
 * per-surface alpha value srcfmt->alpha for every pixel.
 *
 * info: blit description; the function reads the blit size (d_width,
 *       d_height), the pixel pointers and row skips of both surfaces, both
 *       pixel formats and the optional palette map info->table.
 *
 * NOTE(review): the embedded original line numbers skip, so some lines of
 * this function (braces, loop scaffolding, the rest of the packing
 * expression) are not in this listing; the notes cover only what is shown.
 */
64 /* N->1 blending with per-surface alpha */
65 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
67 int width = info->d_width;
68 int height = info->d_height;
69 Uint8 *src = info->s_pixels;
70 int srcskip = info->s_skip;
71 Uint8 *dst = info->d_pixels;
72 int dstskip = info->d_skip;
73 Uint8 *palmap = info->table;
74 SDL_PixelFormat *srcfmt = info->src;
75 SDL_PixelFormat *dstfmt = info->dst;
76 int srcbpp = srcfmt->BytesPerPixel;
78 const unsigned A = srcfmt->alpha;
/* Split the source pixel into sR/sG/sB, read the current destination colour
   from the palette entry that *dst indexes, then blend with the alpha A. */
90 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
91 dR = dstfmt->palette->colors[*dst].r;
92 dG = dstfmt->palette->colors[*dst].g;
93 dB = dstfmt->palette->colors[*dst].b;
94 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
98 /* Pack RGB into 8bit pixel */
/* Without a palette map the packed value is stored directly; with one it is
   used as an index into palmap. Red contributes its top 3 bits at bits 7..5;
   the remaining terms of the expression are not shown in this listing. */
99 if ( palmap == NULL ) {
100 *dst =((dR>>5)<<(3+2))|
104 *dst = palmap[((dR>>5)<<(3+2))|
/*
 * BlitNto1PixelAlpha: blends a source surface of srcfmt->BytesPerPixel bytes
 * per pixel onto an 8-bit palettized destination, taking the alpha value
 * from each source pixel (sA) instead of from the surface.
 *
 * info: blit description; the function reads the blit size, the pixel
 *       pointers and row skips of both surfaces, both pixel formats and the
 *       optional palette map info->table.
 *
 * NOTE(review): lines of this function are missing from the listing (the
 * embedded line numbers skip); the notes cover only the visible statements.
 */
117 /* N->1 blending with pixel alpha */
118 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
120 int width = info->d_width;
121 int height = info->d_height;
122 Uint8 *src = info->s_pixels;
123 int srcskip = info->s_skip;
124 Uint8 *dst = info->d_pixels;
125 int dstskip = info->d_skip;
126 Uint8 *palmap = info->table;
127 SDL_PixelFormat *srcfmt = info->src;
128 SDL_PixelFormat *dstfmt = info->dst;
129 int srcbpp = srcfmt->BytesPerPixel;
131 /* FIXME: fix alpha bit field expansion here too? */
/* Split the source pixel into sR/sG/sB/sA, read the destination colour from
   the palette entry that *dst indexes, then blend using the pixel's own sA. */
143 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
144 dR = dstfmt->palette->colors[*dst].r;
145 dG = dstfmt->palette->colors[*dst].g;
146 dB = dstfmt->palette->colors[*dst].b;
147 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
151 /* Pack RGB into 8bit pixel */
/* Stored directly when there is no palette map, otherwise looked up through
   palmap. Only the red term of the packing expression is visible here. */
152 if ( palmap == NULL ) {
153 *dst =((dR>>5)<<(3+2))|
157 *dst = palmap[((dR>>5)<<(3+2))|
/*
 * BlitNto1SurfaceAlphaKey: like the per-surface-alpha N->1 blend, but a
 * source pixel is blended only when its value differs from the source colour
 * key srcfmt->colorkey. The destination is 8-bit palettized and the alpha is
 * the per-surface value srcfmt->alpha.
 *
 * info: blit description; the function reads the blit size, the pixel
 *       pointers and row skips of both surfaces, both pixel formats and the
 *       optional palette map info->table.
 *
 * NOTE(review): lines of this function are missing from the listing (the
 * embedded line numbers skip); the notes cover only the visible statements.
 */
170 /* colorkeyed N->1 blending with per-surface alpha */
171 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
173 int width = info->d_width;
174 int height = info->d_height;
175 Uint8 *src = info->s_pixels;
176 int srcskip = info->s_skip;
177 Uint8 *dst = info->d_pixels;
178 int dstskip = info->d_skip;
179 Uint8 *palmap = info->table;
180 SDL_PixelFormat *srcfmt = info->src;
181 SDL_PixelFormat *dstfmt = info->dst;
182 int srcbpp = srcfmt->BytesPerPixel;
183 Uint32 ckey = srcfmt->colorkey;
185 const int A = srcfmt->alpha;
/* Decode the source pixel first: Pixel receives the raw value that is then
   compared with the colour key. */
197 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* Only pixels that do not match the key are blended with the palette colour
   currently at *dst. */
198 if ( Pixel != ckey ) {
199 dR = dstfmt->palette->colors[*dst].r;
200 dG = dstfmt->palette->colors[*dst].g;
201 dB = dstfmt->palette->colors[*dst].b;
202 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
206 /* Pack RGB into 8bit pixel */
/* Stored directly when there is no palette map, otherwise looked up through
   palmap. Only the red term of the packing expression is visible here. */
207 if ( palmap == NULL ) {
208 *dst =((dR>>5)<<(3+2))|
212 *dst = palmap[((dR>>5)<<(3+2))|
/*
 * BlitRGBtoRGBSurfaceAlpha128MMX (copy that precedes the "End GCC_ASMBLIT"
 * marker): 32-bit to 32-bit blend for the special case of surface alpha 128,
 * i.e. a per-channel average of source and destination, with the destination
 * alpha mask (info->dst->Amask) OR-ed into every result pixel.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (converted from bytes to 32-bit units by
 *       the >> 2) and the destination alpha mask.
 *
 * NOTE(review): lines are missing from this listing (for example the
 * declaration of `load` and the loop scaffolding), so which pixels take the
 * scalar form and which take the two-pixel MMX form is not visible here.
 */
227 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
228 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
230 int width = info->d_width;
231 int height = info->d_height;
232 Uint32 *srcp = (Uint32 *)info->s_pixels;
233 int srcskip = info->s_skip >> 2;
234 Uint32 *dstp = (Uint32 *)info->d_pixels;
235 int dstskip = info->d_skip >> 2;
236 Uint32 dalpha = info->dst->Amask;
/* Constants: mm4 masks off the low bit of each colour byte, mm3 keeps only
   those low bits, mm7 holds the destination alpha mask in both halves. */
239 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
240 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
241 load = 0x0001010100010101ULL;/* !alpha128 mask */
242 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
243 movd_m2r(dalpha, mm7); /* dst alpha mask */
244 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
/* Scalar single-pixel form. With the low bit of every channel cleared, the
   sum shifted right by one cannot leak a bit from one channel into the next;
   (s & d & 0x00010101) adds back the unit lost when both low bits were set. */
250 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
251 + (s & d & 0x00010101)) | dalpha;
/* The same computation on two pixels at once in 64-bit MMX registers. */
253 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
254 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
256 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
257 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
259 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
260 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
261 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
262 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
263 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
264 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
265 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
267 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
268 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
/*
 * BlitRGBtoRGBSurfaceAlphaMMX (copy that precedes the "End GCC_ASMBLIT"
 * marker): 32-bit to 32-bit blend with the per-surface alpha
 * info->src->alpha.
 *
 * When alpha is 128 and the destination R, G and B masks together equal
 * 0x00FFFFFF, the whole blit is handed to BlitRGBtoRGBSurfaceAlpha128MMX.
 * Otherwise each colour channel is computed in a 16-bit lane as
 * dst + (((src - dst) * alpha) >> 8), and the destination alpha mask
 * (df->Amask) is OR-ed into every result pixel.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and both
 *       pixel formats.
 *
 * NOTE(review): lines are missing from this listing (braces, the else and
 * the loop scaffolding), so the choice between the one-pixel and two-pixel
 * paths is not visible here.
 */
278 /* fast RGB888->(A)RGB888 blending with surface alpha */
279 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
281 SDL_PixelFormat* df = info->dst;
282 unsigned alpha = info->src->alpha;
284 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
285 /* only call a128 version when R,G,B occupy lower bits */
286 BlitRGBtoRGBSurfaceAlpha128MMX(info);
288 int width = info->d_width;
289 int height = info->d_height;
290 Uint32 *srcp = (Uint32 *)info->s_pixels;
291 int srcskip = info->s_skip >> 2;
292 Uint32 *dstp = (Uint32 *)info->d_pixels;
293 int dstskip = info->d_skip >> 2;
295 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
296 /* form the alpha mult */
297 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
298 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
299 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
/* `alpha` is reused from here on as scratch: it now holds a byte mask of the
   three colour channels, built from the destination shifts. Expanding that
   mask to 16-bit lanes and ANDing it into mm4 zeroes the multiplier of the
   lane that is not a colour channel. */
300 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
301 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
302 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
303 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
304 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
305 movd_m2r(df->Amask, mm7); /* dst alpha mask */
306 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
310 /* One Pixel Blend */
311 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
312 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
313 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
314 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
316 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
317 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
318 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
319 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
321 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
322 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
323 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
327 /* Two Pixels Blend */
328 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
329 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
330 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
331 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
333 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
334 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
335 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
336 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
338 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
339 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
340 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
341 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
343 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
344 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
345 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
346 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
348 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
349 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
351 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
/*
 * BlitRGBtoRGBPixelAlphaMMX (copy that precedes the "End GCC_ASMBLIT"
 * marker): 32-bit to 32-bit blend that takes the alpha from each source
 * pixel (the bits selected by sf->Amask).
 *
 * A fully opaque source pixel (alpha bits == amask) has its colour channels
 * copied while the destination keeps its own alpha. Other pixels shown here
 * are blended per channel as dst + (((src - dst) * alpha) >> 8), with the
 * multiplier of the alpha lane zeroed so the destination alpha survives.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and the
 *       source pixel format.
 *
 * NOTE(review): lines are missing from this listing, including the bodies of
 * the inline asm statements, the loop scaffolding and the branch that
 * precedes the `else if` below; the notes cover only what is shown.
 */
363 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
364 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
366 int width = info->d_width;
367 int height = info->d_height;
368 Uint32 *srcp = (Uint32 *)info->s_pixels;
369 int srcskip = info->s_skip >> 2;
370 Uint32 *dstp = (Uint32 *)info->d_pixels;
371 int dstskip = info->d_skip >> 2;
372 SDL_PixelFormat* sf = info->src;
373 Uint32 amask = sf->Amask;
375 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
376 /* form multiplication mask */
377 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
378 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
379 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
380 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
381 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
382 /* form channel masks */
383 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
384 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
385 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
386 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
387 /* get alpha channel shift */
388 __asm__ __volatile__ (
390 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
/* Isolate the alpha bits of the current source pixel; the comparisons that
   follow are made against the still-shifted value, not against 0..255. */
394 Uint32 alpha = *srcp & amask;
395 /* FIXME: Here we special-case opaque alpha since the
396 compositing used (>>8 instead of /255) doesn't handle
397 it correctly. Also special-case alpha=0 for speed?
401 } else if(alpha == amask) {
402 /* opaque alpha -- copy RGB, keep dst alpha */
403 /* using MMX here to free up regular registers for other things */
404 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
405 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
406 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
407 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
408 por_r2r(mm1, mm2); /* src | dst -> mm2 */
409 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
/* General case: widen both pixels to four 16-bit lanes and blend. */
411 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
412 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
414 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
415 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
417 __asm__ __volatile__ (
419 : : "r" (alpha) ); /* 0000A000 -> mm4 */
420 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
421 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
422 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
423 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
426 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
427 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
428 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
429 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
431 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
432 movd_r2m(mm2, *dstp);/* mm2 -> dst */
442 /* End GCC_ASMBLIT */
/*
 * BlitRGBtoRGBSurfaceAlpha128MMX (copy that precedes the "End MSVC_ASMBLIT"
 * marker, written with __m64 intrinsics): 32-bit to 32-bit blend for surface
 * alpha 128, i.e. a per-channel average of source and destination, with the
 * destination alpha mask (info->dst->Amask) OR-ed into every result pixel.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and the
 *       destination alpha mask.
 *
 * NOTE(review): lines are missing from this listing (for example the
 * declaration of `n` and the outer loop), so the notes cover only the
 * visible statements.
 */
445 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
446 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
448 int width = info->d_width;
449 int height = info->d_height;
450 Uint32 *srcp = (Uint32 *)info->s_pixels;
451 int srcskip = info->s_skip >> 2;
452 Uint32 *dstp = (Uint32 *)info->d_pixels;
453 int dstskip = info->d_skip >> 2;
454 Uint32 dalpha = info->dst->Amask;
456 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
/* hmask drops the low bit of each colour byte, lmask keeps only those low
   bits; both, and the destination alpha mask, are duplicated for two pixels. */
458 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
459 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
460 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
/* Scalar single-pixel form. With the low bit of every channel cleared, the
   halved sum cannot leak a bit from one channel into the next;
   (s & d & 0x00010101) adds back the unit lost when both low bits were set. */
467 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
468 + (s & d & 0x00010101)) | dalpha;
/* Each iteration handles two pixels (n is halved before the loop starts). */
472 for (n >>= 1; n > 0; --n) {
473 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
474 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
476 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
477 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
479 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
480 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
481 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
482 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
484 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
485 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
486 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
487 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
489 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
/*
 * BlitRGBtoRGBSurfaceAlphaMMX (copy that precedes the "End MSVC_ASMBLIT"
 * marker, written with __m64 intrinsics): 32-bit to 32-bit blend with the
 * per-surface alpha info->src->alpha.
 *
 * When alpha is 128 and the destination R, G and B masks together equal
 * 0x00FFFFFF, the whole blit is handed to BlitRGBtoRGBSurfaceAlpha128MMX.
 * Otherwise each colour channel is computed in a 16-bit lane as
 * dst + (((src - dst) * alpha) >> 8), and the destination alpha mask
 * (df->Amask) is OR-ed into every result pixel.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and both
 *       pixel formats.
 *
 * NOTE(review): lines are missing from this listing (braces, the else, the
 * declarations of `amult` and `n`, the outer loop); the notes cover only the
 * visible statements.
 */
500 /* fast RGB888->(A)RGB888 blending with surface alpha */
501 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
503 SDL_PixelFormat* df = info->dst;
504 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
505 unsigned alpha = info->src->alpha;
/* The test spells the mask OR out again instead of reading chanmask;
   chanmask itself is overwritten further down from the channel shifts. */
507 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
508 /* only call a128 version when R,G,B occupy lower bits */
509 BlitRGBtoRGBSurfaceAlpha128MMX(info);
511 int width = info->d_width;
512 int height = info->d_height;
513 Uint32 *srcp = (Uint32 *)info->s_pixels;
514 int srcskip = info->s_skip >> 2;
515 Uint32 *dstp = (Uint32 *)info->d_pixels;
516 int dstskip = info->d_skip >> 2;
517 Uint32 dalpha = df->Amask;
520 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
522 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
523 /* form the alpha mult */
/* Replicate alpha into all four bytes, keep it only in the bytes that are
   colour channels, then widen each byte to a 16-bit multiplier lane. */
524 amult = alpha | (alpha << 8);
525 amult = amult | (amult << 16);
526 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
527 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
528 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
529 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
530 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
535 /* One Pixel Blend */
536 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
537 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
539 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
540 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
542 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
543 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
544 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
545 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
547 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
548 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
549 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
/* Each iteration handles two pixels (n is halved before the loop starts). */
557 for (n >>= 1; n > 0; --n) {
558 /* Two Pixels Blend */
559 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
560 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
561 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
562 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
564 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
565 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
566 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
567 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
569 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
570 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
571 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
572 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
574 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
575 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
576 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
577 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
579 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
580 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
582 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
/*
 * BlitRGBtoRGBPixelAlphaMMX (copy that precedes the "End MSVC_ASMBLIT"
 * marker, written with __m64 intrinsics): 32-bit to 32-bit blend that takes
 * the alpha from each source pixel (the bits selected by sf->Amask).
 *
 * A fully opaque source pixel (alpha bits == amask) has its colour channels
 * copied while the destination keeps its own alpha. Other pixels shown here
 * are blended per channel as dst + (((src - dst) * alpha) >> 8), with the
 * multiplier of the alpha lane zeroed so the destination alpha survives.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and the
 *       source pixel format.
 *
 * NOTE(review): lines are missing from this listing (the declaration of
 * `multmask`, the loop scaffolding and the branch that precedes the
 * `else if` below); the notes cover only the visible statements.
 */
594 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
595 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
597 int width = info->d_width;
598 int height = info->d_height;
599 Uint32 *srcp = (Uint32 *)info->s_pixels;
600 int srcskip = info->s_skip >> 2;
601 Uint32 *dstp = (Uint32 *)info->d_pixels;
602 int dstskip = info->d_skip >> 2;
603 SDL_PixelFormat* sf = info->src;
604 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
605 Uint32 amask = sf->Amask;
606 Uint32 ashift = sf->Ashift;
609 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
611 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* multmask clears the 16-bit lane that the alpha byte lands in once a pixel
   is widened to four 16-bit lanes (ashift * 2 because every byte becomes 16
   bits wide). It is ANDed into mm_alpha below so that lane multiplies by 0. */
612 multmask = ~(0xFFFFi64 << (ashift * 2));
613 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
/* Isolate the alpha bits of the current source pixel; the comparison below
   is made against the still-shifted value, not against 0..255. */
617 Uint32 alpha = *srcp & amask;
620 } else if (alpha == amask) {
621 /* opaque alpha -- copy RGB, keep dst alpha */
622 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
/* General case: widen both pixels to four 16-bit lanes and blend. */
624 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
625 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
627 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
628 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
630 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
631 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
632 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
633 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
634 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
637 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
638 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
639 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
640 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
641 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
643 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
653 /* End MSVC_ASMBLIT */
655 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
657 #if SDL_ALTIVEC_BLITTERS
659 #pragma altivec_model on
/* Vector literal helpers. On Mac OS X with a GCC older than 4 the element
   list is written in parentheses; the second pair of definitions below uses
   braces instead.
   NOTE(review): the #else / #endif lines that separate the two pairs are not
   present in this listing. */
666 #if (defined(__MACOSX__) && (__GNUC__ < 4))
667 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
668 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
669 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
670 (vector unsigned short) ( a,b,c,d,e,f,g,h )
/* Brace-initialised form of the same two helpers. */
672 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
673 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
674 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
675 (vector unsigned short) { a,b,c,d,e,f,g,h }
/* Low four address bits of x: non-zero when x is not 16-byte aligned. */
678 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug helper: prints the vector v as four 32-bit hex words after msg.
   NOTE(review): the closing line of this macro is not in the listing. */
679 #define VECPRINT(msg, v) do { \
680 vector unsigned int tmpvec = (vector unsigned int)(v); \
681 unsigned int *vp = (unsigned int *)&tmpvec; \
682 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
685 /* the permutation vector that takes the high bytes out of all the appropriate shorts
686 (vector unsigned char)(
687 0x00, 0x10, 0x02, 0x12,
688 0x04, 0x14, 0x06, 0x16,
689 0x08, 0x18, 0x0A, 0x1A,
690 0x0C, 0x1C, 0x0E, 0x1E );
/* Builds the merge permute vector at run time: vec_lvsl(0, NULL) yields the
   byte indices 0x00..0x0F, and adding 0x000F to every 16-bit element turns
   them into 0x00,0x10,0x02,0x12,... i.e. alternating bytes from the first
   and second vec_perm operand. */
692 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 12 + 12 = 24 in every 32-bit element (used as a shift count below). */
693 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones shifted left by 24 in every 32-bit element: 0xFF000000 per pixel. */
694 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector for realigning data loaded from src, chosen by whether src
   is 16-byte aligned.
   NOTE(review): the middle line of this conditional is missing from the
   listing; only the condition and the final alternative are visible. */
695 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
697 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
/*
 * VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16)
 * Blends 16 source bytes (vs) over 16 destination bytes (vd) and leaves the
 * result in vd. For each byte it forms x = vs * valpha + vd * ~valpha as a
 * 16-bit product (even and odd bytes are multiplied separately), then
 * computes t = x + 1; t = t + (t >> 8); and takes the high byte of t through
 * mergePermute, a shift-based stand-in for dividing by 255.
 *   valpha        per-byte alpha; its bitwise complement is used as 255-alpha
 *   mergePermute  permute vector that picks the high byte of every product
 *   v1_16, v8_16  vectors holding 1 and 8 in each 16-bit element
 * NOTE(review): the closing line of the macro is not in this listing.
 */
700 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
701 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
702 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
703 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
704 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
705 /* valpha2 is 255-alpha */ \
706 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
707 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
708 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
709 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
710 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
711 /* add source and dest */ \
712 vtemp1 = vec_add(vtemp1, vtemp3); \
713 vtemp2 = vec_add(vtemp2, vtemp4); \
714 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
715 vtemp1 = vec_add(vtemp1, v1_16); \
716 vtemp3 = vec_sr(vtemp1, v8_16); \
717 vtemp1 = vec_add(vtemp1, vtemp3); \
718 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
719 vtemp2 = vec_add(vtemp2, v1_16); \
720 vtemp4 = vec_sr(vtemp2, v8_16); \
721 vtemp2 = vec_add(vtemp2, vtemp4); \
722 /* (>>8) and get ARGBARGBARGBARGB */ \
723 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
/*
 * calc_swizzle32: builds the byte-permute vector that rearranges 32-bit
 * pixels from the channel layout of srcfmt into the channel layout of
 * dstfmt.
 *
 * srcfmt, dstfmt: pixel formats. Callers in this file pass NULL for one of
 *   them; default_pixel_format (masks 0x00FF0000 / 0x0000FF00 / 0x000000FF /
 *   0xFF000000) is assigned in its place.
 *
 * NOTE(review): the tests guarding the two default assignments (presumably
 * NULL checks), the declaration of amask, the condition choosing between the
 * two amask assignments and the return statement are on lines missing from
 * this listing.
 */
726 /* Calculate the permute vector used for 32->32 swizzling */
727 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
728 const SDL_PixelFormat *dstfmt)
731 * We have to assume that the bits that aren't used by other
732 * colors is alpha, and it's one complete byte, since some formats
733 * leave alpha with a zero mask, but we should still swizzle the bits.
736 const static struct SDL_PixelFormat default_pixel_format = {
740 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
743 srcfmt = &default_pixel_format;
746 dstfmt = &default_pixel_format;
748 const vector unsigned char plus = VECUINT8_LITERAL
749 ( 0x00, 0x00, 0x00, 0x00,
750 0x04, 0x04, 0x04, 0x04,
751 0x08, 0x08, 0x08, 0x08,
752 0x0C, 0x0C, 0x0C, 0x0C );
753 vector unsigned char vswiz;
754 vector unsigned int srcvec;
/* RESHIFT turns a channel's bit shift (0, 8, 16, 24) into a byte index
   counted from the most significant byte (3, 2, 1, 0). Each index is then
   moved to the byte position the destination format uses for that channel. */
755 #define RESHIFT(X) (3 - ((X) >> 3))
756 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
757 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
758 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
760 /* Use zero for alpha if either surface doesn't have alpha */
762 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
764 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
/* The four byte indices for one pixel are written into element 0 of srcvec,
   splatted to all four elements and offset by plus (0, 4, 8, 12) so that each
   of the four pixels in a vector addresses its own bytes. */
767 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
768 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
/*
 * Blit32to565PixelAlphaAltivec: blends 32-bit source pixels that carry
 * per-pixel alpha onto a 16-bit 5-6-5 destination. Pixels are blended one at
 * a time by the scalar ONE_PIXEL_BLEND macro until dst is 16-byte aligned,
 * then eight at a time with AltiVec, then the remaining (width % 8) pixels
 * go through the scalar macro again.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips and the source pixel format.
 *
 * NOTE(review): many lines of this function are missing from the listing
 * (braces, comment delimiters, the row loop, pointer advances, parts of
 * ONE_PIXEL_BLEND); the notes cover only the visible statements.
 */
772 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
774 int height = info->d_height;
775 Uint8 *src = (Uint8 *)info->s_pixels;
776 int srcskip = info->s_skip;
777 Uint8 *dst = (Uint8 *)info->d_pixels;
778 int dstskip = info->d_skip;
779 SDL_PixelFormat *srcfmt = info->src;
781 vector unsigned char v0 = vec_splat_u8(0);
782 vector unsigned short v8_16 = vec_splat_u16(8);
783 vector unsigned short v1_16 = vec_splat_u16(1);
784 vector unsigned short v2_16 = vec_splat_u16(2);
785 vector unsigned short v3_16 = vec_splat_u16(3);
786 vector unsigned int v8_32 = vec_splat_u32(8);
787 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
788 vector unsigned short v3f = VECUINT16_LITERAL(
789 0x003f, 0x003f, 0x003f, 0x003f,
790 0x003f, 0x003f, 0x003f, 0x003f);
791 vector unsigned short vfc = VECUINT16_LITERAL(
792 0x00fc, 0x00fc, 0x00fc, 0x00fc,
793 0x00fc, 0x00fc, 0x00fc, 0x00fc);
796 0x10 - 0x1f is the alpha
797 0x00 - 0x0e evens are the red
798 0x01 - 0x0f odds are zero
800 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
801 0x10, 0x00, 0x01, 0x01,
802 0x10, 0x02, 0x01, 0x01,
803 0x10, 0x04, 0x01, 0x01,
804 0x10, 0x06, 0x01, 0x01
806 vector unsigned char vredalpha2 = (vector unsigned char)(
807 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
810 0x00 - 0x0f is ARxx ARxx ARxx ARxx
811 0x11 - 0x0f odds are blue
813 vector unsigned char vblue1 = VECUINT8_LITERAL(
814 0x00, 0x01, 0x02, 0x11,
815 0x04, 0x05, 0x06, 0x13,
816 0x08, 0x09, 0x0a, 0x15,
817 0x0c, 0x0d, 0x0e, 0x17
819 vector unsigned char vblue2 = (vector unsigned char)(
820 vec_add((vector unsigned int)vblue1, v8_32)
823 0x00 - 0x0f is ARxB ARxB ARxB ARxB
824 0x10 - 0x0e evens are green
826 vector unsigned char vgreen1 = VECUINT8_LITERAL(
827 0x00, 0x01, 0x10, 0x03,
828 0x04, 0x05, 0x12, 0x07,
829 0x08, 0x09, 0x14, 0x0b,
830 0x0c, 0x0d, 0x16, 0x0f
832 vector unsigned char vgreen2 = (vector unsigned char)(
833 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
835 vector unsigned char vgmerge = VECUINT8_LITERAL(
836 0x00, 0x02, 0x00, 0x06,
837 0x00, 0x0a, 0x00, 0x0e,
838 0x00, 0x12, 0x00, 0x16,
839 0x00, 0x1a, 0x00, 0x1e);
840 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
841 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
/* Byte indices 0,0,0,0,4,4,4,4,8,...: copies the first byte of every 32-bit
   pixel across that whole pixel. */
842 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOTE(review): vec_splat_u8(-7) fills every byte with 0xF9, so after the
   shift left by 8 each 16-bit element is 0xF900, not the 0xF800 the name
   suggests (bit 8 lies outside the 5 red bits of a 565 pixel). Confirm
   whether -8 was intended. */
844 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
845 vf800 = vec_sl(vf800, vec_splat_u16(8));
849 vector unsigned char valigner;
850 vector unsigned char vsrc;
851 vector unsigned char voverflow;
852 int width = info->d_width;
/* Scalar blend of one pixel: the 565 destination is expanded to 8-bit
   channels, blended with the source pixel's own alpha, and packed back. */
854 #define ONE_PIXEL_BLEND(condition, widthvar) \
855 while (condition) { \
857 unsigned sR, sG, sB, dR, dG, dB, sA; \
858 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
860 unsigned short dstpixel = *((unsigned short *)dst); \
861 dR = (dstpixel >> 8) & 0xf8; \
862 dG = (dstpixel >> 3) & 0xfc; \
863 dB = (dstpixel << 3) & 0xf8; \
864 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
865 *((unsigned short *)dst) = ( \
866 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
/* Scalar pixels while dst is not 16-byte aligned and pixels remain. */
873 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
/* Pixels left over after whole groups of eight; handled by the scalar macro
   after the vector work. */
874 extrawidth = (width % 8);
875 valigner = VEC_ALIGNER(src);
876 vsrc = (vector unsigned char)vec_ld(0, src);
879 vector unsigned char valpha;
880 vector unsigned char vsrc1, vsrc2;
881 vector unsigned char vdst1, vdst2;
882 vector unsigned short vR, vG, vB;
883 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
885 /* Load 8 pixels from src as ARGB */
886 voverflow = (vector unsigned char)vec_ld(15, src);
887 vsrc = vec_perm(vsrc, voverflow, valigner);
888 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
890 vsrc = (vector unsigned char)vec_ld(15, src);
891 voverflow = vec_perm(voverflow, vsrc, valigner);
892 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
895 /* Load 8 pixels from dst as XRGB */
896 voverflow = vec_ld(0, dst);
897 vR = vec_and((vector unsigned short)voverflow, vf800);
898 vB = vec_sl((vector unsigned short)voverflow, v3_16);
899 vG = vec_sl(vB, v2_16);
900 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
901 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
902 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
903 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
904 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
905 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
907 /* Alpha blend 8 pixels as ARGB */
908 valpha = vec_perm(vsrc1, v0, valphaPermute);
909 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
910 valpha = vec_perm(vsrc2, v0, valphaPermute);
911 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
913 /* Convert 8 pixels to 565 */
914 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
915 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
916 vgpixel = vec_and(vgpixel, vfc);
917 vgpixel = vec_sl(vgpixel, v3_16);
918 vrpixel = vec_sl(vpixel, v1_16);
919 vrpixel = vec_and(vrpixel, vf800);
920 vbpixel = vec_and(vpixel, v3f);
921 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
922 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
925 vec_st(vdst1, 0, dst);
/* Scalar blend of the leftover pixels. */
930 ONE_PIXEL_BLEND((extrawidth), extrawidth);
931 #undef ONE_PIXEL_BLEND
/*
 * Blit32to32SurfaceAlphaKeyAltivec: colour-keyed 32-bit to 32-bit blend with
 * the per-surface alpha of the source. Source pixels equal to the colour key
 * leave the destination unchanged; the others are blended and written with
 * the alpha channel set to full. Pixels are handled by the scalar
 * ONE_PIXEL_BLEND macro until dstp is 16-byte aligned, then four at a time
 * with AltiVec, then the remaining (width % 4) pixels by the scalar macro.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and both
 *       pixel formats, including the source alpha and colour key.
 *
 * NOTE(review): lines are missing from this listing (braces, the row loop,
 * pointer advances, parts of ONE_PIXEL_BLEND); the notes cover only the
 * visible statements.
 */
937 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
939 unsigned alpha = info->src->alpha;
940 int height = info->d_height;
941 Uint32 *srcp = (Uint32 *)info->s_pixels;
942 int srcskip = info->s_skip >> 2;
943 Uint32 *dstp = (Uint32 *)info->d_pixels;
944 int dstskip = info->d_skip >> 2;
945 SDL_PixelFormat *srcfmt = info->src;
946 SDL_PixelFormat *dstfmt = info->dst;
/* sA reads the same field as `alpha` above (srcfmt is info->src); sA feeds
   the scalar path and alpha feeds the vector path. */
947 unsigned sA = srcfmt->alpha;
948 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
949 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
950 Uint32 ckey = info->src->colorkey;
951 vector unsigned char mergePermute;
952 vector unsigned char vsrcPermute;
953 vector unsigned char vdstPermute;
954 vector unsigned char vsdstPermute;
955 vector unsigned char valpha;
956 vector unsigned char valphamask;
957 vector unsigned char vbits;
958 vector unsigned char v0;
959 vector unsigned short v1;
960 vector unsigned short v8;
961 vector unsigned int vckey;
962 vector unsigned int vrgbmask;
964 mergePermute = VEC_MERGE_PERMUTE();
965 v0 = vec_splat_u8(0);
966 v1 = vec_splat_u16(1);
967 v8 = vec_splat_u16(8);
969 /* set the alpha to 255 on the destination surf */
970 valphamask = VEC_ALPHA_MASK();
/* Permutes: source layout -> working layout, working layout -> destination
   layout, and destination layout -> working layout. */
972 vsrcPermute = calc_swizzle32(srcfmt, NULL);
973 vdstPermute = calc_swizzle32(NULL, dstfmt);
974 vsdstPermute = calc_swizzle32(dstfmt, NULL);
976 /* set a vector full of alpha and 255-alpha */
977 ((unsigned char *)&valpha)[0] = alpha;
978 valpha = vec_splat(valpha, 0);
979 vbits = (vector unsigned char)vec_splat_s8(-1);
/* Broadcast the colour key and the RGB mask to all four 32-bit elements. */
982 ((unsigned int *)(char*)&vckey)[0] = ckey;
983 vckey = vec_splat(vckey, 0);
984 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
985 vrgbmask = vec_splat(vrgbmask, 0);
988 int width = info->d_width;
/* Scalar blend of one pixel: skipped when the surface alpha is zero or the
   pixel equals the colour key. */
989 #define ONE_PIXEL_BLEND(condition, widthvar) \
990 while (condition) { \
992 unsigned sR, sG, sB, dR, dG, dB; \
993 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
994 if(sA && Pixel != ckey) { \
995 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
996 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
997 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
998 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
/* Scalar pixels while dstp is not 16-byte aligned and pixels remain. */
1004 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* Pixels left over after whole groups of four are excluded from the vector
   work and blended by the scalar macro afterwards. */
1006 int extrawidth = (width % 4);
1007 vector unsigned char valigner = VEC_ALIGNER(srcp);
1008 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1009 width -= extrawidth;
1011 vector unsigned char vsel;
1012 vector unsigned char voverflow;
1013 vector unsigned char vd;
1014 vector unsigned char vd_orig;
/* Combine two loads through valigner so an unaligned srcp still yields four
   consecutive source pixels. */
1017 voverflow = (vector unsigned char)vec_ld(15, srcp);
1018 vs = vec_perm(vs, voverflow, valigner);
1020 /* vsel is set for items that match the key */
1021 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1022 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1024 /* permute to source format */
1025 vs = vec_perm(vs, valpha, vsrcPermute);
/* Keep an unblended copy of the destination for the colour-keyed pixels. */
1028 vd = (vector unsigned char)vec_ld(0, dstp);
1029 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1031 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1033 /* set the alpha channel to full on */
1034 vd = vec_or(vd, valphamask);
1036 /* mask out color key */
1037 vd = vec_sel(vd, vd_orig, vsel);
1039 /* permute to dest format */
1040 vd = vec_perm(vd, vbits, vdstPermute);
1043 vec_st((vector unsigned int)vd, 0, dstp);
/* Scalar blend of the leftover pixels. */
1050 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1052 #undef ONE_PIXEL_BLEND
/*
 * Blit32to32PixelAlphaAltivec: 32-bit to 32-bit blend that takes the alpha
 * from each source pixel and keeps the destination's own alpha in the
 * result. For every row, pixels are handled by the scalar ONE_PIXEL_BLEND
 * macro until dstp is 16-byte aligned, then four at a time with AltiVec,
 * then the remaining (width % 4) pixels by the scalar macro.
 *
 * info: blit description; the function reads the blit size, both pixel
 *       pointers, both row skips (in 32-bit units after the >> 2) and both
 *       pixel formats.
 *
 * NOTE(review): lines are missing from this listing (braces, pointer
 * advances, parts of ONE_PIXEL_BLEND, the inner vector loop header); the
 * notes cover only the visible statements.
 */
1060 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1062 int width = info->d_width;
1063 int height = info->d_height;
1064 Uint32 *srcp = (Uint32 *)info->s_pixels;
1065 int srcskip = info->s_skip >> 2;
1066 Uint32 *dstp = (Uint32 *)info->d_pixels;
1067 int dstskip = info->d_skip >> 2;
1068 SDL_PixelFormat *srcfmt = info->src;
1069 SDL_PixelFormat *dstfmt = info->dst;
1070 vector unsigned char mergePermute;
1071 vector unsigned char valphaPermute;
1072 vector unsigned char vsrcPermute;
1073 vector unsigned char vdstPermute;
1074 vector unsigned char vsdstPermute;
1075 vector unsigned char valphamask;
1076 vector unsigned char vpixelmask;
1077 vector unsigned char v0;
1078 vector unsigned short v1;
1079 vector unsigned short v8;
1081 v0 = vec_splat_u8(0);
1082 v1 = vec_splat_u16(1);
1083 v8 = vec_splat_u16(8);
1084 mergePermute = VEC_MERGE_PERMUTE();
1085 valphamask = VEC_ALPHA_MASK();
/* Byte indices 0,0,0,0,4,4,4,4,8,...: copies the first byte of every 32-bit
   pixel across that whole pixel. */
1086 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* Complement of the alpha mask: selects the three colour bytes of a pixel. */
1087 vpixelmask = vec_nor(valphamask, v0);
/* Permutes: source layout -> working layout, working layout -> destination
   layout, and destination layout -> working layout. */
1088 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1089 vdstPermute = calc_swizzle32(NULL, dstfmt);
1090 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1092 while ( height-- ) {
1093 width = info->d_width;
/* Scalar blend of one pixel: the destination alpha dA read from the pixel is
   written back unchanged together with the blended colour. */
1094 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1096 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1097 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1099 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1100 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1101 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
/* Scalar pixels while dstp is not 16-byte aligned and pixels remain. */
1107 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* Pixels left over after whole groups of four are excluded from the vector
   work and blended by the scalar macro afterwards. */
1111 int extrawidth = (width % 4);
1112 vector unsigned char valigner = VEC_ALIGNER(srcp);
1113 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1114 width -= extrawidth;
1116 vector unsigned char voverflow;
1117 vector unsigned char vd;
1118 vector unsigned char valpha;
1119 vector unsigned char vdstalpha;
/* Combine two loads through valigner so an unaligned srcp still yields four
   consecutive source pixels, then bring them into the working layout. */
1121 voverflow = (vector unsigned char)vec_ld(15, srcp);
1122 vs = vec_perm(vs, voverflow, valigner);
1123 vs = vec_perm(vs, v0, vsrcPermute);
1125 valpha = vec_perm(vs, v0, valphaPermute);
/* Remember the destination alpha bytes before blending overwrites them. */
1128 vd = (vector unsigned char)vec_ld(0, dstp);
1129 vd = vec_perm(vd, v0, vsdstPermute);
1130 vdstalpha = vec_and(vd, valphamask);
1132 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1134 /* set the alpha to the dest alpha */
1135 vd = vec_and(vd, vpixelmask);
1136 vd = vec_or(vd, vdstalpha);
1137 vd = vec_perm(vd, v0, vdstPermute);
1140 vec_st((vector unsigned int)vd, 0, dstp);
/* Scalar blend of the leftover pixels. */
1148 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1152 #undef ONE_PIXEL_BLEND
1156 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * AltiVec variant with no format swizzling: source and destination vectors
 * are blended as loaded.  The scalar fallback reads alpha from the top byte
 * of the source pixel, and the destination alpha is always kept.
 */
1157 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1159 int width = info->d_width;
1160 int height = info->d_height;
1161 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1162 int srcskip = info->s_skip >> 2;
1163 Uint32 *dstp = (Uint32 *)info->d_pixels;
1164 int dstskip = info->d_skip >> 2;
1165 vector unsigned char mergePermute;
1166 vector unsigned char valphaPermute;
1167 vector unsigned char valphamask;
1168 vector unsigned char vpixelmask;
1169 vector unsigned char v0;
1170 vector unsigned short v1;
1171 vector unsigned short v8;
1172 v0 = vec_splat_u8(0);
1173 v1 = vec_splat_u16(1);
1174 v8 = vec_splat_u16(8);
1175 mergePermute = VEC_MERGE_PERMUTE();
1176 valphamask = VEC_ALPHA_MASK();
/* byte indices 0,0,0,0,4,4,4,4,...: copies byte 0 of every 32-bit pixel
   into all four bytes of that pixel */
1177 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* NOR with zero is a plain NOT: every bit that is not in valphamask */
1180 vpixelmask = vec_nor(valphamask, v0);
1182 width = info->d_width;
/* Scalar one-pixel blend.  Fully opaque source pixels are copied (RGB only);
   otherwise red and blue are blended together under the 0xff00ff mask and
   green separately under 0xff00, and the destination alpha is put back. */
1183 #define ONE_PIXEL_BLEND(condition, widthvar) \
1184 while ((condition)) { \
1190 Uint32 alpha = s >> 24; \
1192 if(alpha == SDL_ALPHA_OPAQUE) { \
1193 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1196 dalpha = d & 0xff000000; \
1197 s1 = s & 0xff00ff; \
1198 d1 = d & 0xff00ff; \
1199 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1202 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1203 *dstp = d1 | d | dalpha; \
1210 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole 16-byte vector are left for the tail */
1212 int extrawidth = (width % 4);
1213 vector unsigned char valigner = VEC_ALIGNER(srcp);
1214 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1215 width -= extrawidth;
1217 vector unsigned char voverflow;
1218 vector unsigned char vd;
1219 vector unsigned char valpha;
1220 vector unsigned char vdstalpha;
/* srcp need not be 16-byte aligned: fetch the following aligned block
   and merge the two loads through valigner */
1222 voverflow = (vector unsigned char)vec_ld(15, srcp);
1223 vs = vec_perm(vs, voverflow, valigner);
/* replicate each pixel's alpha byte across the whole pixel */
1225 valpha = vec_perm(vs, v0, valphaPermute);
1228 vd = (vector unsigned char)vec_ld(0, dstp);
/* remember the destination alpha so it can be restored after the blend */
1229 vdstalpha = vec_and(vd, valphamask);
1231 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1233 /* set the alpha to the dest alpha */
1234 vd = vec_and(vd, vpixelmask);
1235 vd = vec_or(vd, vdstalpha);
1238 vec_st((vector unsigned int)vd, 0, dstp);
1245 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1250 #undef ONE_PIXEL_BLEND
/*
 * AltiVec 32bpp -> 32bpp blit with one alpha value for the whole source
 * surface (info->src->alpha).  Pixels are permuted with calc_swizzle32()
 * vectors around the blend, and the result has its alpha mask bits forced on.
 */
1253 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1256 unsigned alpha = info->src->alpha;
1257 int height = info->d_height;
1258 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1259 int srcskip = info->s_skip >> 2;
1260 Uint32 *dstp = (Uint32 *)info->d_pixels;
1261 int dstskip = info->d_skip >> 2;
1262 SDL_PixelFormat *srcfmt = info->src;
1263 SDL_PixelFormat *dstfmt = info->dst;
/* sA is the same per-surface alpha as `alpha`, named for the scalar macro */
1264 unsigned sA = srcfmt->alpha;
/* scalar path writes opaque alpha only when the destination has an alpha mask */
1265 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1266 vector unsigned char mergePermute;
1267 vector unsigned char vsrcPermute;
1268 vector unsigned char vdstPermute;
1269 vector unsigned char vsdstPermute;
1270 vector unsigned char valpha;
1271 vector unsigned char valphamask;
1272 vector unsigned char vbits;
1273 vector unsigned short v1;
1274 vector unsigned short v8;
1276 mergePermute = VEC_MERGE_PERMUTE();
1277 v1 = vec_splat_u16(1);
1278 v8 = vec_splat_u16(8);
1280 /* set the alpha to 255 on the destination surf */
1281 valphamask = VEC_ALPHA_MASK();
/* permutes: source -> working layout, working layout -> destination,
   destination -> working layout */
1283 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1284 vdstPermute = calc_swizzle32(NULL, dstfmt);
1285 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1287 /* splat the surface alpha into every byte of valpha */
1288 ((unsigned char *)&valpha)[0] = alpha;
1289 valpha = vec_splat(valpha, 0);
/* all-ones vector, used as the second input of the final permute */
1290 vbits = (vector unsigned char)vec_splat_s8(-1);
1293 int width = info->d_width;
/* scalar one-pixel blend, used before and after the vector loop */
1294 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1296 unsigned sR, sG, sB, dR, dG, dB; \
1297 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1298 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1299 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1300 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1305 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole 16-byte vector are left for the tail */
1307 int extrawidth = (width % 4);
1308 vector unsigned char valigner = VEC_ALIGNER(srcp);
1309 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1310 width -= extrawidth;
1312 vector unsigned char voverflow;
1313 vector unsigned char vd;
/* srcp need not be 16-byte aligned: fetch the following aligned block
   and merge the two loads through valigner */
1316 voverflow = (vector unsigned char)vec_ld(15, srcp);
1317 vs = vec_perm(vs, voverflow, valigner);
1318 vs = vec_perm(vs, valpha, vsrcPermute);
1321 vd = (vector unsigned char)vec_ld(0, dstp);
1322 vd = vec_perm(vd, vd, vsdstPermute);
1324 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1326 /* set the alpha channel to full on */
1327 vd = vec_or(vd, valphamask);
1328 vd = vec_perm(vd, vbits, vdstPermute);
1331 vec_st((vector unsigned int)vd, 0, dstp);
1338 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1340 #undef ONE_PIXEL_BLEND
1349 /* fast RGB888->(A)RGB888 blending */
/*
 * AltiVec variant with one alpha value for the whole source surface and no
 * format swizzling.  Both the scalar and the vector path force the top
 * (alpha) byte of every written pixel to 0xff.
 */
1350 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1352 unsigned alpha = info->src->alpha;
1353 int height = info->d_height;
1354 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1355 int srcskip = info->s_skip >> 2;
1356 Uint32 *dstp = (Uint32 *)info->d_pixels;
1357 int dstskip = info->d_skip >> 2;
1358 vector unsigned char mergePermute;
1359 vector unsigned char valpha;
1360 vector unsigned char valphamask;
1361 vector unsigned short v1;
1362 vector unsigned short v8;
1364 mergePermute = VEC_MERGE_PERMUTE();
1365 v1 = vec_splat_u16(1);
1366 v8 = vec_splat_u16(8);
1368 /* set the alpha to 255 on the destination surf */
1369 valphamask = VEC_ALPHA_MASK();
1371 /* splat the surface alpha into every byte of valpha */
1372 ((unsigned char *)&valpha)[0] = alpha;
1373 valpha = vec_splat(valpha, 0);
1376 int width = info->d_width;
/* Scalar one-pixel blend: red and blue are blended together under the
   0xff00ff mask, green separately under 0xff00. */
1377 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1380 Uint32 s1 = s & 0xff00ff; \
1381 Uint32 d1 = d & 0xff00ff; \
1382 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1386 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1387 *dstp = d1 | d | 0xff000000; \
1392 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
/* pixels that do not fill a whole 16-byte vector are left for the tail */
1394 int extrawidth = (width % 4);
1395 vector unsigned char valigner = VEC_ALIGNER(srcp);
1396 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1397 width -= extrawidth;
1399 vector unsigned char voverflow;
1400 vector unsigned char vd;
/* srcp need not be 16-byte aligned: fetch the following aligned block
   and merge the two loads through valigner */
1403 voverflow = (vector unsigned char)vec_ld(15, srcp);
1404 vs = vec_perm(vs, voverflow, valigner);
1407 vd = (vector unsigned char)vec_ld(0, dstp);
1409 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1411 /* set the alpha channel to full on */
1412 vd = vec_or(vd, valphamask);
1415 vec_st((vector unsigned int)vd, 0, dstp);
1422 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1424 #undef ONE_PIXEL_BLEND
1431 #pragma altivec_model off
1433 #endif /* SDL_ALTIVEC_BLITTERS */
1435 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * Averages source and destination without a multiply.  Each written pixel
 * gets 0xff in its top byte.
 */
1436 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1438 int width = info->d_width;
1439 int height = info->d_height;
1440 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1441 int srcskip = info->s_skip >> 2;
1442 Uint32 *dstp = (Uint32 *)info->d_pixels;
1443 int dstskip = info->d_skip >> 2;
/* 0x00fefefe drops the low bit of each colour byte so the three sums can be
   halved in one shift without bleeding into the neighbouring byte;
   (s & d & 0x00010101) adds back the 1 lost when both low bits were set */
1449 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1450 + (s & d & 0x00010101)) | 0xff000000;
1457 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Scalar blend with one alpha value for the whole source surface
 * (info->src->alpha).  Every written pixel gets 0xff in its top byte.
 */
1458 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1460 unsigned alpha = info->src->alpha;
/* hand off to the dedicated alpha=128 routine */
1462 BlitRGBtoRGBSurfaceAlpha128(info);
1464 int width = info->d_width;
1465 int height = info->d_height;
1466 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1467 int srcskip = info->s_skip >> 2;
1468 Uint32 *dstp = (Uint32 *)info->d_pixels;
1469 int dstskip = info->d_skip >> 2;
1476 DUFFS_LOOP_DOUBLE2({
1477 /* One Pixel Blend */
1482 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1486 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1487 *dstp = d1 | d | 0xff000000;
1491 /* Two Pixels Blend */
1496 d1 += (s1 - d1) * alpha >> 8;
/* pack the green bytes of two adjacent pixels into one word (bits 0-7 and
   bits 16-23) so both are blended with a single multiply */
1499 s = ((s & 0xff00) >> 8) |
1500 ((srcp[1] & 0xff00) << 8);
1501 d = ((d & 0xff00) >> 8) |
1502 ((dstp[1] & 0xff00) << 8);
1503 d += (s - d) * alpha >> 8;
/* first pixel: its green is in bits 0-7 of d */
1506 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1513 d1 += (s1 - d1) * alpha >> 8;
/* second pixel: its green is in bits 16-23 of d */
1516 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1526 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Scalar blend using the alpha stored in the top byte of each source pixel.
 * The destination pixel keeps its own top (alpha) byte in both branches.
 */
1527 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1529 int width = info->d_width;
1530 int height = info->d_height;
1531 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1532 int srcskip = info->s_skip >> 2;
1533 Uint32 *dstp = (Uint32 *)info->d_pixels;
1534 int dstskip = info->d_skip >> 2;
/* source alpha is the top byte of the pixel */
1543 Uint32 alpha = s >> 24;
1544 /* FIXME: Here we special-case opaque alpha since the
1545 compositing used (>>8 instead of /255) doesn't handle
1546 it correctly. Also special-case alpha=0 for speed?
1549 if(alpha == SDL_ALPHA_OPAQUE) {
1550 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1553 * take out the middle component (green), and process
1554 * the other two in parallel. One multiply less.
1557 dalpha = d & 0xff000000;
1560 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1563 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1564 *dstp = d1 | d | dalpha;
1576 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC inline-assembly version.  The alpha mask and shift are read from the
 * source format (sf->Amask, sf->Ashift).  The setup code below leaves:
 *   mm6 = zero, mm7 = multiply mask that zeroes the alpha lane,
 *   mm4 = colour-channel mask, mm3 = its complement, mm5 = alpha shift.
 */
1577 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1579 int width = info->d_width;
1580 int height = info->d_height;
1581 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1582 int srcskip = info->s_skip >> 2;
1583 Uint32 *dstp = (Uint32 *)info->d_pixels;
1584 int dstskip = info->d_skip >> 2;
1585 SDL_PixelFormat* sf = info->src;
1586 Uint32 amask = sf->Amask;
1589 /* make mm6 all zeros. */
1590 "pxor %%mm6, %%mm6\n"
1592 /* Make a mask to preserve the alpha. */
1593 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1594 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1595 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1596 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1597 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1599 /* form channel masks */
1600 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1601 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1602 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1603 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1605 /* get alpha channel shift */
1606 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1608 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1618 : : "r" (srcp), "r" (dstp) );
/* alpha is compared while still in its mask position; it is not shifted down */
1620 alpha = *srcp & amask;
1621 /* FIXME: Here we special-case opaque alpha since the
1622 compositing used (>>8 instead of /255) doesn't handle
1623 it correctly. Also special-case alpha=0 for speed?
1628 else if(alpha == amask) {
1629 /* opaque alpha -- copy RGB, keep dst alpha */
1630 /* using MMX here to free up regular registers for other things */
1632 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1633 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1634 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1635 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1636 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1637 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1639 : : "r" (srcp), "r" (dstp) );
1644 /* load in the source, and dst. */
1645 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1646 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1648 /* Move the src alpha into mm2 */
1650 /* if supporting pshufw */
1651 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1652 /*"psrlw $8, %%mm2\n" */
1656 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1657 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1658 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1659 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1661 /* move the colors into words. */
1662 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1663 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1666 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1669 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1670 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1671 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1673 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1675 "movd %%mm0, (%1)\n" /* result in mm0 */
1677 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1691 /* End GCC_ASMBLIT*/
1694 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MMX-intrinsics version.  The channel masks and the alpha shift are read
 * from the source format; the destination alpha is kept in every branch
 * that is visible here.
 */
1695 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1697 int width = info->d_width;
1698 int height = info->d_height;
1699 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* skips are scaled by 1/4 to match the Uint32 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1700 int srcskip = info->s_skip >> 2;
1701 Uint32 *dstp = (Uint32 *)info->d_pixels;
1702 int dstskip = info->d_skip >> 2;
1703 SDL_PixelFormat* sf = info->src;
1704 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1705 Uint32 amask = sf->Amask;
1706 Uint32 ashift = sf->Ashift;
1709 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1711 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* once pixel bytes are unpacked to 16-bit words, the alpha byte at bit
   `ashift` sits in the word at bit `ashift * 2`; clear exactly that word */
1712 multmask = ~(0xFFFFi64 << (ashift * 2));
1713 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1719 _m_prefetch(srcp + 16);
1720 _m_prefetch(dstp + 16);
/* alpha is compared while still in its mask position; it is shifted down
   only when it is loaded into mm_alpha */
1722 alpha = *srcp & amask;
1725 } else if (alpha == amask) {
1726 /* copy RGB, keep dst alpha */
1727 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1729 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1730 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1732 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1733 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1735 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1736 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1737 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1738 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1739 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1742 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1743 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1744 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1745 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1746 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1748 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1758 /* End MSVC_ASMBLIT */
1760 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1762 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
/*
 * Blend a single 16 bit pixel at 50%.
 * `mask` selects every bit except the lowest bit of each colour channel
 * (the callers in this file pass 0xf7de for RGB565 and 0xfbde for RGB555).
 * The masked parts are summed and halved; the low bits that the mask dropped
 * contribute 1 only when set in both pixels.
 * Arguments are parenthesized so that expressions can be passed safely;
 * each argument is evaluated more than once.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 * The 16 bit mask is replicated into both halves of the word.  The
 * replication is done in Uint32: a Uint16 mask promotes to int, and shifting
 * a value such as 0xf7de left by 16 in a signed int is undefined behaviour.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)	     \
	 + (((d) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)	     \
	 + ((s) & (d) & ~((Uint32)(mask) | (Uint32)(mask) << 16)))
/*
 * 16bpp -> 16bpp blend at 50%, two pixels per 32-bit word where possible.
 * `mask` is handed to BLEND16_50 / BLEND2x16_50; the callers in this file
 * pass 0xf7de (RGB565) or 0xfbde (RGB555).
 * There are two code paths, chosen by whether source and destination share
 * the same alignment within a 32-bit word.
 */
1773 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1775 int width = info->d_width;
1776 int height = info->d_height;
1777 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are halved to match the Uint16 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1778 int srcskip = info->s_skip >> 1;
1779 Uint16 *dstp = (Uint16 *)info->d_pixels;
1780 int dstskip = info->d_skip >> 1;
/* bit 1 of the two addresses differs: exactly one of the pointers is
   32-bit aligned */
1783 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1785 * Source and destination not aligned, pipeline it.
1786 * This is mostly a win for big blits but no loss for
1792 /* handle odd destination */
1793 if((uintptr_t)dstp & 2) {
1794 Uint16 d = *dstp, s = *srcp;
1795 *dstp = BLEND16_50(d, s, mask);
1800 srcp++; /* srcp is now 32-bit aligned */
1802 /* bootstrap pipeline with first halfword */
1803 prev_sw = ((Uint32 *)srcp)[-1];
1807 sw = *(Uint32 *)srcp;
1808 dw = *(Uint32 *)dstp;
/* build the source pair from the halfword left over in prev_sw and the
   first halfword of sw; which half is which depends on byte order */
1809 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1810 s = (prev_sw << 16) + (sw >> 16);
1812 s = (prev_sw >> 16) + (sw << 16);
1815 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1821 /* final pixel if any */
1823 Uint16 d = *dstp, s;
1824 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1825 s = (Uint16)prev_sw;
1827 s = (Uint16)(prev_sw >> 16);
1829 *dstp = BLEND16_50(d, s, mask);
/* NOTE(review): the -1 looks like it undoes the extra srcp++ used above to
   align the source -- confirm */
1833 srcp += srcskip - 1;
1836 /* source and destination are aligned */
1839 /* first odd pixel? */
1840 if((uintptr_t)srcp & 2) {
1841 Uint16 d = *dstp, s = *srcp;
1842 *dstp = BLEND16_50(d, s, mask);
1847 /* srcp and dstp are now 32-bit aligned */
1850 Uint32 sw = *(Uint32 *)srcp;
1851 Uint32 dw = *(Uint32 *)dstp;
1852 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1858 /* last odd pixel? */
1860 Uint16 d = *dstp, s = *srcp;
1861 *dstp = BLEND16_50(d, s, mask);
1872 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * MMX macro version (movq_m2r, psrlw_i2r, ...).  Single pixels are blended
 * in scalar code with a 5-bit alpha; groups of four pixels are blended in
 * MMX registers, one colour channel at a time, and merged in mm1.
 * Register use in the MMX part: mm0 = alpha, mm4 = green mask,
 * mm7 = blue mask, mm2/mm3 = source/destination pixels.
 */
1873 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1875 unsigned alpha = info->src->alpha; /* per-surface alpha; downscaled to 5 bits below */
/* hand off to the 50% routine with the RGB565 mask */
1877 Blit16to16SurfaceAlpha128(info, 0xf7de);
1879 int width = info->d_width;
1880 int height = info->d_height;
1881 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are halved to match the Uint16 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
1882 int srcskip = info->s_skip >> 1;
1883 Uint16 *dstp = (Uint16 *)info->d_pixels;
1884 int dstskip = info->d_skip >> 1;
1888 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1890 alpha >>= 3; /* downscale alpha to 5 bits */
1892 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1893 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1894 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1895 /* position alpha to allow for mullo and mulhi on diff channels
1896 to reduce the number of operations */
1899 /* Setup the 565 color channel masks */
1900 load = 0x07E007E007E007E0ULL;
1901 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1902 load = 0x001F001F001F001FULL;
1903 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1910 * shift out the middle component (green) to
1911 * the high 16 bits, and process all three RGB
1912 * components at the same time.
1914 s = (s | s << 16) & 0x07e0f81f;
1915 d = (d | d << 16) & 0x07e0f81f;
1916 d += (s - d) * alpha >> 5;
1918 *dstp++ = d | d >> 16;
1923 * shift out the middle component (green) to
1924 * the high 16 bits, and process all three RGB
1925 * components at the same time.
1927 s = (s | s << 16) & 0x07e0f81f;
1928 d = (d | d << 16) & 0x07e0f81f;
1929 d += (s - d) * alpha >> 5;
1931 *dstp++ = d | d >> 16;
1935 * shift out the middle component (green) to
1936 * the high 16 bits, and process all three RGB
1937 * components at the same time.
1939 s = (s | s << 16) & 0x07e0f81f;
1940 d = (d | d << 16) & 0x07e0f81f;
1941 d += (s - d) * alpha >> 5;
1943 *dstp++ = d | d >> 16;
/* MMX path: four 16-bit pixels per 64-bit register */
1945 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1946 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1948 /* red -- does not need a mask since the right shift clears
1949 the uninteresting bits */
1950 movq_r2r(mm2, mm5); /* src -> mm5 */
1951 movq_r2r(mm3, mm6); /* dst -> mm6 */
1952 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1953 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1956 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1957 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1958 /* alpha used is actually 11 bits
1959 11 + 5 = 16 bits, so the sign bits are lost */
1960 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1961 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1962 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1964 movq_r2r(mm6, mm1); /* save new reds in dsts */
1966 /* green -- process the bits in place */
1967 movq_r2r(mm2, mm5); /* src -> mm5 */
1968 movq_r2r(mm3, mm6); /* dst -> mm6 */
1969 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1970 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1973 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1974 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1975 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
1976 bits are gone and the sign bits present */
1977 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1978 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1980 por_r2r(mm6, mm1); /* save new greens in dsts */
1983 movq_r2r(mm2, mm5); /* src -> mm5 */
1984 movq_r2r(mm3, mm6); /* dst -> mm6 */
1985 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1986 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1989 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1990 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1991 /* 11 + 5 = 16 bits, so the sign bits are lost and
1992 the interesting bits will need to be MASKed */
1993 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1994 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1995 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1997 por_r2r(mm6, mm1); /* save new blues in dsts */
1999 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2011 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * MMX macro version (movq_m2r, psllq_i2r, ...).  Single pixels are blended
 * in scalar code with a 5-bit alpha; groups of four pixels are blended in
 * MMX registers, one colour channel at a time, and merged in mm1.
 * Register use in the MMX part: mm0 = alpha, mm4 = green mask (temporarily
 * shifted to act as the red mask), mm7 = blue mask,
 * mm2/mm3 = source/destination pixels.
 */
2012 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2014 unsigned alpha = info->src->alpha; /* per-surface alpha; downscaled to 5 bits below */
/* hand off to the 50% routine with the RGB555 mask */
2016 Blit16to16SurfaceAlpha128(info, 0xfbde);
2018 int width = info->d_width;
2019 int height = info->d_height;
2020 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are halved to match the Uint16 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
2021 int srcskip = info->s_skip >> 1;
2022 Uint16 *dstp = (Uint16 *)info->d_pixels;
2023 int dstskip = info->d_skip >> 1;
2027 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2029 alpha >>= 3; /* downscale alpha to 5 bits */
2031 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2032 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2033 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2034 /* position alpha to allow for mullo and mulhi on diff channels
2035 to reduce the number of operations */
2038 /* Setup the 555 color channel masks */
2039 load = 0x03E003E003E003E0ULL;
2040 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2041 load = 0x001F001F001F001FULL;
2042 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2049 * shift out the middle component (green) to
2050 * the high 16 bits, and process all three RGB
2051 * components at the same time.
2053 s = (s | s << 16) & 0x03e07c1f;
2054 d = (d | d << 16) & 0x03e07c1f;
2055 d += (s - d) * alpha >> 5;
2057 *dstp++ = d | d >> 16;
2062 * shift out the middle component (green) to
2063 * the high 16 bits, and process all three RGB
2064 * components at the same time.
2066 s = (s | s << 16) & 0x03e07c1f;
2067 d = (d | d << 16) & 0x03e07c1f;
2068 d += (s - d) * alpha >> 5;
2070 *dstp++ = d | d >> 16;
2074 * shift out the middle component (green) to
2075 * the high 16 bits, and process all three RGB
2076 * components at the same time.
2078 s = (s | s << 16) & 0x03e07c1f;
2079 d = (d | d << 16) & 0x03e07c1f;
2080 d += (s - d) * alpha >> 5;
2082 *dstp++ = d | d >> 16;
/* MMX path: four 16-bit pixels per 64-bit register */
2084 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2085 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2087 /* red -- process the bits in place */
2088 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2089 /* by reusing the GREEN mask we free up another mmx
2090 register to accumulate the result */
2092 movq_r2r(mm2, mm5); /* src -> mm5 */
2093 movq_r2r(mm3, mm6); /* dst -> mm6 */
2094 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2095 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2098 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2099 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2100 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2101 cleared by a MASK below */
2102 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2103 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2104 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2106 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2108 movq_r2r(mm6, mm1); /* save new reds in dsts */
2110 /* green -- process the bits in place */
2111 movq_r2r(mm2, mm5); /* src -> mm5 */
2112 movq_r2r(mm3, mm6); /* dst -> mm6 */
2113 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2114 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2117 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2118 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2119 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2120 bits are gone and the sign bits present */
2121 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2122 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2124 por_r2r(mm6, mm1); /* save new greens in dsts */
2127 movq_r2r(mm2, mm5); /* src -> mm5 */
2128 movq_r2r(mm3, mm6); /* dst -> mm6 */
2129 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2130 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2133 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2134 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2135 /* 11 + 5 = 16 bits, so the sign bits are lost and
2136 the interesting bits will need to be MASKed */
2137 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2138 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2139 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2141 por_r2r(mm6, mm1); /* save new blues in dsts */
2143 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2154 /* End GCC_ASMBLIT */
2157 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * MMX-intrinsics version.  Single pixels are blended in scalar code with a
 * 5-bit alpha; groups of four pixels are blended with __m64 intrinsics, one
 * colour channel at a time, and merged in mm_res.
 */
2158 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2160 unsigned alpha = info->src->alpha;
/* hand off to the 50% routine with the RGB565 mask */
2162 Blit16to16SurfaceAlpha128(info, 0xf7de);
2164 int width = info->d_width;
2165 int height = info->d_height;
2166 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* skips are halved to match the Uint16 pointers
   (assumes s_skip/d_skip are byte counts -- TODO confirm) */
2167 int srcskip = info->s_skip >> 1;
2168 Uint16 *dstp = (Uint16 *)info->d_pixels;
2169 int dstskip = info->d_skip >> 1;
2172 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2174 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
/* mm_alpha is loaded before the scalar alpha is reduced, so it still holds
   the unshifted value (low three bits cleared) */
2175 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2176 alpha >>= 3; /* downscale alpha to 5 bits */
2178 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2179 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2180 /* position alpha to allow for mullo and mulhi on diff channels
2181 to reduce the number of operations */
2182 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2184 /* Setup the 565 color channel masks */
2185 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2186 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2194 * shift out the middle component (green) to
2195 * the high 16 bits, and process all three RGB
2196 * components at the same time.
2198 s = (s | s << 16) & 0x07e0f81f;
2199 d = (d | d << 16) & 0x07e0f81f;
2200 d += (s - d) * alpha >> 5;
2202 *dstp++ = (Uint16)(d | d >> 16);
2207 * shift out the middle component (green) to
2208 * the high 16 bits, and process all three RGB
2209 * components at the same time.
2211 s = (s | s << 16) & 0x07e0f81f;
2212 d = (d | d << 16) & 0x07e0f81f;
2213 d += (s - d) * alpha >> 5;
2215 *dstp++ = (Uint16)(d | d >> 16);
2219 * shift out the middle component (green) to
2220 * the high 16 bits, and process all three RGB
2221 * components at the same time.
2223 s = (s | s << 16) & 0x07e0f81f;
2224 d = (d | d << 16) & 0x07e0f81f;
2225 d += (s - d) * alpha >> 5;
2227 *dstp++ = (Uint16)(d | d >> 16);
/* MMX path: four 16-bit pixels per __m64 */
2229 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2230 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
/* red: the right shift by 11 isolates the top five bits, no mask needed */
2234 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2237 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2240 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2241 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2242 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2243 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2244 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2246 mm_res = dst2; /* RED -> mm_res */
2248 /* green -- process the bits in place */
2250 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2253 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2256 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2257 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2258 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2259 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2261 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* blue: already in the low five bits, masked before and after the blend */
2265 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2268 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2271 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2272 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2273 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2274 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2275 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2277 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2279 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2291 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * Blends 16-bit source pixels onto 16-bit destination pixels using the
 * per-surface alpha read from info->src->alpha.
 *
 * The visible body contains three kinds of work:
 *   - a call to Blit16to16SurfaceAlpha128() with the mask 0xfbde,
 *   - scalar single-pixel blends that use a 5-bit alpha (alpha >> 3),
 *   - an MMX section that loads four pixels at once into an __m64,
 *     blends red, green and blue separately through the 0x7C00 / 0x03E0 /
 *     0x001F masks, and stores the four results back.
 *
 * NOTE(review): several lines of this function (braces, branch conditions,
 * loop headers, some assignments) are not visible in this view, so the
 * control flow that ties these pieces together must be confirmed against
 * the full file.
 */
2292 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2294 unsigned alpha = info->src->alpha;
/* NOTE(review): the condition guarding this call is not visible here;
   presumably it is the special case for alpha == 128 -- confirm. */
2296 Blit16to16SurfaceAlpha128(info, 0xfbde);
2298 int width = info->d_width;
2299 int height = info->d_height;
2300 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* The skip values are halved and the pixel pointers are Uint16 *;
   presumably this converts a byte count into 16-bit pixel units -- confirm. */
2301 int srcskip = info->s_skip >> 1;
2302 Uint16 *dstp = (Uint16 *)info->d_pixels;
2303 int dstskip = info->d_skip >> 1;
2306 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
/* Clear the three low bits of alpha before it is loaded into mm_alpha, so
   the MMX path sees the same bits that survive the ">>= 3" used by the
   scalar blends below. */
2308 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2309 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2310 alpha >>= 3; /* downscale alpha to 5 bits */
/* Replicate alpha into all four 16-bit lanes of mm_alpha. */
2312 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2313 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2314 /* position alpha to allow for mullo and mulhi on diff channels
2315 to reduce the number of operations */
/* The whole 64-bit register is shifted left by 3. Assuming alpha fits in
   8 bits, no bits cross a 16-bit lane boundary, so every lane then holds
   alpha << 3. */
2316 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2318 /* Setup the 555 color channel masks */
2319 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2320 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2321 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
/* NOTE(review): the "* shift out ..." lines below read as the interior of a
   block comment whose delimiters are not visible in this view.

   Scalar single-pixel blend. The pixel is duplicated into the upper half of
   a 32-bit word and masked with 0x03e07c1f, leaving blue in bits 0-4, red in
   bits 10-14 and green in bits 21-25. The gaps between the fields give the
   multiply by the 5-bit alpha room to grow, so one multiply and one shift
   blend all three channels. The same sequence appears three times in this
   function; the surrounding loop structure is not visible here. */
2329 * shift out the middle component (green) to
2330 * the high 16 bits, and process all three RGB
2331 * components at the same time.
2333 s = (s | s << 16) & 0x03e07c1f;
2334 d = (d | d << 16) & 0x03e07c1f;
2335 d += (s - d) * alpha >> 5;
/* Fold the high half (green) back over the low half to rebuild the
   16-bit pixel. */
2337 *dstp++ = (Uint16)(d | d >> 16);
2342 * shift out the middle component (green) to
2343 * the high 16 bits, and process all three RGB
2344 * components at the same time.
2346 s = (s | s << 16) & 0x03e07c1f;
2347 d = (d | d << 16) & 0x03e07c1f;
2348 d += (s - d) * alpha >> 5;
2350 *dstp++ = (Uint16)(d | d >> 16);
2354 * shift out the middle component (green) to
2355 * the high 16 bits, and process all three RGB
2356 * components at the same time.
2358 s = (s | s << 16) & 0x03e07c1f;
2359 d = (d | d << 16) & 0x03e07c1f;
2360 d += (s - d) * alpha >> 5;
2362 *dstp++ = (Uint16)(d | d >> 16);
/* MMX section: four 16-bit pixels are handled per 64-bit load/store. */
2364 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2365 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2367 /* red -- process the bits in place */
2369 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2372 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
/* Red stays at bits 10-14: the high half of the 16x16 product is taken
   (mulhi) and then shifted left by 5 to put the result back in place. */
2375 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2376 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2377 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2378 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2379 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2381 mm_res = dst2; /* RED -> mm_res */
2383 /* green -- process the bits in place */
2385 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2388 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2391 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2392 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2393 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2394 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2396 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* Blue sits in the low 5 bits, so the low half of the product is taken
   (mullo) and shifted right by 11 instead. */
2399 src2 = src1; /* src -> src2 */
2400 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2402 dst2 = dst1; /* dst -> dst2 */
2403 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2406 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2407 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2408 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2409 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2410 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2412 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2414 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2425 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2427 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * Scalar blend of 16-bit pixels using the per-surface alpha taken from
 * info->src->alpha. The visible body contains a call to
 * Blit16to16SurfaceAlpha128() with the mask 0xf7de and a per-pixel blend
 * that uses alpha reduced to 5 bits.
 *
 * NOTE(review): braces, the branch condition and the loop header are not
 * visible in this view; confirm the control flow against the full file.
 */
2428 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2430 unsigned alpha = info->src->alpha;
/* NOTE(review): the condition guarding this call is not visible here;
   presumably it is the special case for alpha == 128 -- confirm. */
2432 Blit16to16SurfaceAlpha128(info, 0xf7de);
2434 int width = info->d_width;
2435 int height = info->d_height;
2436 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* The skip values are halved and the pixel pointers are Uint16 *;
   presumably this converts a byte count into 16-bit pixel units -- confirm. */
2437 int srcskip = info->s_skip >> 1;
2438 Uint16 *dstp = (Uint16 *)info->d_pixels;
2439 int dstskip = info->d_skip >> 1;
2440 alpha >>= 3; /* downscale alpha to 5 bits */
/* NOTE(review): the "* shift out ..." lines below read as the interior of a
   block comment whose delimiters are not visible in this view.

   The pixel is duplicated into the upper half of a 32-bit word and masked
   with 0x07e0f81f, leaving blue in bits 0-4, red in bits 11-15 and green in
   bits 21-26. The gaps between the fields absorb the growth from multiplying
   by the 5-bit alpha, so one multiply and one ">> 5" blend all channels. */
2447 * shift out the middle component (green) to
2448 * the high 16 bits, and process all three RGB
2449 * components at the same time.
2451 s = (s | s << 16) & 0x07e0f81f;
2452 d = (d | d << 16) & 0x07e0f81f;
2453 d += (s - d) * alpha >> 5;
/* Fold the high half (green) back over the low half to rebuild the
   16-bit pixel. */
2455 *dstp++ = (Uint16)(d | d >> 16);
2463 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * Scalar blend of 16-bit pixels using the per-surface alpha taken from
 * info->src->alpha. The visible body contains a call to
 * Blit16to16SurfaceAlpha128() with the mask 0xfbde and a per-pixel blend
 * that uses alpha reduced to 5 bits.
 *
 * NOTE(review): braces, the branch condition and the loop header are not
 * visible in this view; confirm the control flow against the full file.
 */
2464 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2466 unsigned alpha = info->src->alpha; /* per-surface alpha as stored; the 5-bit downscale happens further below */
/* NOTE(review): the condition guarding this call is not visible here;
   presumably it is the special case for alpha == 128 -- confirm. */
2468 Blit16to16SurfaceAlpha128(info, 0xfbde);
2470 int width = info->d_width;
2471 int height = info->d_height;
2472 Uint16 *srcp = (Uint16 *)info->s_pixels;
/* The skip values are halved and the pixel pointers are Uint16 *;
   presumably this converts a byte count into 16-bit pixel units -- confirm. */
2473 int srcskip = info->s_skip >> 1;
2474 Uint16 *dstp = (Uint16 *)info->d_pixels;
2475 int dstskip = info->d_skip >> 1;
2476 alpha >>= 3; /* downscale alpha to 5 bits */
/* NOTE(review): the "* shift out ..." lines below read as the interior of a
   block comment whose delimiters are not visible in this view.

   The pixel is duplicated into the upper half of a 32-bit word and masked
   with 0x03e07c1f, leaving blue in bits 0-4, red in bits 10-14 and green in
   bits 21-25. The gaps between the fields absorb the growth from multiplying
   by the 5-bit alpha, so one multiply and one ">> 5" blend all channels. */
2483 * shift out the middle component (green) to
2484 * the high 16 bits, and process all three RGB
2485 * components at the same time.
2487 s = (s | s << 16) & 0x03e07c1f;
2488 d = (d | d << 16) & 0x03e07c1f;
2489 d += (s - d) * alpha >> 5;
/* Fold the high half (green) back over the low half to rebuild the
   16-bit pixel. */
2491 *dstp++ = (Uint16)(d | d >> 16);
2499 /* fast ARGB8888->RGB565 blending with pixel alpha */
/*
 * Blends 32-bit source pixels onto 16-bit destination pixels, taking the
 * alpha from the top bits of each source pixel (s >> 27, i.e. 5 bits).
 *
 * Two paths are visible:
 *   - when alpha equals SDL_ALPHA_OPAQUE >> 3, the source pixel is
 *     converted straight to 16 bits and stored, with no blend;
 *   - otherwise source and destination are spread into a 32-bit layout
 *     matching the mask 0x07e0f81f and blended with a single multiply
 *     followed by ">> 5".
 *
 * NOTE(review): braces, the loop header, the else branch and the
 * declarations/loads of s and d are not visible in this view; confirm the
 * control flow against the full file.
 */
2500 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2502 int width = info->d_width;
2503 int height = info->d_height;
2504 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* The source skip is shifted by 2 and the destination skip by 1, matching
   the Uint32 / Uint16 pointer types; presumably byte counts converted to
   pixel units -- confirm. */
2505 int srcskip = info->s_skip >> 2;
2506 Uint16 *dstp = (Uint16 *)info->d_pixels;
2507 int dstskip = info->d_skip >> 1;
2512 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2513 /* FIXME: Here we special-case opaque alpha since the
2514 compositioning used (>>8 instead of /255) doesn't handle
2515 it correctly. Also special-case alpha=0 for speed?
2518 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2519 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2523 * convert source and destination to G0RAB65565
2524 * and blend all components at the same time
2526 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2528 d = (d | d << 16) & 0x07e0f81f;
2529 d += (s - d) * alpha >> 5;
2531 *dstp = (Uint16)(d | d >> 16);
2542 /* fast ARGB8888->RGB555 blending with pixel alpha */
/*
 * Blends 32-bit source pixels onto 16-bit destination pixels, taking the
 * alpha from the top bits of each source pixel (s >> 27, i.e. 5 bits).
 *
 * Two paths are visible:
 *   - when alpha equals SDL_ALPHA_OPAQUE >> 3, the source pixel is
 *     converted straight to 16 bits and stored, with no blend;
 *   - otherwise source and destination are spread into a 32-bit layout
 *     matching the mask 0x03e07c1f and blended with a single multiply
 *     followed by ">> 5".
 *
 * NOTE(review): braces, the loop header, the else branch and the
 * declarations/loads of s, d and alpha are not visible in this view;
 * confirm the control flow against the full file.
 */
2543 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2545 int width = info->d_width;
2546 int height = info->d_height;
2547 Uint32 *srcp = (Uint32 *)info->s_pixels;
/* The source skip is shifted by 2 and the destination skip by 1, matching
   the Uint32 / Uint16 pointer types; presumably byte counts converted to
   pixel units -- confirm. */
2548 int srcskip = info->s_skip >> 2;
2549 Uint16 *dstp = (Uint16 *)info->d_pixels;
2550 int dstskip = info->d_skip >> 1;
2556 alpha = s >> 27; /* downscale alpha to 5 bits */
2557 /* FIXME: Here we special-case opaque alpha since the
2558 compositioning used (>>8 instead of /255) doesn't handle
2559 it correctly. Also special-case alpha=0 for speed?
2562 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2563 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2567 * convert source and destination to G0RAB65565
2568 * and blend all components at the same time
2570 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2572 d = (d | d << 16) & 0x03e07c1f;
2573 d += (s - d) * alpha >> 5;
2575 *dstp = (Uint16)(d | d >> 16);
2586 /* General (slow) N->N blending with per-surface alpha */
/*
 * Format-independent blend: each pixel of source and destination is split
 * into R, G, B with DISEMBLE_RGB, blended with ALPHA_BLEND using the source
 * format's per-surface alpha (sA), and written back with ASSEMBLE_RGBA.
 *
 * NOTE(review): braces, the inner per-pixel loop and the declarations of
 * Pixel, sR/sG/sB and dR/dG/dB are not visible in this view.
 */
2587 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2589 int width = info->d_width;
2590 int height = info->d_height;
2591 Uint8 *src = info->s_pixels;
2592 int srcskip = info->s_skip;
2593 Uint8 *dst = info->d_pixels;
2594 int dstskip = info->d_skip;
2595 SDL_PixelFormat *srcfmt = info->src;
2596 SDL_PixelFormat *dstfmt = info->dst;
2597 int srcbpp = srcfmt->BytesPerPixel;
2598 int dstbpp = dstfmt->BytesPerPixel;
2599 unsigned sA = srcfmt->alpha;
/* Alpha value stored into the destination pixel: opaque when the
   destination format has an alpha mask, otherwise 0. */
2600 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
/* One iteration per row. */
2603 while ( height-- ) {
2613 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2614 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2615 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2616 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2627 /* General (slow) colorkeyed N->N blending with per-surface alpha */
/*
 * Blend with per-surface alpha that leaves the destination untouched for
 * source pixels equal to the source format's colorkey, and for every pixel
 * when the alpha is 0.
 *
 * Two paths are visible:
 *   - a 16-bit path, taken when both formats are 2 bytes per pixel with a
 *     green mask of 0x7e0, which blends all three channels in one multiply
 *     using a 5-bit alpha;
 *   - a general path built on the RETRIEVE_RGB_PIXEL / RGB_FROM_PIXEL /
 *     DISEMBLE_RGB / ALPHA_BLEND / ASSEMBLE_RGBA macros.
 *
 * NOTE(review): braces, the else joining the two paths, the inner per-pixel
 * loops and several local declarations are not visible in this view.
 */
2628 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2630 int width = info->d_width;
2631 int height = info->d_height;
2632 Uint8 *src = info->s_pixels;
2633 int srcskip = info->s_skip;
2634 Uint8 *dst = info->d_pixels;
2635 int dstskip = info->d_skip;
2636 SDL_PixelFormat *srcfmt = info->src;
2637 SDL_PixelFormat *dstfmt = info->dst;
2638 Uint32 ckey = srcfmt->colorkey;
2639 int srcbpp = srcfmt->BytesPerPixel;
2640 int dstbpp = dstfmt->BytesPerPixel;
2641 unsigned sA = srcfmt->alpha;
/* Alpha value stored into the destination pixel: opaque when the
   destination format has an alpha mask, otherwise 0. */
2642 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
/* 16-bit path: both surfaces are 2 bytes per pixel with green mask 0x7e0. */
2644 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2645 Uint16 *src16 = (Uint16 *)src;
2646 Uint16 *dst16 = (Uint16 *)dst;
2647 sA >>= 3; /* downscale alpha to 5 bits */
2648 while ( height-- ) {
/* Blend only when alpha is non-zero and the pixel is not the colorkey. */
2654 if(sA && s != ckey) {
/* Duplicate each pixel into the upper half of a 32-bit word and mask with
   0x07e0f81f so all three channels can be blended in one multiply. */
2656 s = (s | s << 16) & 0x07e0f81f;
2657 d = (d | d << 16) & 0x07e0f81f;
2658 d += (s - d) * sA >> 5;
2660 *dst16 = (Uint16)(d | d >> 16);
/* The skip values are halved here because the pointers are Uint16 *;
   presumably the skips are byte counts -- confirm. */
2666 src16 += srcskip / 2;
2667 dst16 += dstskip / 2;
/* General path for every other format combination. */
2672 while ( height-- ) {
2682 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2683 if(sA && Pixel != ckey) {
2684 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2685 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2686 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2687 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2698 /* General (slow) N->N blending with pixel alpha */
/*
 * Format-independent blend driven by each source pixel's own alpha: source
 * and destination pixels are split into R, G, B, A with DISEMBLE_RGBA,
 * blended with ALPHA_BLEND using the source alpha (sA), and written back
 * with ASSEMBLE_RGBA using the alpha read from the destination (dA).
 *
 * NOTE(review): braces, the inner per-pixel loop, any test on sA and the
 * declarations of srcbpp, dstbpp, Pixel and the channel variables are not
 * visible in this view.
 */
2699 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2701 int width = info->d_width;
2702 int height = info->d_height;
2703 Uint8 *src = info->s_pixels;
2704 int srcskip = info->s_skip;
2705 Uint8 *dst = info->d_pixels;
2706 int dstskip = info->d_skip;
2707 SDL_PixelFormat *srcfmt = info->src;
2708 SDL_PixelFormat *dstfmt = info->dst;
2713 /* Set up some basic variables */
2714 srcbpp = srcfmt->BytesPerPixel;
2715 dstbpp = dstfmt->BytesPerPixel;
2717 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2718 quite right. for <8bpp source alpha, it gets them very wrong
2720 It is unclear whether there is a good general solution that doesn't
2721 need a branch (or a divide). */
/* One iteration per row. */
2722 while ( height-- ) {
2734 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2736 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2737 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2738 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/*
 * Chooses the alpha-blending blit routine for a surface.
 *
 * surface:    the source surface; its own format (sf) and the format of
 *             surface->map->dst (df) drive the choice.
 * blit_index: not referenced in the lines visible here.
 *
 * Returns one of the blitters in this file as an SDL_loblit.
 *
 * Selection outline, as far as the visible lines show:
 *   - sf->Amask == 0 selects the per-surface alpha routines, with separate
 *     colorkey variants when SDL_SRCCOLORKEY is set;
 *   - otherwise the per-pixel alpha routines are used.
 * In both groups the choice then narrows on df->BytesPerPixel, on the
 * colour masks and shifts, and, where compiled in, on AltiVec availability
 * for destinations that are not SDL_HWSURFACE.
 *
 * NOTE(review): case labels, braces, else lines, #endif lines and the
 * conditions selecting the MMX / 3DNow! variants are not visible in this
 * view; confirm the exact dispatch against the full file.
 */
2750 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2752 SDL_PixelFormat *sf = surface->format;
2753 SDL_PixelFormat *df = surface->map->dst->format;
/* No alpha mask in the source format: per-surface alpha routines. */
2755 if(sf->Amask == 0) {
/* Colorkey combined with per-surface alpha. */
2756 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2757 if(df->BytesPerPixel == 1)
2758 return BlitNto1SurfaceAlphaKey;
2760 #if SDL_ALTIVEC_BLITTERS
2761 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2762 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2763 return Blit32to32SurfaceAlphaKeyAltivec;
2766 return BlitNtoNSurfaceAlphaKey;
2768 /* Per-surface alpha blits */
2769 switch(df->BytesPerPixel) {
2771 return BlitNto1SurfaceAlpha;
/* 16-bit destinations: the specialised 565/555 routines are used only
   when surface->map->identity is set; the green mask tells the two
   layouts apart. */
2774 if(surface->map->identity) {
2775 if(df->Gmask == 0x7e0)
2779 return Blit565to565SurfaceAlphaMMX;
2782 return Blit565to565SurfaceAlpha;
2784 else if(df->Gmask == 0x3e0)
2788 return Blit555to555SurfaceAlphaMMX;
2791 return Blit555to555SurfaceAlpha;
2794 return BlitNtoNSurfaceAlpha;
/* Matching R/G/B masks with a 4-byte source: specialised 32-bit routines. */
2797 if(sf->Rmask == df->Rmask
2798 && sf->Gmask == df->Gmask
2799 && sf->Bmask == df->Bmask
2800 && sf->BytesPerPixel == 4)
/* The MMX routine is chosen only when every colour shift is a multiple
   of 8. */
2803 if(sf->Rshift % 8 == 0
2804 && sf->Gshift % 8 == 0
2805 && sf->Bshift % 8 == 0
2807 return BlitRGBtoRGBSurfaceAlphaMMX;
2809 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2811 #if SDL_ALTIVEC_BLITTERS
2812 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2813 && SDL_HasAltiVec())
2814 return BlitRGBtoRGBSurfaceAlphaAltivec;
2816 return BlitRGBtoRGBSurfaceAlpha;
2819 #if SDL_ALTIVEC_BLITTERS
2820 if((sf->BytesPerPixel == 4) &&
2821 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2822 return Blit32to32SurfaceAlphaAltivec;
2825 return BlitNtoNSurfaceAlpha;
2829 return BlitNtoNSurfaceAlpha;
2833 /* Per-pixel alpha blits */
2834 switch(df->BytesPerPixel) {
2836 return BlitNto1PixelAlpha;
2839 #if SDL_ALTIVEC_BLITTERS
2840 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2841 df->Gmask == 0x7e0 &&
2842 df->Bmask == 0x1f && SDL_HasAltiVec())
2843 return Blit32to565PixelAlphaAltivec;
/* 32-bit source with alpha in the top byte and green in bits 8-15, where
   the channel in the low byte of the source is also the channel in the low
   5 bits of the destination: use the ARGB -> 565/555 routines. */
2846 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2847 && sf->Gmask == 0xff00
2848 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2849 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2850 if(df->Gmask == 0x7e0)
2851 return BlitARGBto565PixelAlpha;
2852 else if(df->Gmask == 0x3e0)
2853 return BlitARGBto555PixelAlpha;
2855 return BlitNtoNPixelAlpha;
/* Matching R/G/B masks with a 4-byte source: specialised 32-bit routines. */
2858 if(sf->Rmask == df->Rmask
2859 && sf->Gmask == df->Gmask
2860 && sf->Bmask == df->Bmask
2861 && sf->BytesPerPixel == 4)
/* The MMX / 3DNow! routines are chosen only when every channel shift,
   alpha included, is a multiple of 8. */
2864 if(sf->Rshift % 8 == 0
2865 && sf->Gshift % 8 == 0
2866 && sf->Bshift % 8 == 0
2867 && sf->Ashift % 8 == 0
2871 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2873 return BlitRGBtoRGBPixelAlphaMMX;
2876 if(sf->Amask == 0xff000000)
2878 #if SDL_ALTIVEC_BLITTERS
2879 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2880 && SDL_HasAltiVec())
2881 return BlitRGBtoRGBPixelAlphaAltivec;
2883 return BlitRGBtoRGBPixelAlpha;
2886 #if SDL_ALTIVEC_BLITTERS
2887 if (sf->Amask && sf->BytesPerPixel == 4 &&
2888 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2889 return Blit32to32PixelAlphaAltivec;
2892 return BlitNtoNPixelAlpha;
2896 return BlitNtoNPixelAlpha;