add some NEON 32bpp blitters
sdl_omap.git: src/video/SDL_blit_A.c
 1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
 29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35# define MMX_ASMBLIT 1
36# define GCC_ASMBLIT 1
37# elif defined(_MSC_VER) && defined(_M_IX86)
38# if (_MSC_VER <= 1200)
39# include <malloc.h>
40# if defined(_mm_free)
41# define HAVE_MMINTRIN_H 1
42# endif
43# else /* Visual Studio > VC6 always has mmintrin.h */
44# define HAVE_MMINTRIN_H 1
45# endif
46# if HAVE_MMINTRIN_H
47# define MMX_ASMBLIT 1
48# define MSVC_ASMBLIT 1
49# endif
50# endif
51#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
 64#ifdef __ARM_NEON__
65
66/* NEON optimized blitter callers */
67#define make_neon_caller(name, neon_name) \
68extern void neon_name(void *dst, const void *src, int count); \
69static void name(SDL_BlitInfo *info) \
70{ \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
75 int srcskip = info->s_skip; \
76 int dstskip = info->d_skip; \
77\
78 while ( height-- ) { \
79 neon_name(dst, src, width); \
80 src += width * 4 + srcskip; \
81 dst += width * 4 + dstskip; \
82 } \
83}
84
85make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
86make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
87
88#endif /* __ARM_NEON__ */
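
/*
 * For reference: a plain-C sketch of the per-row blend the external NEON
 * routines above are assumed to implement, namely ABGR8888 source pixels
 * blended with their own alpha onto an XRGB8888 destination, one row
 * (count pixels) per call.  Illustration only; the real work is the
 * hand-written NEON assembly declared by make_neon_caller(), and the exact
 * channel order here is an assumption based on the function names.
 */
static void scalar_ABGRtoXRGBalpha(void *dst, const void *src, int count)
{
	Uint32 *d = (Uint32 *)dst;
	const Uint32 *s = (const Uint32 *)src;
	while (count--) {
		Uint32 sp = *s++;
		Uint32 alpha = sp >> 24;
		/* move R and B into the destination's ARGB channel order */
		sp = (sp & 0x0000ff00) | ((sp & 0x000000ff) << 16) | ((sp >> 16) & 0x000000ff);
		if (alpha == 255) {
			*d = sp;
		} else if (alpha) {
			/* same two-channels-at-a-time >>8 blend used by the C blitters below */
			Uint32 dp = *d;
			Uint32 s1 = sp & 0x00ff00ff;
			Uint32 d1 = dp & 0x00ff00ff;
			d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ff;
			sp &= 0x0000ff00;
			dp &= 0x0000ff00;
			dp = (dp + ((sp - dp) * alpha >> 8)) & 0x0000ff00;
			*d = d1 | dp;
		}
		++d;
	}
}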
89
 90/* N->1 blending with per-surface alpha */
91static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
92{
93 int width = info->d_width;
94 int height = info->d_height;
95 Uint8 *src = info->s_pixels;
96 int srcskip = info->s_skip;
97 Uint8 *dst = info->d_pixels;
98 int dstskip = info->d_skip;
99 Uint8 *palmap = info->table;
100 SDL_PixelFormat *srcfmt = info->src;
101 SDL_PixelFormat *dstfmt = info->dst;
102 int srcbpp = srcfmt->BytesPerPixel;
103
104 const unsigned A = srcfmt->alpha;
105
106 while ( height-- ) {
107 DUFFS_LOOP4(
108 {
109 Uint32 Pixel;
110 unsigned sR;
111 unsigned sG;
112 unsigned sB;
113 unsigned dR;
114 unsigned dG;
115 unsigned dB;
116 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
117 dR = dstfmt->palette->colors[*dst].r;
118 dG = dstfmt->palette->colors[*dst].g;
119 dB = dstfmt->palette->colors[*dst].b;
120 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
121 dR &= 0xff;
122 dG &= 0xff;
123 dB &= 0xff;
124 /* Pack RGB into 8bit pixel */
125 if ( palmap == NULL ) {
126 *dst =((dR>>5)<<(3+2))|
127 ((dG>>5)<<(2))|
128 ((dB>>6)<<(0));
129 } else {
130 *dst = palmap[((dR>>5)<<(3+2))|
131 ((dG>>5)<<(2)) |
132 ((dB>>6)<<(0))];
133 }
134 dst++;
135 src += srcbpp;
136 },
137 width);
138 src += srcskip;
139 dst += dstskip;
140 }
141}
142
143/* N->1 blending with pixel alpha */
144static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
145{
146 int width = info->d_width;
147 int height = info->d_height;
148 Uint8 *src = info->s_pixels;
149 int srcskip = info->s_skip;
150 Uint8 *dst = info->d_pixels;
151 int dstskip = info->d_skip;
152 Uint8 *palmap = info->table;
153 SDL_PixelFormat *srcfmt = info->src;
154 SDL_PixelFormat *dstfmt = info->dst;
155 int srcbpp = srcfmt->BytesPerPixel;
156
157 /* FIXME: fix alpha bit field expansion here too? */
158 while ( height-- ) {
159 DUFFS_LOOP4(
160 {
161 Uint32 Pixel;
162 unsigned sR;
163 unsigned sG;
164 unsigned sB;
165 unsigned sA;
166 unsigned dR;
167 unsigned dG;
168 unsigned dB;
169 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
170 dR = dstfmt->palette->colors[*dst].r;
171 dG = dstfmt->palette->colors[*dst].g;
172 dB = dstfmt->palette->colors[*dst].b;
173 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
174 dR &= 0xff;
175 dG &= 0xff;
176 dB &= 0xff;
177 /* Pack RGB into 8bit pixel */
178 if ( palmap == NULL ) {
179 *dst =((dR>>5)<<(3+2))|
180 ((dG>>5)<<(2))|
181 ((dB>>6)<<(0));
182 } else {
183 *dst = palmap[((dR>>5)<<(3+2))|
184 ((dG>>5)<<(2)) |
185 ((dB>>6)<<(0)) ];
186 }
187 dst++;
188 src += srcbpp;
189 },
190 width);
191 src += srcskip;
192 dst += dstskip;
193 }
194}
195
196/* colorkeyed N->1 blending with per-surface alpha */
197static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
198{
199 int width = info->d_width;
200 int height = info->d_height;
201 Uint8 *src = info->s_pixels;
202 int srcskip = info->s_skip;
203 Uint8 *dst = info->d_pixels;
204 int dstskip = info->d_skip;
205 Uint8 *palmap = info->table;
206 SDL_PixelFormat *srcfmt = info->src;
207 SDL_PixelFormat *dstfmt = info->dst;
208 int srcbpp = srcfmt->BytesPerPixel;
209 Uint32 ckey = srcfmt->colorkey;
210
211 const int A = srcfmt->alpha;
212
213 while ( height-- ) {
214 DUFFS_LOOP(
215 {
216 Uint32 Pixel;
217 unsigned sR;
218 unsigned sG;
219 unsigned sB;
220 unsigned dR;
221 unsigned dG;
222 unsigned dB;
223 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
224 if ( Pixel != ckey ) {
225 dR = dstfmt->palette->colors[*dst].r;
226 dG = dstfmt->palette->colors[*dst].g;
227 dB = dstfmt->palette->colors[*dst].b;
228 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
229 dR &= 0xff;
230 dG &= 0xff;
231 dB &= 0xff;
232 /* Pack RGB into 8bit pixel */
233 if ( palmap == NULL ) {
234 *dst =((dR>>5)<<(3+2))|
235 ((dG>>5)<<(2)) |
236 ((dB>>6)<<(0));
237 } else {
238 *dst = palmap[((dR>>5)<<(3+2))|
239 ((dG>>5)<<(2)) |
240 ((dB>>6)<<(0)) ];
241 }
242 }
243 dst++;
244 src += srcbpp;
245 },
246 width);
247 src += srcskip;
248 dst += dstskip;
249 }
250}
251
252#if GCC_ASMBLIT
253/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
254static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
255{
256 int width = info->d_width;
257 int height = info->d_height;
258 Uint32 *srcp = (Uint32 *)info->s_pixels;
259 int srcskip = info->s_skip >> 2;
260 Uint32 *dstp = (Uint32 *)info->d_pixels;
261 int dstskip = info->d_skip >> 2;
262 Uint32 dalpha = info->dst->Amask;
263 Uint64 load;
264
265 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
266 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
267 load = 0x0001010100010101ULL;/* !alpha128 mask */
268 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
269 movd_m2r(dalpha, mm7); /* dst alpha mask */
270 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
271 while(height--) {
272 DUFFS_LOOP_DOUBLE2(
273 {
274 Uint32 s = *srcp++;
275 Uint32 d = *dstp;
276 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
277 + (s & d & 0x00010101)) | dalpha;
278 },{
279 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
280 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
281
282 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
283 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
284
285 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
286 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
287 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
288 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
289 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
290 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
291 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
292
293 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
294 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
295 dstp += 2;
296 srcp += 2;
297 }, width);
298 srcp += srcskip;
299 dstp += dstskip;
300 }
301 emms();
302}
303
304/* fast RGB888->(A)RGB888 blending with surface alpha */
305static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
306{
307 SDL_PixelFormat* df = info->dst;
308 unsigned alpha = info->src->alpha;
309
310 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
311 /* only call a128 version when R,G,B occupy lower bits */
312 BlitRGBtoRGBSurfaceAlpha128MMX(info);
313 } else {
314 int width = info->d_width;
315 int height = info->d_height;
316 Uint32 *srcp = (Uint32 *)info->s_pixels;
317 int srcskip = info->s_skip >> 2;
318 Uint32 *dstp = (Uint32 *)info->d_pixels;
319 int dstskip = info->d_skip >> 2;
320
321 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
322 /* form the alpha mult */
323 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
324 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
325 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
326 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
327 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
328 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
329 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
330 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
331 movd_m2r(df->Amask, mm7); /* dst alpha mask */
332 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
333
334 while(height--) {
335 DUFFS_LOOP_DOUBLE2({
336 /* One Pixel Blend */
337 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
338 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
339 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
340 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
341
342 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
343 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
344 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
345 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
346
347 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
348 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
349 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
350 ++srcp;
351 ++dstp;
352 },{
353 /* Two Pixels Blend */
354 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
355 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
356 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
357 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
358
359 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
360 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
361 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
362 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
363
364 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
365 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
 366 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
367 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
368
369 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
370 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
371 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
372 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
373
374 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
375 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
376
377 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
378
379 srcp += 2;
380 dstp += 2;
381 }, width);
382 srcp += srcskip;
383 dstp += dstskip;
384 }
385 emms();
386 }
387}
388
389/* fast ARGB888->(A)RGB888 blending with pixel alpha */
390static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
391{
392 int width = info->d_width;
393 int height = info->d_height;
394 Uint32 *srcp = (Uint32 *)info->s_pixels;
395 int srcskip = info->s_skip >> 2;
396 Uint32 *dstp = (Uint32 *)info->d_pixels;
397 int dstskip = info->d_skip >> 2;
398 SDL_PixelFormat* sf = info->src;
399 Uint32 amask = sf->Amask;
400
401 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
402 /* form multiplication mask */
403 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
404 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
405 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
406 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
407 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
408 /* form channel masks */
409 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
410 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
411 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
412 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
413 /* get alpha channel shift */
414 __asm__ __volatile__ (
415 "movd %0, %%mm5"
416 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
417
418 while(height--) {
419 DUFFS_LOOP4({
420 Uint32 alpha = *srcp & amask;
421 /* FIXME: Here we special-case opaque alpha since the
 422 compositing used (>>8 instead of /255) doesn't handle
423 it correctly. Also special-case alpha=0 for speed?
424 Benchmark this! */
425 if(alpha == 0) {
426 /* do nothing */
427 } else if(alpha == amask) {
428 /* opaque alpha -- copy RGB, keep dst alpha */
429 /* using MMX here to free up regular registers for other things */
430 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
431 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
432 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
433 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
434 por_r2r(mm1, mm2); /* src | dst -> mm2 */
435 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
436 } else {
437 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
438 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
439
440 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
441 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
442
443 __asm__ __volatile__ (
444 "movd %0, %%mm4"
445 : : "r" (alpha) ); /* 0000A000 -> mm4 */
446 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
447 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
448 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
449 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
450
451 /* blend */
452 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
453 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
454 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
455 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
456
457 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
458 movd_r2m(mm2, *dstp);/* mm2 -> dst */
459 }
460 ++srcp;
461 ++dstp;
462 }, width);
463 srcp += srcskip;
464 dstp += dstskip;
465 }
466 emms();
467}
468/* End GCC_ASMBLIT */
469
470#elif MSVC_ASMBLIT
471/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
472static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
473{
474 int width = info->d_width;
475 int height = info->d_height;
476 Uint32 *srcp = (Uint32 *)info->s_pixels;
477 int srcskip = info->s_skip >> 2;
478 Uint32 *dstp = (Uint32 *)info->d_pixels;
479 int dstskip = info->d_skip >> 2;
480 Uint32 dalpha = info->dst->Amask;
481
482 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
483
484 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
485 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
486 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
487
488 while (height--) {
489 int n = width;
490 if ( n & 1 ) {
491 Uint32 s = *srcp++;
492 Uint32 d = *dstp;
493 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
494 + (s & d & 0x00010101)) | dalpha;
495 n--;
496 }
497
498 for (n >>= 1; n > 0; --n) {
499 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
500 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
501
502 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
503 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
504
505 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
506 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
507 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
508 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
509
510 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
511 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
512 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
513 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
514
515 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
516 dstp += 2;
517 srcp += 2;
518 }
519
520 srcp += srcskip;
521 dstp += dstskip;
522 }
523 _mm_empty();
524}
525
526/* fast RGB888->(A)RGB888 blending with surface alpha */
527static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
528{
529 SDL_PixelFormat* df = info->dst;
530 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
531 unsigned alpha = info->src->alpha;
532
533 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
534 /* only call a128 version when R,G,B occupy lower bits */
535 BlitRGBtoRGBSurfaceAlpha128MMX(info);
536 } else {
537 int width = info->d_width;
538 int height = info->d_height;
539 Uint32 *srcp = (Uint32 *)info->s_pixels;
540 int srcskip = info->s_skip >> 2;
541 Uint32 *dstp = (Uint32 *)info->d_pixels;
542 int dstskip = info->d_skip >> 2;
543 Uint32 dalpha = df->Amask;
544 Uint32 amult;
545
546 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
547
548 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
549 /* form the alpha mult */
550 amult = alpha | (alpha << 8);
551 amult = amult | (amult << 16);
552 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
553 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
554 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
555 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
556 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
557
558 while (height--) {
559 int n = width;
560 if (n & 1) {
561 /* One Pixel Blend */
562 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
563 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
564
565 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
566 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
567
 568 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
569 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
570 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
571 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
572
573 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
574 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
575 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
576
577 ++srcp;
578 ++dstp;
579
580 n--;
581 }
582
583 for (n >>= 1; n > 0; --n) {
584 /* Two Pixels Blend */
585 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
586 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
587 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
588 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
589
590 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
591 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
592 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
593 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
594
595 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
596 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
597 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
598 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
599
600 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
601 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
602 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
603 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
604
605 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
606 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
607
608 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
609
610 srcp += 2;
611 dstp += 2;
612 }
613 srcp += srcskip;
614 dstp += dstskip;
615 }
616 _mm_empty();
617 }
618}
619
620/* fast ARGB888->(A)RGB888 blending with pixel alpha */
621static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
622{
623 int width = info->d_width;
624 int height = info->d_height;
625 Uint32 *srcp = (Uint32 *)info->s_pixels;
626 int srcskip = info->s_skip >> 2;
627 Uint32 *dstp = (Uint32 *)info->d_pixels;
628 int dstskip = info->d_skip >> 2;
629 SDL_PixelFormat* sf = info->src;
630 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
631 Uint32 amask = sf->Amask;
632 Uint32 ashift = sf->Ashift;
633 Uint64 multmask;
634
635 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
636
637 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
638 multmask = ~(0xFFFFi64 << (ashift * 2));
639 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
640
641 while(height--) {
642 DUFFS_LOOP4({
643 Uint32 alpha = *srcp & amask;
644 if (alpha == 0) {
645 /* do nothing */
646 } else if (alpha == amask) {
647 /* opaque alpha -- copy RGB, keep dst alpha */
648 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
649 } else {
650 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
651 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
652
653 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
654 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
655
656 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
657 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
658 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
659 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
660 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
661
662 /* blend */
663 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
664 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
665 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
666 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
667 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
668
669 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
670 }
671 ++srcp;
672 ++dstp;
673 }, width);
674 srcp += srcskip;
675 dstp += dstskip;
676 }
677 _mm_empty();
678}
679/* End MSVC_ASMBLIT */
680
681#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
682
683#if SDL_ALTIVEC_BLITTERS
684#if __MWERKS__
685#pragma altivec_model on
686#endif
687#if HAVE_ALTIVEC_H
688#include <altivec.h>
689#endif
690#include <assert.h>
691
692#if (defined(__MACOSX__) && (__GNUC__ < 4))
693 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
694 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
695 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
696 (vector unsigned short) ( a,b,c,d,e,f,g,h )
697#else
698 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
699 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
700 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
701 (vector unsigned short) { a,b,c,d,e,f,g,h }
702#endif
703
704#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
705#define VECPRINT(msg, v) do { \
706 vector unsigned int tmpvec = (vector unsigned int)(v); \
707 unsigned int *vp = (unsigned int *)&tmpvec; \
708 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
709} while (0)
710
 711/* the permutation vector that takes the high bytes out of all the appropriate shorts
712 (vector unsigned char)(
713 0x00, 0x10, 0x02, 0x12,
714 0x04, 0x14, 0x06, 0x16,
715 0x08, 0x18, 0x0A, 0x1A,
716 0x0C, 0x1C, 0x0E, 0x1E );
717*/
718#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
719#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
720#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
721#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
722 ? vec_lvsl(0, src) \
723 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
724
725
726#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
727 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
728 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
729 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
730 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
731 /* valpha2 is 255-alpha */ \
732 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
733 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
734 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
735 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
736 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
737 /* add source and dest */ \
738 vtemp1 = vec_add(vtemp1, vtemp3); \
739 vtemp2 = vec_add(vtemp2, vtemp4); \
740 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
741 vtemp1 = vec_add(vtemp1, v1_16); \
742 vtemp3 = vec_sr(vtemp1, v8_16); \
743 vtemp1 = vec_add(vtemp1, vtemp3); \
744 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
745 vtemp2 = vec_add(vtemp2, v1_16); \
746 vtemp4 = vec_sr(vtemp2, v8_16); \
747 vtemp2 = vec_add(vtemp2, vtemp4); \
748 /* (>>8) and get ARGBARGBARGBARGB */ \
749 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
750} while (0)
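
/*
 * The (x + 1) + ((x + 1) >> 8) step above, followed by the merge permute's
 * final >> 8, is the usual cheap substitute for dividing by 255: it maps 0
 * to 0 and 255*255 to 255 exactly and stays within a step of x/255 in
 * between, which is what lets the vector code blend eight channels per
 * multiply with no real division.  A scalar rendering of one channel
 * (illustration only, not used by the blitters):
 */
static Uint32 approx_div255_blend(Uint32 s, Uint32 d, Uint32 a)
{
	Uint32 x = s * a + d * (255 - a);   /* 0 .. 255*255 */
	x += 1;
	x += x >> 8;
	return x >> 8;                      /* close to (s*a + d*(255-a)) / 255 */
}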
751
752/* Calculate the permute vector used for 32->32 swizzling */
753static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
754 const SDL_PixelFormat *dstfmt)
755{
756 /*
757 * We have to assume that the bits that aren't used by other
 758 * colors are alpha, and it's one complete byte, since some formats
759 * leave alpha with a zero mask, but we should still swizzle the bits.
760 */
761 /* ARGB */
762 const static struct SDL_PixelFormat default_pixel_format = {
763 NULL, 0, 0,
764 0, 0, 0, 0,
765 16, 8, 0, 24,
766 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
767 0, 0};
768 if (!srcfmt) {
769 srcfmt = &default_pixel_format;
770 }
771 if (!dstfmt) {
772 dstfmt = &default_pixel_format;
773 }
774 const vector unsigned char plus = VECUINT8_LITERAL
775 ( 0x00, 0x00, 0x00, 0x00,
776 0x04, 0x04, 0x04, 0x04,
777 0x08, 0x08, 0x08, 0x08,
778 0x0C, 0x0C, 0x0C, 0x0C );
779 vector unsigned char vswiz;
780 vector unsigned int srcvec;
781#define RESHIFT(X) (3 - ((X) >> 3))
782 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
783 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
784 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
785 Uint32 amask;
786 /* Use zero for alpha if either surface doesn't have alpha */
787 if (dstfmt->Amask) {
788 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
789 } else {
790 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
791 }
792#undef RESHIFT
793 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
794 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
795 return(vswiz);
796}
797
798static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
799{
800 int height = info->d_height;
801 Uint8 *src = (Uint8 *)info->s_pixels;
802 int srcskip = info->s_skip;
803 Uint8 *dst = (Uint8 *)info->d_pixels;
804 int dstskip = info->d_skip;
805 SDL_PixelFormat *srcfmt = info->src;
806
807 vector unsigned char v0 = vec_splat_u8(0);
808 vector unsigned short v8_16 = vec_splat_u16(8);
809 vector unsigned short v1_16 = vec_splat_u16(1);
810 vector unsigned short v2_16 = vec_splat_u16(2);
811 vector unsigned short v3_16 = vec_splat_u16(3);
812 vector unsigned int v8_32 = vec_splat_u32(8);
813 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
814 vector unsigned short v3f = VECUINT16_LITERAL(
815 0x003f, 0x003f, 0x003f, 0x003f,
816 0x003f, 0x003f, 0x003f, 0x003f);
817 vector unsigned short vfc = VECUINT16_LITERAL(
818 0x00fc, 0x00fc, 0x00fc, 0x00fc,
819 0x00fc, 0x00fc, 0x00fc, 0x00fc);
820
821 /*
822 0x10 - 0x1f is the alpha
823 0x00 - 0x0e evens are the red
824 0x01 - 0x0f odds are zero
825 */
826 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
827 0x10, 0x00, 0x01, 0x01,
828 0x10, 0x02, 0x01, 0x01,
829 0x10, 0x04, 0x01, 0x01,
830 0x10, 0x06, 0x01, 0x01
831 );
832 vector unsigned char vredalpha2 = (vector unsigned char)(
833 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
834 );
835 /*
836 0x00 - 0x0f is ARxx ARxx ARxx ARxx
 837 0x11 - 0x1f odds are blue
838 */
839 vector unsigned char vblue1 = VECUINT8_LITERAL(
840 0x00, 0x01, 0x02, 0x11,
841 0x04, 0x05, 0x06, 0x13,
842 0x08, 0x09, 0x0a, 0x15,
843 0x0c, 0x0d, 0x0e, 0x17
844 );
845 vector unsigned char vblue2 = (vector unsigned char)(
846 vec_add((vector unsigned int)vblue1, v8_32)
847 );
848 /*
849 0x00 - 0x0f is ARxB ARxB ARxB ARxB
 850 0x10 - 0x1e evens are green
851 */
852 vector unsigned char vgreen1 = VECUINT8_LITERAL(
853 0x00, 0x01, 0x10, 0x03,
854 0x04, 0x05, 0x12, 0x07,
855 0x08, 0x09, 0x14, 0x0b,
856 0x0c, 0x0d, 0x16, 0x0f
857 );
858 vector unsigned char vgreen2 = (vector unsigned char)(
859 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
860 );
861 vector unsigned char vgmerge = VECUINT8_LITERAL(
862 0x00, 0x02, 0x00, 0x06,
863 0x00, 0x0a, 0x00, 0x0e,
864 0x00, 0x12, 0x00, 0x16,
865 0x00, 0x1a, 0x00, 0x1e);
866 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
867 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
868 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
869
870 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
871 vf800 = vec_sl(vf800, vec_splat_u16(8));
872
873 while(height--) {
874 int extrawidth;
875 vector unsigned char valigner;
876 vector unsigned char vsrc;
877 vector unsigned char voverflow;
878 int width = info->d_width;
879
880#define ONE_PIXEL_BLEND(condition, widthvar) \
881 while (condition) { \
882 Uint32 Pixel; \
883 unsigned sR, sG, sB, dR, dG, dB, sA; \
884 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
885 if(sA) { \
886 unsigned short dstpixel = *((unsigned short *)dst); \
887 dR = (dstpixel >> 8) & 0xf8; \
888 dG = (dstpixel >> 3) & 0xfc; \
889 dB = (dstpixel << 3) & 0xf8; \
890 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
891 *((unsigned short *)dst) = ( \
892 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
893 ); \
894 } \
895 src += 4; \
896 dst += 2; \
897 widthvar--; \
898 }
899 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
900 extrawidth = (width % 8);
901 valigner = VEC_ALIGNER(src);
902 vsrc = (vector unsigned char)vec_ld(0, src);
903 width -= extrawidth;
904 while (width) {
905 vector unsigned char valpha;
906 vector unsigned char vsrc1, vsrc2;
907 vector unsigned char vdst1, vdst2;
908 vector unsigned short vR, vG, vB;
909 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
910
911 /* Load 8 pixels from src as ARGB */
912 voverflow = (vector unsigned char)vec_ld(15, src);
913 vsrc = vec_perm(vsrc, voverflow, valigner);
914 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
915 src += 16;
916 vsrc = (vector unsigned char)vec_ld(15, src);
917 voverflow = vec_perm(voverflow, vsrc, valigner);
918 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
919 src += 16;
920
921 /* Load 8 pixels from dst as XRGB */
922 voverflow = vec_ld(0, dst);
923 vR = vec_and((vector unsigned short)voverflow, vf800);
924 vB = vec_sl((vector unsigned short)voverflow, v3_16);
925 vG = vec_sl(vB, v2_16);
926 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
927 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
928 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
929 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
930 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
931 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
932
933 /* Alpha blend 8 pixels as ARGB */
934 valpha = vec_perm(vsrc1, v0, valphaPermute);
935 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
936 valpha = vec_perm(vsrc2, v0, valphaPermute);
937 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
938
939 /* Convert 8 pixels to 565 */
940 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
941 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
942 vgpixel = vec_and(vgpixel, vfc);
943 vgpixel = vec_sl(vgpixel, v3_16);
944 vrpixel = vec_sl(vpixel, v1_16);
945 vrpixel = vec_and(vrpixel, vf800);
946 vbpixel = vec_and(vpixel, v3f);
947 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
948 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
949
950 /* Store 8 pixels */
951 vec_st(vdst1, 0, dst);
952
953 width -= 8;
954 dst += 16;
955 }
956 ONE_PIXEL_BLEND((extrawidth), extrawidth);
957#undef ONE_PIXEL_BLEND
958 src += srcskip;
959 dst += dstskip;
960 }
961}
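
/*
 * The scalar ONE_PIXEL_BLEND path above widens RGB565 to 8-bit channels by
 * shifting each field into the top bits of a byte (5-bit red/blue scale by
 * 8, 6-bit green by 4), blends, then packs the high bits back down.  The
 * same round trip in isolation, mirroring the macro's shifts and masks
 * (illustration only):
 */
static Uint16 repack565(Uint16 p)
{
	unsigned r = (p >> 8) & 0xf8;   /* 5 red bits -> top of a byte */
	unsigned g = (p >> 3) & 0xfc;   /* 6 green bits -> top of a byte */
	unsigned b = (p << 3) & 0xf8;   /* 5 blue bits -> top of a byte */
	return (Uint16)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}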
962
963static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
964{
965 unsigned alpha = info->src->alpha;
966 int height = info->d_height;
967 Uint32 *srcp = (Uint32 *)info->s_pixels;
968 int srcskip = info->s_skip >> 2;
969 Uint32 *dstp = (Uint32 *)info->d_pixels;
970 int dstskip = info->d_skip >> 2;
971 SDL_PixelFormat *srcfmt = info->src;
972 SDL_PixelFormat *dstfmt = info->dst;
973 unsigned sA = srcfmt->alpha;
974 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
975 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
976 Uint32 ckey = info->src->colorkey;
977 vector unsigned char mergePermute;
978 vector unsigned char vsrcPermute;
979 vector unsigned char vdstPermute;
980 vector unsigned char vsdstPermute;
981 vector unsigned char valpha;
982 vector unsigned char valphamask;
983 vector unsigned char vbits;
984 vector unsigned char v0;
985 vector unsigned short v1;
986 vector unsigned short v8;
987 vector unsigned int vckey;
988 vector unsigned int vrgbmask;
989
990 mergePermute = VEC_MERGE_PERMUTE();
991 v0 = vec_splat_u8(0);
992 v1 = vec_splat_u16(1);
993 v8 = vec_splat_u16(8);
994
995 /* set the alpha to 255 on the destination surf */
996 valphamask = VEC_ALPHA_MASK();
997
998 vsrcPermute = calc_swizzle32(srcfmt, NULL);
999 vdstPermute = calc_swizzle32(NULL, dstfmt);
1000 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1001
1002 /* set a vector full of alpha and 255-alpha */
1003 ((unsigned char *)&valpha)[0] = alpha;
1004 valpha = vec_splat(valpha, 0);
1005 vbits = (vector unsigned char)vec_splat_s8(-1);
1006
1007 ckey &= rgbmask;
1008 ((unsigned int *)(char*)&vckey)[0] = ckey;
1009 vckey = vec_splat(vckey, 0);
1010 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1011 vrgbmask = vec_splat(vrgbmask, 0);
1012
1013 while(height--) {
1014 int width = info->d_width;
1015#define ONE_PIXEL_BLEND(condition, widthvar) \
1016 while (condition) { \
1017 Uint32 Pixel; \
1018 unsigned sR, sG, sB, dR, dG, dB; \
1019 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1020 if(sA && Pixel != ckey) { \
1021 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1022 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1023 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1024 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1025 } \
1026 dstp++; \
1027 srcp++; \
1028 widthvar--; \
1029 }
1030 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1031 if (width > 0) {
1032 int extrawidth = (width % 4);
1033 vector unsigned char valigner = VEC_ALIGNER(srcp);
1034 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1035 width -= extrawidth;
1036 while (width) {
1037 vector unsigned char vsel;
1038 vector unsigned char voverflow;
1039 vector unsigned char vd;
1040 vector unsigned char vd_orig;
1041
1042 /* s = *srcp */
1043 voverflow = (vector unsigned char)vec_ld(15, srcp);
1044 vs = vec_perm(vs, voverflow, valigner);
1045
1046 /* vsel is set for items that match the key */
1047 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1048 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1049
1050 /* permute to source format */
1051 vs = vec_perm(vs, valpha, vsrcPermute);
1052
1053 /* d = *dstp */
1054 vd = (vector unsigned char)vec_ld(0, dstp);
1055 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1056
1057 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1058
1059 /* set the alpha channel to full on */
1060 vd = vec_or(vd, valphamask);
1061
1062 /* mask out color key */
1063 vd = vec_sel(vd, vd_orig, vsel);
1064
1065 /* permute to dest format */
1066 vd = vec_perm(vd, vbits, vdstPermute);
1067
1068 /* *dstp = res */
1069 vec_st((vector unsigned int)vd, 0, dstp);
1070
1071 srcp += 4;
1072 dstp += 4;
1073 width -= 4;
1074 vs = voverflow;
1075 }
1076 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1077 }
1078#undef ONE_PIXEL_BLEND
1079
1080 srcp += srcskip;
1081 dstp += dstskip;
1082 }
1083}
1084
1085
1086static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1087{
1088 int width = info->d_width;
1089 int height = info->d_height;
1090 Uint32 *srcp = (Uint32 *)info->s_pixels;
1091 int srcskip = info->s_skip >> 2;
1092 Uint32 *dstp = (Uint32 *)info->d_pixels;
1093 int dstskip = info->d_skip >> 2;
1094 SDL_PixelFormat *srcfmt = info->src;
1095 SDL_PixelFormat *dstfmt = info->dst;
1096 vector unsigned char mergePermute;
1097 vector unsigned char valphaPermute;
1098 vector unsigned char vsrcPermute;
1099 vector unsigned char vdstPermute;
1100 vector unsigned char vsdstPermute;
1101 vector unsigned char valphamask;
1102 vector unsigned char vpixelmask;
1103 vector unsigned char v0;
1104 vector unsigned short v1;
1105 vector unsigned short v8;
1106
1107 v0 = vec_splat_u8(0);
1108 v1 = vec_splat_u16(1);
1109 v8 = vec_splat_u16(8);
1110 mergePermute = VEC_MERGE_PERMUTE();
1111 valphamask = VEC_ALPHA_MASK();
1112 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1113 vpixelmask = vec_nor(valphamask, v0);
1114 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1115 vdstPermute = calc_swizzle32(NULL, dstfmt);
1116 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1117
1118 while ( height-- ) {
1119 width = info->d_width;
1120#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1121 Uint32 Pixel; \
1122 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1123 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1124 if(sA) { \
1125 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1126 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1127 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1128 } \
1129 ++srcp; \
1130 ++dstp; \
1131 widthvar--; \
1132 }
1133 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1134 if (width > 0) {
1135 /* vsrcPermute */
1136 /* vdstPermute */
1137 int extrawidth = (width % 4);
1138 vector unsigned char valigner = VEC_ALIGNER(srcp);
1139 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1140 width -= extrawidth;
1141 while (width) {
1142 vector unsigned char voverflow;
1143 vector unsigned char vd;
1144 vector unsigned char valpha;
1145 vector unsigned char vdstalpha;
1146 /* s = *srcp */
1147 voverflow = (vector unsigned char)vec_ld(15, srcp);
1148 vs = vec_perm(vs, voverflow, valigner);
1149 vs = vec_perm(vs, v0, vsrcPermute);
1150
1151 valpha = vec_perm(vs, v0, valphaPermute);
1152
1153 /* d = *dstp */
1154 vd = (vector unsigned char)vec_ld(0, dstp);
1155 vd = vec_perm(vd, v0, vsdstPermute);
1156 vdstalpha = vec_and(vd, valphamask);
1157
1158 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1159
1160 /* set the alpha to the dest alpha */
1161 vd = vec_and(vd, vpixelmask);
1162 vd = vec_or(vd, vdstalpha);
1163 vd = vec_perm(vd, v0, vdstPermute);
1164
1165 /* *dstp = res */
1166 vec_st((vector unsigned int)vd, 0, dstp);
1167
1168 srcp += 4;
1169 dstp += 4;
1170 width -= 4;
1171 vs = voverflow;
1172
1173 }
1174 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1175 }
1176 srcp += srcskip;
1177 dstp += dstskip;
1178#undef ONE_PIXEL_BLEND
1179 }
1180}
1181
1182/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1183static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1184{
1185 int width = info->d_width;
1186 int height = info->d_height;
1187 Uint32 *srcp = (Uint32 *)info->s_pixels;
1188 int srcskip = info->s_skip >> 2;
1189 Uint32 *dstp = (Uint32 *)info->d_pixels;
1190 int dstskip = info->d_skip >> 2;
1191 vector unsigned char mergePermute;
1192 vector unsigned char valphaPermute;
1193 vector unsigned char valphamask;
1194 vector unsigned char vpixelmask;
1195 vector unsigned char v0;
1196 vector unsigned short v1;
1197 vector unsigned short v8;
1198 v0 = vec_splat_u8(0);
1199 v1 = vec_splat_u16(1);
1200 v8 = vec_splat_u16(8);
1201 mergePermute = VEC_MERGE_PERMUTE();
1202 valphamask = VEC_ALPHA_MASK();
1203 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1204
1205
1206 vpixelmask = vec_nor(valphamask, v0);
1207 while(height--) {
1208 width = info->d_width;
1209#define ONE_PIXEL_BLEND(condition, widthvar) \
1210 while ((condition)) { \
1211 Uint32 dalpha; \
1212 Uint32 d; \
1213 Uint32 s1; \
1214 Uint32 d1; \
1215 Uint32 s = *srcp; \
1216 Uint32 alpha = s >> 24; \
1217 if(alpha) { \
1218 if(alpha == SDL_ALPHA_OPAQUE) { \
1219 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1220 } else { \
1221 d = *dstp; \
1222 dalpha = d & 0xff000000; \
1223 s1 = s & 0xff00ff; \
1224 d1 = d & 0xff00ff; \
1225 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1226 s &= 0xff00; \
1227 d &= 0xff00; \
1228 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1229 *dstp = d1 | d | dalpha; \
1230 } \
1231 } \
1232 ++srcp; \
1233 ++dstp; \
1234 widthvar--; \
1235 }
1236 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1237 if (width > 0) {
1238 int extrawidth = (width % 4);
1239 vector unsigned char valigner = VEC_ALIGNER(srcp);
1240 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1241 width -= extrawidth;
1242 while (width) {
1243 vector unsigned char voverflow;
1244 vector unsigned char vd;
1245 vector unsigned char valpha;
1246 vector unsigned char vdstalpha;
1247 /* s = *srcp */
1248 voverflow = (vector unsigned char)vec_ld(15, srcp);
1249 vs = vec_perm(vs, voverflow, valigner);
1250
1251 valpha = vec_perm(vs, v0, valphaPermute);
1252
1253 /* d = *dstp */
1254 vd = (vector unsigned char)vec_ld(0, dstp);
1255 vdstalpha = vec_and(vd, valphamask);
1256
1257 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1258
1259 /* set the alpha to the dest alpha */
1260 vd = vec_and(vd, vpixelmask);
1261 vd = vec_or(vd, vdstalpha);
1262
1263 /* *dstp = res */
1264 vec_st((vector unsigned int)vd, 0, dstp);
1265
1266 srcp += 4;
1267 dstp += 4;
1268 width -= 4;
1269 vs = voverflow;
1270 }
1271 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1272 }
1273 srcp += srcskip;
1274 dstp += dstskip;
1275 }
1276#undef ONE_PIXEL_BLEND
1277}
1278
1279static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1280{
1281 /* XXX : 6 */
1282 unsigned alpha = info->src->alpha;
1283 int height = info->d_height;
1284 Uint32 *srcp = (Uint32 *)info->s_pixels;
1285 int srcskip = info->s_skip >> 2;
1286 Uint32 *dstp = (Uint32 *)info->d_pixels;
1287 int dstskip = info->d_skip >> 2;
1288 SDL_PixelFormat *srcfmt = info->src;
1289 SDL_PixelFormat *dstfmt = info->dst;
1290 unsigned sA = srcfmt->alpha;
1291 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1292 vector unsigned char mergePermute;
1293 vector unsigned char vsrcPermute;
1294 vector unsigned char vdstPermute;
1295 vector unsigned char vsdstPermute;
1296 vector unsigned char valpha;
1297 vector unsigned char valphamask;
1298 vector unsigned char vbits;
1299 vector unsigned short v1;
1300 vector unsigned short v8;
1301
1302 mergePermute = VEC_MERGE_PERMUTE();
1303 v1 = vec_splat_u16(1);
1304 v8 = vec_splat_u16(8);
1305
1306 /* set the alpha to 255 on the destination surf */
1307 valphamask = VEC_ALPHA_MASK();
1308
1309 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1310 vdstPermute = calc_swizzle32(NULL, dstfmt);
1311 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1312
1313 /* set a vector full of alpha and 255-alpha */
1314 ((unsigned char *)&valpha)[0] = alpha;
1315 valpha = vec_splat(valpha, 0);
1316 vbits = (vector unsigned char)vec_splat_s8(-1);
1317
1318 while(height--) {
1319 int width = info->d_width;
1320#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1321 Uint32 Pixel; \
1322 unsigned sR, sG, sB, dR, dG, dB; \
1323 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1324 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1325 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1326 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1327 ++srcp; \
1328 ++dstp; \
1329 widthvar--; \
1330 }
1331 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1332 if (width > 0) {
1333 int extrawidth = (width % 4);
1334 vector unsigned char valigner = VEC_ALIGNER(srcp);
1335 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1336 width -= extrawidth;
1337 while (width) {
1338 vector unsigned char voverflow;
1339 vector unsigned char vd;
1340
1341 /* s = *srcp */
1342 voverflow = (vector unsigned char)vec_ld(15, srcp);
1343 vs = vec_perm(vs, voverflow, valigner);
1344 vs = vec_perm(vs, valpha, vsrcPermute);
1345
1346 /* d = *dstp */
1347 vd = (vector unsigned char)vec_ld(0, dstp);
1348 vd = vec_perm(vd, vd, vsdstPermute);
1349
1350 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1351
1352 /* set the alpha channel to full on */
1353 vd = vec_or(vd, valphamask);
1354 vd = vec_perm(vd, vbits, vdstPermute);
1355
1356 /* *dstp = res */
1357 vec_st((vector unsigned int)vd, 0, dstp);
1358
1359 srcp += 4;
1360 dstp += 4;
1361 width -= 4;
1362 vs = voverflow;
1363 }
1364 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1365 }
1366#undef ONE_PIXEL_BLEND
1367
1368 srcp += srcskip;
1369 dstp += dstskip;
1370 }
1371
1372}
1373
1374
1375/* fast RGB888->(A)RGB888 blending */
1376static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1377{
1378 unsigned alpha = info->src->alpha;
1379 int height = info->d_height;
1380 Uint32 *srcp = (Uint32 *)info->s_pixels;
1381 int srcskip = info->s_skip >> 2;
1382 Uint32 *dstp = (Uint32 *)info->d_pixels;
1383 int dstskip = info->d_skip >> 2;
1384 vector unsigned char mergePermute;
1385 vector unsigned char valpha;
1386 vector unsigned char valphamask;
1387 vector unsigned short v1;
1388 vector unsigned short v8;
1389
1390 mergePermute = VEC_MERGE_PERMUTE();
1391 v1 = vec_splat_u16(1);
1392 v8 = vec_splat_u16(8);
1393
1394 /* set the alpha to 255 on the destination surf */
1395 valphamask = VEC_ALPHA_MASK();
1396
1397 /* set a vector full of alpha and 255-alpha */
1398 ((unsigned char *)&valpha)[0] = alpha;
1399 valpha = vec_splat(valpha, 0);
1400
1401 while(height--) {
1402 int width = info->d_width;
1403#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1404 Uint32 s = *srcp; \
1405 Uint32 d = *dstp; \
1406 Uint32 s1 = s & 0xff00ff; \
1407 Uint32 d1 = d & 0xff00ff; \
1408 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1409 & 0xff00ff; \
1410 s &= 0xff00; \
1411 d &= 0xff00; \
1412 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1413 *dstp = d1 | d | 0xff000000; \
1414 ++srcp; \
1415 ++dstp; \
1416 widthvar--; \
1417 }
1418 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1419 if (width > 0) {
1420 int extrawidth = (width % 4);
1421 vector unsigned char valigner = VEC_ALIGNER(srcp);
1422 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1423 width -= extrawidth;
1424 while (width) {
1425 vector unsigned char voverflow;
1426 vector unsigned char vd;
1427
1428 /* s = *srcp */
1429 voverflow = (vector unsigned char)vec_ld(15, srcp);
1430 vs = vec_perm(vs, voverflow, valigner);
1431
1432 /* d = *dstp */
1433 vd = (vector unsigned char)vec_ld(0, dstp);
1434
1435 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1436
1437 /* set the alpha channel to full on */
1438 vd = vec_or(vd, valphamask);
1439
1440 /* *dstp = res */
1441 vec_st((vector unsigned int)vd, 0, dstp);
1442
1443 srcp += 4;
1444 dstp += 4;
1445 width -= 4;
1446 vs = voverflow;
1447 }
1448 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1449 }
1450#undef ONE_PIXEL_BLEND
1451
1452 srcp += srcskip;
1453 dstp += dstskip;
1454 }
1455}
1456#if __MWERKS__
1457#pragma altivec_model off
1458#endif
1459#endif /* SDL_ALTIVEC_BLITTERS */
1460
1461/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1462static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1463{
1464 int width = info->d_width;
1465 int height = info->d_height;
1466 Uint32 *srcp = (Uint32 *)info->s_pixels;
1467 int srcskip = info->s_skip >> 2;
1468 Uint32 *dstp = (Uint32 *)info->d_pixels;
1469 int dstskip = info->d_skip >> 2;
1470
1471 while(height--) {
1472 DUFFS_LOOP4({
1473 Uint32 s = *srcp++;
1474 Uint32 d = *dstp;
1475 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1476 + (s & d & 0x00010101)) | 0xff000000;
1477 }, width);
1478 srcp += srcskip;
1479 dstp += dstskip;
1480 }
1481}
1482
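/*
 * A note on the alpha == 128 trick above: masking with 0x00fefefe clears the
 * low bit of each channel so the two pixels can be added without carries
 * spilling between channels, and the (s & d & 0x00010101) term adds back the
 * rounding bit that was masked off whenever both low bits were set.  Per
 * channel the result is exactly (s + d) / 2.  A scalar check of that
 * identity (illustration only, not used by any blitter):
 */
static int avg128_identity_holds(Uint8 s, Uint8 d)
{
	unsigned fast = (((s & 0xfe) + (d & 0xfe)) >> 1) + (s & d & 0x01);
	return fast == ((unsigned)(s + d) >> 1);   /* always 1 */
}
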
1483/* fast RGB888->(A)RGB888 blending with surface alpha */
1484static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1485{
1486 unsigned alpha = info->src->alpha;
1487 if(alpha == 128) {
1488 BlitRGBtoRGBSurfaceAlpha128(info);
1489 } else {
1490 int width = info->d_width;
1491 int height = info->d_height;
1492 Uint32 *srcp = (Uint32 *)info->s_pixels;
1493 int srcskip = info->s_skip >> 2;
1494 Uint32 *dstp = (Uint32 *)info->d_pixels;
1495 int dstskip = info->d_skip >> 2;
1496 Uint32 s;
1497 Uint32 d;
1498 Uint32 s1;
1499 Uint32 d1;
1500
1501 while(height--) {
1502 DUFFS_LOOP_DOUBLE2({
1503 /* One Pixel Blend */
1504 s = *srcp;
1505 d = *dstp;
1506 s1 = s & 0xff00ff;
1507 d1 = d & 0xff00ff;
1508 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1509 & 0xff00ff;
1510 s &= 0xff00;
1511 d &= 0xff00;
1512 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1513 *dstp = d1 | d | 0xff000000;
1514 ++srcp;
1515 ++dstp;
1516 },{
1517 /* Two Pixels Blend */
1518 s = *srcp;
1519 d = *dstp;
1520 s1 = s & 0xff00ff;
1521 d1 = d & 0xff00ff;
1522 d1 += (s1 - d1) * alpha >> 8;
1523 d1 &= 0xff00ff;
1524
1525 s = ((s & 0xff00) >> 8) |
1526 ((srcp[1] & 0xff00) << 8);
1527 d = ((d & 0xff00) >> 8) |
1528 ((dstp[1] & 0xff00) << 8);
1529 d += (s - d) * alpha >> 8;
1530 d &= 0x00ff00ff;
1531
1532 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1533 ++srcp;
1534
1535 s1 = *srcp;
1536 d1 = *dstp;
1537 s1 &= 0xff00ff;
1538 d1 &= 0xff00ff;
1539 d1 += (s1 - d1) * alpha >> 8;
1540 d1 &= 0xff00ff;
1541
1542 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1543 ++srcp;
1544 ++dstp;
1545 }, width);
1546 srcp += srcskip;
1547 dstp += dstskip;
1548 }
1549 }
1550}
1551
1552/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1553static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1554{
1555 int width = info->d_width;
1556 int height = info->d_height;
1557 Uint32 *srcp = (Uint32 *)info->s_pixels;
1558 int srcskip = info->s_skip >> 2;
1559 Uint32 *dstp = (Uint32 *)info->d_pixels;
1560 int dstskip = info->d_skip >> 2;
1561
1562 while(height--) {
1563 DUFFS_LOOP4({
1564 Uint32 dalpha;
1565 Uint32 d;
1566 Uint32 s1;
1567 Uint32 d1;
1568 Uint32 s = *srcp;
1569 Uint32 alpha = s >> 24;
1570 /* FIXME: Here we special-case opaque alpha since the
 1571 compositing used (>>8 instead of /255) doesn't handle
1572 it correctly. Also special-case alpha=0 for speed?
1573 Benchmark this! */
1574 if(alpha) {
1575 if(alpha == SDL_ALPHA_OPAQUE) {
1576 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1577 } else {
1578 /*
1579 * take out the middle component (green), and process
1580 * the other two in parallel. One multiply less.
1581 */
1582 d = *dstp;
1583 dalpha = d & 0xff000000;
1584 s1 = s & 0xff00ff;
1585 d1 = d & 0xff00ff;
1586 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1587 s &= 0xff00;
1588 d &= 0xff00;
1589 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1590 *dstp = d1 | d | dalpha;
1591 }
1592 }
1593 ++srcp;
1594 ++dstp;
1595 }, width);
1596 srcp += srcskip;
1597 dstp += dstskip;
1598 }
1599}
1600
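/*
 * Why the SDL_ALPHA_OPAQUE special case above is needed: these blitters
 * scale by alpha with >> 8 rather than a true division by 255, so even
 * alpha == 255 would leave most channels one short of the source value.
 * A per-channel comparison of the two formulas (illustration only, shown
 * for s >= d; the blitters themselves rely on unsigned wrap-around plus
 * masking instead of signed shifts):
 */
static int alpha_scale_error(int s, int d, int a)
{
	int exact  = d + ((s - d) * a) / 255;   /* true blend */
	int approx = d + (((s - d) * a) >> 8);  /* what the >>8 blitters compute */
	/* e.g. s = 255, d = 0, a = 255: exact = 255, approx = 254 */
	return exact - approx;
}
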
1601#if GCC_ASMBLIT
1602/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1603static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1604{
1605 int width = info->d_width;
1606 int height = info->d_height;
1607 Uint32 *srcp = (Uint32 *)info->s_pixels;
1608 int srcskip = info->s_skip >> 2;
1609 Uint32 *dstp = (Uint32 *)info->d_pixels;
1610 int dstskip = info->d_skip >> 2;
1611 SDL_PixelFormat* sf = info->src;
1612 Uint32 amask = sf->Amask;
1613
1614 __asm__ (
1615 /* make mm6 all zeros. */
1616 "pxor %%mm6, %%mm6\n"
1617
1618 /* Make a mask to preserve the alpha. */
1619 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1620 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1621 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1622 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1623 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1624
1625 /* form channel masks */
1626 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1627 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1628 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1629 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1630
1631 /* get alpha channel shift */
1632 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1633
1634 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1635
1636 while(height--) {
1637
1638 DUFFS_LOOP4({
1639 Uint32 alpha;
1640
1641 __asm__ (
1642 "prefetch 64(%0)\n"
1643 "prefetch 64(%1)\n"
1644 : : "r" (srcp), "r" (dstp) );
1645
1646 alpha = *srcp & amask;
1647 /* FIXME: Here we special-case opaque alpha since the
 1648 compositing used (>>8 instead of /255) doesn't handle
1649 it correctly. Also special-case alpha=0 for speed?
1650 Benchmark this! */
1651 if(alpha == 0) {
1652 /* do nothing */
1653 }
1654 else if(alpha == amask) {
1655 /* opaque alpha -- copy RGB, keep dst alpha */
1656 /* using MMX here to free up regular registers for other things */
1657 __asm__ (
1658 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1659 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1660 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1661 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1662 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1663 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1664
1665 : : "r" (srcp), "r" (dstp) );
1666 }
1667
1668 else {
1669 __asm__ (
1670 /* load in the source, and dst. */
1671 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1672 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1673
1674 /* Move the src alpha into mm2 */
1675
1676 /* if supporting pshufw */
1677 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1678 /*"psrlw $8, %%mm2\n" */
1679
1680 /* else: */
1681 "movd %2, %%mm2\n"
1682 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1683 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1684 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1685 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1686
1687 /* move the colors into words. */
1688 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1689 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1690
1691 /* src - dst */
1692 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1693
1694 /* A * (src-dst) */
1695 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1696 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1697 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1698
1699 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1700
1701 "movd %%mm0, (%1)\n" /* result in mm0 */
1702
1703 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1704
1705 }
1706 ++srcp;
1707 ++dstp;
1708 }, width);
1709 srcp += srcskip;
1710 dstp += dstskip;
1711 }
1712
1713 __asm__ (
1714 "emms\n"
1715 : );
1716}
1717/* End GCC_ASMBLIT*/
1718
1719#elif MSVC_ASMBLIT
1720/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1721static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1722{
1723 int width = info->d_width;
1724 int height = info->d_height;
1725 Uint32 *srcp = (Uint32 *)info->s_pixels;
1726 int srcskip = info->s_skip >> 2;
1727 Uint32 *dstp = (Uint32 *)info->d_pixels;
1728 int dstskip = info->d_skip >> 2;
1729 SDL_PixelFormat* sf = info->src;
1730 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1731 Uint32 amask = sf->Amask;
1732 Uint32 ashift = sf->Ashift;
1733 Uint64 multmask;
1734
1735 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1736
1737 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1738 multmask = ~(0xFFFFi64 << (ashift * 2));
1739 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1740
1741 while(height--) {
1742 DUFFS_LOOP4({
1743 Uint32 alpha;
1744
1745 _m_prefetch(srcp + 16);
1746 _m_prefetch(dstp + 16);
1747
1748 alpha = *srcp & amask;
1749 if (alpha == 0) {
1750 /* do nothing */
1751 } else if (alpha == amask) {
1752 /* copy RGB, keep dst alpha */
1753 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1754 } else {
1755 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1756 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1757
1758 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1759 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1760
1761 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1762 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1763 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1764 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1765 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1766
1767 /* blend */
1768 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1769 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1770 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1771 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1772 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1773
1774 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1775 }
1776 ++srcp;
1777 ++dstp;
1778 }, width);
1779 srcp += srcskip;
1780 dstp += dstskip;
1781 }
1782 _mm_empty();
1783}
1784/* End MSVC_ASMBLIT */
1785
1786#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
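/*
 * A plain-C sketch of what the general branch of the two MMX per-pixel
 * blitters above computes for one pixel, assuming the Amask == 0xff000000
 * layout they target (illustrative only, not used by the code).  Each
 * 8-bit channel is widened to a 16-bit lane -- which is why the
 * destination-alpha mask is built with  ashift * 2  -- the alpha lane of
 * the multiplier is masked off (the pand with %%mm7, resp. dmask) so the
 * destination alpha passes through unchanged, and the blend itself is the
 * >>8 approximation of /255 mentioned in the FIXME comments:
 *
 *	static Uint32 blend_argb_ref(Uint32 s, Uint32 d, int a)
 *	{
 *		Uint32 out = d & 0xff000000;
 *		int i;
 *		for (i = 0; i < 24; i += 8) {
 *			int sc = (int)((s >> i) & 0xff);
 *			int dc = (int)((d >> i) & 0xff);
 *			out |= (Uint32)((dc + (((sc - dc) * a) >> 8)) & 0xff) << i;
 *		}
 *		return out;
 *	}
 *
 * Here a is the already-extracted 8-bit source alpha; the real blitters
 * also special-case a == 0 (skip the pixel) and a == 255 (copy RGB and
 * keep the destination alpha), as the code above shows.
 */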
1787
1788/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1789
1790/* blend a single 16 bit pixel at 50% */
1791#define BLEND16_50(d, s, mask) \
1792 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1793
1794/* blend two 16 bit pixels at 50% */
1795#define BLEND2x16_50(d, s, mask) \
1796 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1797 + (s & d & (~(mask | mask << 16))))
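/*
 * Illustrative only: with the RGB565 mask 0xf7de that the callers below
 * pass in, BLEND16_50 expands to
 *
 *	(((s & 0xf7de) + (d & 0xf7de)) >> 1) + (s & d & 0x0821)
 *
 * Clearing the lowest bit of each 5/6/5 field (0x0821 = ~0xf7de) makes
 * every field value even, so the packed sum can be halved with a single
 * shift without any field overflowing into its neighbour; the final term
 * adds the dropped half back whenever both pixels had that low bit set.
 * BLEND2x16_50 is the same trick applied to a 32-bit word holding two
 * 16bpp pixels, hence the widened mask  (mask | mask << 16).
 */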
1798
1799static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1800{
1801 int width = info->d_width;
1802 int height = info->d_height;
1803 Uint16 *srcp = (Uint16 *)info->s_pixels;
1804 int srcskip = info->s_skip >> 1;
1805 Uint16 *dstp = (Uint16 *)info->d_pixels;
1806 int dstskip = info->d_skip >> 1;
1807
1808 while(height--) {
1809 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1810 /*
1811 * Source and destination not aligned, pipeline it.
1812 * This is mostly a win for big blits but no loss for
1813 * small ones
1814 */
1815 Uint32 prev_sw;
1816 int w = width;
1817
1818 /* handle odd destination */
1819 if((uintptr_t)dstp & 2) {
1820 Uint16 d = *dstp, s = *srcp;
1821 *dstp = BLEND16_50(d, s, mask);
1822 dstp++;
1823 srcp++;
1824 w--;
1825 }
1826 srcp++; /* srcp is now 32-bit aligned */
1827
1828 /* bootstrap pipeline with first halfword */
1829 prev_sw = ((Uint32 *)srcp)[-1];
1830
1831 while(w > 1) {
1832 Uint32 sw, dw, s;
1833 sw = *(Uint32 *)srcp;
1834 dw = *(Uint32 *)dstp;
1835#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1836 s = (prev_sw << 16) + (sw >> 16);
1837#else
1838 s = (prev_sw >> 16) + (sw << 16);
1839#endif
1840 prev_sw = sw;
1841 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1842 dstp += 2;
1843 srcp += 2;
1844 w -= 2;
1845 }
1846
1847 /* final pixel if any */
1848 if(w) {
1849 Uint16 d = *dstp, s;
1850#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1851 s = (Uint16)prev_sw;
1852#else
1853 s = (Uint16)(prev_sw >> 16);
1854#endif
1855 *dstp = BLEND16_50(d, s, mask);
1856 srcp++;
1857 dstp++;
1858 }
1859 srcp += srcskip - 1;
1860 dstp += dstskip;
1861 } else {
1862 /* source and destination are aligned */
1863 int w = width;
1864
1865 /* first odd pixel? */
1866 if((uintptr_t)srcp & 2) {
1867 Uint16 d = *dstp, s = *srcp;
1868 *dstp = BLEND16_50(d, s, mask);
1869 srcp++;
1870 dstp++;
1871 w--;
1872 }
1873 /* srcp and dstp are now 32-bit aligned */
1874
1875 while(w > 1) {
1876 Uint32 sw = *(Uint32 *)srcp;
1877 Uint32 dw = *(Uint32 *)dstp;
1878 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1879 srcp += 2;
1880 dstp += 2;
1881 w -= 2;
1882 }
1883
1884 /* last odd pixel? */
1885 if(w) {
1886 Uint16 d = *dstp, s = *srcp;
1887 *dstp = BLEND16_50(d, s, mask);
1888 srcp++;
1889 dstp++;
1890 }
1891 srcp += srcskip;
1892 dstp += dstskip;
1893 }
1894 }
1895}
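/*
 * A sketch of the unaligned path above, for the little-endian branch
 * (the big-endian branch mirrors it): when src and dst differ in
 * alignment by two bytes, the two source pixels that line up with each
 * aligned destination word straddle two consecutive aligned source
 * words.  Keeping the previous source word in prev_sw lets the loop
 * assemble the matching pair with
 *
 *	s = (prev_sw >> 16) + (sw << 16);
 *
 * i.e. the low halfword comes from the old source word and the high
 * halfword from the new one.  The extra srcp++ before the loop is what
 * puts srcp on a 32-bit boundary so those reads stay aligned, and the
 * "srcskip - 1" at the end of the row compensates for it.
 */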
1896
1897#if GCC_ASMBLIT
1898/* fast RGB565->RGB565 blending with surface alpha */
1899static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1900{
 1901 unsigned alpha = info->src->alpha;
1902 if(alpha == 128) {
1903 Blit16to16SurfaceAlpha128(info, 0xf7de);
1904 } else {
1905 int width = info->d_width;
1906 int height = info->d_height;
1907 Uint16 *srcp = (Uint16 *)info->s_pixels;
1908 int srcskip = info->s_skip >> 1;
1909 Uint16 *dstp = (Uint16 *)info->d_pixels;
1910 int dstskip = info->d_skip >> 1;
1911 Uint32 s, d;
1912 Uint64 load;
1913
 1914 alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path matches the 5-bit scalar blend exactly */
1915 load = alpha;
1916 alpha >>= 3; /* downscale alpha to 5 bits */
1917
1918 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1919 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1920 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
 1921 /* position alpha to allow for mullo and mulhi on different channels
1922 to reduce the number of operations */
1923 psllq_i2r(3, mm0);
1924
1925 /* Setup the 565 color channel masks */
1926 load = 0x07E007E007E007E0ULL;
1927 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1928 load = 0x001F001F001F001FULL;
1929 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1930 while(height--) {
1931 DUFFS_LOOP_QUATRO2(
1932 {
1933 s = *srcp++;
1934 d = *dstp;
1935 /*
1936 * shift out the middle component (green) to
1937 * the high 16 bits, and process all three RGB
1938 * components at the same time.
1939 */
1940 s = (s | s << 16) & 0x07e0f81f;
1941 d = (d | d << 16) & 0x07e0f81f;
1942 d += (s - d) * alpha >> 5;
1943 d &= 0x07e0f81f;
1944 *dstp++ = d | d >> 16;
1945 },{
1946 s = *srcp++;
1947 d = *dstp;
1948 /*
1949 * shift out the middle component (green) to
1950 * the high 16 bits, and process all three RGB
1951 * components at the same time.
1952 */
1953 s = (s | s << 16) & 0x07e0f81f;
1954 d = (d | d << 16) & 0x07e0f81f;
1955 d += (s - d) * alpha >> 5;
1956 d &= 0x07e0f81f;
1957 *dstp++ = d | d >> 16;
1958 s = *srcp++;
1959 d = *dstp;
1960 /*
1961 * shift out the middle component (green) to
1962 * the high 16 bits, and process all three RGB
1963 * components at the same time.
1964 */
1965 s = (s | s << 16) & 0x07e0f81f;
1966 d = (d | d << 16) & 0x07e0f81f;
1967 d += (s - d) * alpha >> 5;
1968 d &= 0x07e0f81f;
1969 *dstp++ = d | d >> 16;
1970 },{
1971 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1972 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1973
1974 /* red -- does not need a mask since the right shift clears
1975 the uninteresting bits */
1976 movq_r2r(mm2, mm5); /* src -> mm5 */
1977 movq_r2r(mm3, mm6); /* dst -> mm6 */
1978 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1979 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1980
1981 /* blend */
1982 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1983 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1984 /* alpha used is actually 11 bits
1985 11 + 5 = 16 bits, so the sign bits are lost */
1986 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1987 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1988 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1989
1990 movq_r2r(mm6, mm1); /* save new reds in dsts */
1991
1992 /* green -- process the bits in place */
1993 movq_r2r(mm2, mm5); /* src -> mm5 */
1994 movq_r2r(mm3, mm6); /* dst -> mm6 */
1995 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1996 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1997
1998 /* blend */
1999 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2000 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2001 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2002 bits are gone and the sign bits present */
2003 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2004 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2005
2006 por_r2r(mm6, mm1); /* save new greens in dsts */
2007
2008 /* blue */
2009 movq_r2r(mm2, mm5); /* src -> mm5 */
2010 movq_r2r(mm3, mm6); /* dst -> mm6 */
2011 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2012 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2013
2014 /* blend */
2015 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2016 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2017 /* 11 + 5 = 16 bits, so the sign bits are lost and
2018 the interesting bits will need to be MASKed */
2019 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2020 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2021 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2022
2023 por_r2r(mm6, mm1); /* save new blues in dsts */
2024
2025 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2026
2027 srcp += 4;
2028 dstp += 4;
2029 }, width);
2030 srcp += srcskip;
2031 dstp += dstskip;
2032 }
2033 emms();
2034 }
2035}
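/*
 * A note on the alpha positioning above (illustrative): after
 * "alpha &= ~7" each 16-bit lane of mm0 holds the 8-bit alpha, and the
 * psllq by 3 turns that into an 11-bit value (alpha << 3).  Red is
 * shifted down to a plain 5-bit value and blue already sits at bits
 * 0-4, so for those channels pmullw gives an 11 + 5 = 16-bit product
 * and a shift right by 11 recovers the scaled difference.  Green stays
 * in place at bits 5-10, so pmulhw is used instead: 11 + 11 - 16 = 6
 * significant bits land in the low bits of the lane and a shift left
 * by 5 returns them to the green position.  Splitting the work between
 * pmullw and pmulhw this way is what the "position alpha" comment
 * refers to.
 */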
2036
2037/* fast RGB555->RGB555 blending with surface alpha */
2038static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2039{
 2040 unsigned alpha = info->src->alpha;
2041 if(alpha == 128) {
2042 Blit16to16SurfaceAlpha128(info, 0xfbde);
2043 } else {
2044 int width = info->d_width;
2045 int height = info->d_height;
2046 Uint16 *srcp = (Uint16 *)info->s_pixels;
2047 int srcskip = info->s_skip >> 1;
2048 Uint16 *dstp = (Uint16 *)info->d_pixels;
2049 int dstskip = info->d_skip >> 1;
2050 Uint32 s, d;
2051 Uint64 load;
2052
 2053 alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path matches the 5-bit scalar blend exactly */
2054 load = alpha;
2055 alpha >>= 3; /* downscale alpha to 5 bits */
2056
2057 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2058 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2059 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
 2060 /* position alpha to allow for mullo and mulhi on different channels
2061 to reduce the number of operations */
2062 psllq_i2r(3, mm0);
2063
2064 /* Setup the 555 color channel masks */
2065 load = 0x03E003E003E003E0ULL;
2066 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2067 load = 0x001F001F001F001FULL;
2068 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2069 while(height--) {
2070 DUFFS_LOOP_QUATRO2(
2071 {
2072 s = *srcp++;
2073 d = *dstp;
2074 /*
2075 * shift out the middle component (green) to
2076 * the high 16 bits, and process all three RGB
2077 * components at the same time.
2078 */
2079 s = (s | s << 16) & 0x03e07c1f;
2080 d = (d | d << 16) & 0x03e07c1f;
2081 d += (s - d) * alpha >> 5;
2082 d &= 0x03e07c1f;
2083 *dstp++ = d | d >> 16;
2084 },{
2085 s = *srcp++;
2086 d = *dstp;
2087 /*
2088 * shift out the middle component (green) to
2089 * the high 16 bits, and process all three RGB
2090 * components at the same time.
2091 */
2092 s = (s | s << 16) & 0x03e07c1f;
2093 d = (d | d << 16) & 0x03e07c1f;
2094 d += (s - d) * alpha >> 5;
2095 d &= 0x03e07c1f;
2096 *dstp++ = d | d >> 16;
2097 s = *srcp++;
2098 d = *dstp;
2099 /*
2100 * shift out the middle component (green) to
2101 * the high 16 bits, and process all three RGB
2102 * components at the same time.
2103 */
2104 s = (s | s << 16) & 0x03e07c1f;
2105 d = (d | d << 16) & 0x03e07c1f;
2106 d += (s - d) * alpha >> 5;
2107 d &= 0x03e07c1f;
2108 *dstp++ = d | d >> 16;
2109 },{
2110 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2111 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2112
2113 /* red -- process the bits in place */
2114 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2115 /* by reusing the GREEN mask we free up another mmx
2116 register to accumulate the result */
2117
2118 movq_r2r(mm2, mm5); /* src -> mm5 */
2119 movq_r2r(mm3, mm6); /* dst -> mm6 */
2120 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2121 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2122
2123 /* blend */
2124 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2125 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2126 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2127 cleared by a MASK below */
2128 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2129 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2130 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2131
2132 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2133
2134 movq_r2r(mm6, mm1); /* save new reds in dsts */
2135
2136 /* green -- process the bits in place */
2137 movq_r2r(mm2, mm5); /* src -> mm5 */
2138 movq_r2r(mm3, mm6); /* dst -> mm6 */
2139 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2140 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2141
2142 /* blend */
2143 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2144 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2145 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2146 bits are gone and the sign bits present */
2147 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2148 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2149
2150 por_r2r(mm6, mm1); /* save new greens in dsts */
2151
2152 /* blue */
2153 movq_r2r(mm2, mm5); /* src -> mm5 */
2154 movq_r2r(mm3, mm6); /* dst -> mm6 */
2155 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2156 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2157
2158 /* blend */
2159 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2160 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2161 /* 11 + 5 = 16 bits, so the sign bits are lost and
2162 the interesting bits will need to be MASKed */
2163 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2164 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2165 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2166
2167 por_r2r(mm6, mm1); /* save new blues in dsts */
2168
2169 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2170
2171 srcp += 4;
2172 dstp += 4;
2173 }, width);
2174 srcp += srcskip;
2175 dstp += dstskip;
2176 }
2177 emms();
2178 }
2179}
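/*
 * Unlike the 565 case above, the top bit of a 555 pixel is unused, so
 * red cannot be isolated with a bare right shift and needs a mask of
 * its own.  Rather than spending another MMX register on it, the loop
 * shifts MASKGREEN (0x03e0) left by 5 on entry to get MASKRED (0x7c00)
 * and shifts it back afterwards, which keeps mm1 free to accumulate
 * the blended result.
 */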
2180/* End GCC_ASMBLIT */
2181
2182#elif MSVC_ASMBLIT
2183/* fast RGB565->RGB565 blending with surface alpha */
2184static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2185{
2186 unsigned alpha = info->src->alpha;
2187 if(alpha == 128) {
2188 Blit16to16SurfaceAlpha128(info, 0xf7de);
2189 } else {
2190 int width = info->d_width;
2191 int height = info->d_height;
2192 Uint16 *srcp = (Uint16 *)info->s_pixels;
2193 int srcskip = info->s_skip >> 1;
2194 Uint16 *dstp = (Uint16 *)info->d_pixels;
2195 int dstskip = info->d_skip >> 1;
2196 Uint32 s, d;
2197
2198 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2199
 2200 alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path matches the 5-bit scalar blend exactly */
2201 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2202 alpha >>= 3; /* downscale alpha to 5 bits */
2203
2204 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2205 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
 2206 /* position alpha to allow for mullo and mulhi on different channels
2207 to reduce the number of operations */
2208 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2209
2210 /* Setup the 565 color channel masks */
2211 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2212 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2213
2214 while(height--) {
2215 DUFFS_LOOP_QUATRO2(
2216 {
2217 s = *srcp++;
2218 d = *dstp;
2219 /*
2220 * shift out the middle component (green) to
2221 * the high 16 bits, and process all three RGB
2222 * components at the same time.
2223 */
2224 s = (s | s << 16) & 0x07e0f81f;
2225 d = (d | d << 16) & 0x07e0f81f;
2226 d += (s - d) * alpha >> 5;
2227 d &= 0x07e0f81f;
2228 *dstp++ = (Uint16)(d | d >> 16);
2229 },{
2230 s = *srcp++;
2231 d = *dstp;
2232 /*
2233 * shift out the middle component (green) to
2234 * the high 16 bits, and process all three RGB
2235 * components at the same time.
2236 */
2237 s = (s | s << 16) & 0x07e0f81f;
2238 d = (d | d << 16) & 0x07e0f81f;
2239 d += (s - d) * alpha >> 5;
2240 d &= 0x07e0f81f;
2241 *dstp++ = (Uint16)(d | d >> 16);
2242 s = *srcp++;
2243 d = *dstp;
2244 /*
2245 * shift out the middle component (green) to
2246 * the high 16 bits, and process all three RGB
2247 * components at the same time.
2248 */
2249 s = (s | s << 16) & 0x07e0f81f;
2250 d = (d | d << 16) & 0x07e0f81f;
2251 d += (s - d) * alpha >> 5;
2252 d &= 0x07e0f81f;
2253 *dstp++ = (Uint16)(d | d >> 16);
2254 },{
2255 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2256 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2257
2258 /* red */
2259 src2 = src1;
2260 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2261
2262 dst2 = dst1;
2263 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2264
2265 /* blend */
2266 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2267 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2268 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2269 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2270 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2271
2272 mm_res = dst2; /* RED -> mm_res */
2273
2274 /* green -- process the bits in place */
2275 src2 = src1;
2276 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2277
2278 dst2 = dst1;
2279 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2280
2281 /* blend */
2282 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2283 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2284 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2285 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2286
2287 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2288
2289 /* blue */
2290 src2 = src1;
2291 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2292
2293 dst2 = dst1;
2294 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2295
2296 /* blend */
2297 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2298 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2299 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2300 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2301 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2302
2303 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2304
2305 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2306
2307 srcp += 4;
2308 dstp += 4;
2309 }, width);
2310 srcp += srcskip;
2311 dstp += dstskip;
2312 }
2313 _mm_empty();
2314 }
2315}
2316
2317/* fast RGB555->RGB555 blending with surface alpha */
2318static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2319{
2320 unsigned alpha = info->src->alpha;
2321 if(alpha == 128) {
2322 Blit16to16SurfaceAlpha128(info, 0xfbde);
2323 } else {
2324 int width = info->d_width;
2325 int height = info->d_height;
2326 Uint16 *srcp = (Uint16 *)info->s_pixels;
2327 int srcskip = info->s_skip >> 1;
2328 Uint16 *dstp = (Uint16 *)info->d_pixels;
2329 int dstskip = info->d_skip >> 1;
2330 Uint32 s, d;
2331
2332 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2333
 2334 alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path matches the 5-bit scalar blend exactly */
2335 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2336 alpha >>= 3; /* downscale alpha to 5 bits */
2337
2338 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2339 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
 2340 /* position alpha to allow for mullo and mulhi on different channels
2341 to reduce the number of operations */
2342 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2343
2344 /* Setup the 555 color channel masks */
2345 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2346 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2347 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2348
2349 while(height--) {
2350 DUFFS_LOOP_QUATRO2(
2351 {
2352 s = *srcp++;
2353 d = *dstp;
2354 /*
2355 * shift out the middle component (green) to
2356 * the high 16 bits, and process all three RGB
2357 * components at the same time.
2358 */
2359 s = (s | s << 16) & 0x03e07c1f;
2360 d = (d | d << 16) & 0x03e07c1f;
2361 d += (s - d) * alpha >> 5;
2362 d &= 0x03e07c1f;
2363 *dstp++ = (Uint16)(d | d >> 16);
2364 },{
2365 s = *srcp++;
2366 d = *dstp;
2367 /*
2368 * shift out the middle component (green) to
2369 * the high 16 bits, and process all three RGB
2370 * components at the same time.
2371 */
2372 s = (s | s << 16) & 0x03e07c1f;
2373 d = (d | d << 16) & 0x03e07c1f;
2374 d += (s - d) * alpha >> 5;
2375 d &= 0x03e07c1f;
2376 *dstp++ = (Uint16)(d | d >> 16);
2377 s = *srcp++;
2378 d = *dstp;
2379 /*
2380 * shift out the middle component (green) to
2381 * the high 16 bits, and process all three RGB
2382 * components at the same time.
2383 */
2384 s = (s | s << 16) & 0x03e07c1f;
2385 d = (d | d << 16) & 0x03e07c1f;
2386 d += (s - d) * alpha >> 5;
2387 d &= 0x03e07c1f;
2388 *dstp++ = (Uint16)(d | d >> 16);
2389 },{
2390 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2391 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2392
2393 /* red -- process the bits in place */
2394 src2 = src1;
2395 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2396
2397 dst2 = dst1;
2398 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2399
2400 /* blend */
2401 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2402 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2403 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2404 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2405 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2406
2407 mm_res = dst2; /* RED -> mm_res */
2408
2409 /* green -- process the bits in place */
2410 src2 = src1;
2411 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2412
2413 dst2 = dst1;
2414 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2415
2416 /* blend */
2417 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2418 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2419 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2420 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2421
2422 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2423
2424 /* blue */
2425 src2 = src1; /* src -> src2 */
2426 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2427
2428 dst2 = dst1; /* dst -> dst2 */
2429 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2430
2431 /* blend */
2432 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2433 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2434 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2435 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2436 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2437
2438 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2439
2440 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2441
2442 srcp += 4;
2443 dstp += 4;
2444 }, width);
2445 srcp += srcskip;
2446 dstp += dstskip;
2447 }
2448 _mm_empty();
2449 }
2450}
2451#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2452
2453/* fast RGB565->RGB565 blending with surface alpha */
2454static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2455{
2456 unsigned alpha = info->src->alpha;
2457 if(alpha == 128) {
2458 Blit16to16SurfaceAlpha128(info, 0xf7de);
2459 } else {
2460 int width = info->d_width;
2461 int height = info->d_height;
2462 Uint16 *srcp = (Uint16 *)info->s_pixels;
2463 int srcskip = info->s_skip >> 1;
2464 Uint16 *dstp = (Uint16 *)info->d_pixels;
2465 int dstskip = info->d_skip >> 1;
2466 alpha >>= 3; /* downscale alpha to 5 bits */
2467
2468 while(height--) {
2469 DUFFS_LOOP4({
2470 Uint32 s = *srcp++;
2471 Uint32 d = *dstp;
2472 /*
2473 * shift out the middle component (green) to
2474 * the high 16 bits, and process all three RGB
2475 * components at the same time.
2476 */
2477 s = (s | s << 16) & 0x07e0f81f;
2478 d = (d | d << 16) & 0x07e0f81f;
2479 d += (s - d) * alpha >> 5;
2480 d &= 0x07e0f81f;
2481 *dstp++ = (Uint16)(d | d >> 16);
2482 }, width);
2483 srcp += srcskip;
2484 dstp += dstskip;
2485 }
2486 }
2487}
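/*
 * A worked example of the packed-565 trick used here and in the other
 * 16bpp blitters (illustrative only).  Spreading a pixel with
 * (p | p << 16) & 0x07e0f81f  leaves blue at bits 0-4, red at bits
 * 11-15 and green at bits 21-26, with at least five clear bits above
 * every field:
 *
 *	0xF81F (red + blue)  ->  0x0000F81F
 *	0x07E0 (green)       ->  0x07E00000
 *
 * A single  d += (s - d) * alpha >> 5  then blends all three fields at
 * once with a 5-bit alpha; the cross-field spill from the multiply is
 * thrown away by  d &= 0x07e0f81f,  and  d | d >> 16  folds the result
 * back into an ordinary 16-bit pixel.  The 555 blitters use the same
 * scheme with the mask 0x03e07c1f.
 */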
2488
2489/* fast RGB555->RGB555 blending with surface alpha */
2490static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2491{
 2492 unsigned alpha = info->src->alpha;
2493 if(alpha == 128) {
2494 Blit16to16SurfaceAlpha128(info, 0xfbde);
2495 } else {
2496 int width = info->d_width;
2497 int height = info->d_height;
2498 Uint16 *srcp = (Uint16 *)info->s_pixels;
2499 int srcskip = info->s_skip >> 1;
2500 Uint16 *dstp = (Uint16 *)info->d_pixels;
2501 int dstskip = info->d_skip >> 1;
2502 alpha >>= 3; /* downscale alpha to 5 bits */
2503
2504 while(height--) {
2505 DUFFS_LOOP4({
2506 Uint32 s = *srcp++;
2507 Uint32 d = *dstp;
2508 /*
2509 * shift out the middle component (green) to
2510 * the high 16 bits, and process all three RGB
2511 * components at the same time.
2512 */
2513 s = (s | s << 16) & 0x03e07c1f;
2514 d = (d | d << 16) & 0x03e07c1f;
2515 d += (s - d) * alpha >> 5;
2516 d &= 0x03e07c1f;
2517 *dstp++ = (Uint16)(d | d >> 16);
2518 }, width);
2519 srcp += srcskip;
2520 dstp += dstskip;
2521 }
2522 }
2523}
2524
2525/* fast ARGB8888->RGB565 blending with pixel alpha */
2526static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2527{
2528 int width = info->d_width;
2529 int height = info->d_height;
2530 Uint32 *srcp = (Uint32 *)info->s_pixels;
2531 int srcskip = info->s_skip >> 2;
2532 Uint16 *dstp = (Uint16 *)info->d_pixels;
2533 int dstskip = info->d_skip >> 1;
2534
2535 while(height--) {
2536 DUFFS_LOOP4({
2537 Uint32 s = *srcp;
2538 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2539 /* FIXME: Here we special-case opaque alpha since the
 2540 compositing used (>>8 instead of /255) doesn't handle
2541 it correctly. Also special-case alpha=0 for speed?
2542 Benchmark this! */
2543 if(alpha) {
2544 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2545 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2546 } else {
2547 Uint32 d = *dstp;
2548 /*
2549 * convert source and destination to G0RAB65565
2550 * and blend all components at the same time
2551 */
2552 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2553 + (s >> 3 & 0x1f);
2554 d = (d | d << 16) & 0x07e0f81f;
2555 d += (s - d) * alpha >> 5;
2556 d &= 0x07e0f81f;
2557 *dstp = (Uint16)(d | d >> 16);
2558 }
2559 }
2560 srcp++;
2561 dstp++;
2562 }, width);
2563 srcp += srcskip;
2564 dstp += dstskip;
2565 }
2566}
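/*
 * Illustrative note: the source conversion above builds the packed
 * layout straight from the ARGB8888 pixel instead of reducing it to
 * RGB565 first:  (s & 0xfc00) << 11  moves the top six green bits to
 * bits 21-26,  (s >> 8) & 0xf800  moves the top five red bits to bits
 * 11-15, and  (s >> 3) & 0x1f  moves the top five blue bits to bits
 * 0-4.  That is exactly the 0x07e0f81f arrangement the destination is
 * spread into, so the usual  d += (s - d) * alpha >> 5  blend applies
 * unchanged.  The 555 variant below does the same against the
 * 0x03e07c1f packing.
 */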
2567
2568/* fast ARGB8888->RGB555 blending with pixel alpha */
2569static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2570{
2571 int width = info->d_width;
2572 int height = info->d_height;
2573 Uint32 *srcp = (Uint32 *)info->s_pixels;
2574 int srcskip = info->s_skip >> 2;
2575 Uint16 *dstp = (Uint16 *)info->d_pixels;
2576 int dstskip = info->d_skip >> 1;
2577
2578 while(height--) {
2579 DUFFS_LOOP4({
2580 unsigned alpha;
2581 Uint32 s = *srcp;
2582 alpha = s >> 27; /* downscale alpha to 5 bits */
2583 /* FIXME: Here we special-case opaque alpha since the
 2584 compositing used (>>8 instead of /255) doesn't handle
2585 it correctly. Also special-case alpha=0 for speed?
2586 Benchmark this! */
2587 if(alpha) {
2588 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2589 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2590 } else {
2591 Uint32 d = *dstp;
2592 /*
 2593 * convert source and destination to a G0RAB-style packed 555 layout (0x03e07c1f)
2594 * and blend all components at the same time
2595 */
2596 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2597 + (s >> 3 & 0x1f);
2598 d = (d | d << 16) & 0x03e07c1f;
2599 d += (s - d) * alpha >> 5;
2600 d &= 0x03e07c1f;
2601 *dstp = (Uint16)(d | d >> 16);
2602 }
2603 }
2604 srcp++;
2605 dstp++;
2606 }, width);
2607 srcp += srcskip;
2608 dstp += dstskip;
2609 }
2610}
2611
2612/* General (slow) N->N blending with per-surface alpha */
2613static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2614{
2615 int width = info->d_width;
2616 int height = info->d_height;
2617 Uint8 *src = info->s_pixels;
2618 int srcskip = info->s_skip;
2619 Uint8 *dst = info->d_pixels;
2620 int dstskip = info->d_skip;
2621 SDL_PixelFormat *srcfmt = info->src;
2622 SDL_PixelFormat *dstfmt = info->dst;
2623 int srcbpp = srcfmt->BytesPerPixel;
2624 int dstbpp = dstfmt->BytesPerPixel;
2625 unsigned sA = srcfmt->alpha;
2626 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2627
2628 if(sA) {
2629 while ( height-- ) {
2630 DUFFS_LOOP4(
2631 {
2632 Uint32 Pixel;
2633 unsigned sR;
2634 unsigned sG;
2635 unsigned sB;
2636 unsigned dR;
2637 unsigned dG;
2638 unsigned dB;
2639 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2640 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2641 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2642 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2643 src += srcbpp;
2644 dst += dstbpp;
2645 },
2646 width);
2647 src += srcskip;
2648 dst += dstskip;
2649 }
2650 }
2651}
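/*
 * DISEMBLE_RGB, ALPHA_BLEND and ASSEMBLE_RGBA come from SDL_blit.h;
 * ALPHA_BLEND is essentially the same per-channel
 * dC = dC + ((sC - dC) * A >> 8)  approximation used by the
 * specialised blitters above, so the FIXME about >>8 versus /255 for
 * fully opaque alpha applies to this generic path as well.  The
 * surrounding  if(sA)  simply skips the whole blit when the
 * per-surface alpha is zero.
 */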
2652
2653/* General (slow) colorkeyed N->N blending with per-surface alpha */
2654static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2655{
2656 int width = info->d_width;
2657 int height = info->d_height;
2658 Uint8 *src = info->s_pixels;
2659 int srcskip = info->s_skip;
2660 Uint8 *dst = info->d_pixels;
2661 int dstskip = info->d_skip;
2662 SDL_PixelFormat *srcfmt = info->src;
2663 SDL_PixelFormat *dstfmt = info->dst;
2664 Uint32 ckey = srcfmt->colorkey;
2665 int srcbpp = srcfmt->BytesPerPixel;
2666 int dstbpp = dstfmt->BytesPerPixel;
2667 unsigned sA = srcfmt->alpha;
2668 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2669
211e4bff 2670 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2671 Uint16 *src16 = (Uint16 *)src;
2672 Uint16 *dst16 = (Uint16 *)dst;
2673 sA >>= 3; /* downscale alpha to 5 bits */
2674 while ( height-- ) {
2675 DUFFS_LOOP4(
2676 {
2677 Uint32 s;
2678 Uint32 d;
2679 s = *src16;
2680 if(sA && s != ckey) {
2681 d = *dst16;
2682 s = (s | s << 16) & 0x07e0f81f;
2683 d = (d | d << 16) & 0x07e0f81f;
2684 d += (s - d) * sA >> 5;
2685 d &= 0x07e0f81f;
2686 *dst16 = (Uint16)(d | d >> 16);
2687 }
2688 src16++;
2689 dst16++;
2690 },
2691 width);
2692 src16 += srcskip / 2;
2693 dst16 += dstskip / 2;
2694 }
2695 return;
2696 }
2697
e14743d1 2698 while ( height-- ) {
2699 DUFFS_LOOP4(
2700 {
2701 Uint32 Pixel;
2702 unsigned sR;
2703 unsigned sG;
2704 unsigned sB;
2705 unsigned dR;
2706 unsigned dG;
2707 unsigned dB;
2708 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2709 if(sA && Pixel != ckey) {
2710 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2711 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2712 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2713 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2714 }
2715 src += srcbpp;
2716 dst += dstbpp;
2717 },
2718 width);
2719 src += srcskip;
2720 dst += dstskip;
2721 }
2722}
2723
2724/* General (slow) N->N blending with pixel alpha */
2725static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2726{
2727 int width = info->d_width;
2728 int height = info->d_height;
2729 Uint8 *src = info->s_pixels;
2730 int srcskip = info->s_skip;
2731 Uint8 *dst = info->d_pixels;
2732 int dstskip = info->d_skip;
2733 SDL_PixelFormat *srcfmt = info->src;
2734 SDL_PixelFormat *dstfmt = info->dst;
2735
2736 int srcbpp;
2737 int dstbpp;
2738
2739 /* Set up some basic variables */
2740 srcbpp = srcfmt->BytesPerPixel;
2741 dstbpp = dstfmt->BytesPerPixel;
2742
2743 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2744 quite right. for <8bpp source alpha, it gets them very wrong
2745 (check all macros!)
2746 It is unclear whether there is a good general solution that doesn't
2747 need a branch (or a divide). */
2748 while ( height-- ) {
2749 DUFFS_LOOP4(
2750 {
2751 Uint32 Pixel;
2752 unsigned sR;
2753 unsigned sG;
2754 unsigned sB;
2755 unsigned dR;
2756 unsigned dG;
2757 unsigned dB;
2758 unsigned sA;
2759 unsigned dA;
2760 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2761 if(sA) {
2762 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2763 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2764 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2765 }
2766 src += srcbpp;
2767 dst += dstbpp;
2768 },
2769 width);
2770 src += srcskip;
2771 dst += dstskip;
2772 }
2773}
2774
2775
2776SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2777{
2778 SDL_PixelFormat *sf = surface->format;
2779 SDL_PixelFormat *df = surface->map->dst->format;
2780
2781 if(sf->Amask == 0) {
2782 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2783 if(df->BytesPerPixel == 1)
2784 return BlitNto1SurfaceAlphaKey;
2785 else
2786#if SDL_ALTIVEC_BLITTERS
2787 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2788 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2789 return Blit32to32SurfaceAlphaKeyAltivec;
2790 else
2791#endif
2792 return BlitNtoNSurfaceAlphaKey;
2793 } else {
2794 /* Per-surface alpha blits */
2795 switch(df->BytesPerPixel) {
2796 case 1:
2797 return BlitNto1SurfaceAlpha;
2798
2799 case 2:
2800 if(surface->map->identity) {
2801 if(df->Gmask == 0x7e0)
2802 {
2803#if MMX_ASMBLIT
2804 if(SDL_HasMMX())
2805 return Blit565to565SurfaceAlphaMMX;
2806 else
2807#endif
2808 return Blit565to565SurfaceAlpha;
2809 }
2810 else if(df->Gmask == 0x3e0)
2811 {
2812#if MMX_ASMBLIT
2813 if(SDL_HasMMX())
2814 return Blit555to555SurfaceAlphaMMX;
2815 else
2816#endif
2817 return Blit555to555SurfaceAlpha;
2818 }
2819 }
2820 return BlitNtoNSurfaceAlpha;
2821
2822 case 4:
2823 if(sf->Rmask == df->Rmask
2824 && sf->Gmask == df->Gmask
2825 && sf->Bmask == df->Bmask
2826 && sf->BytesPerPixel == 4)
2827 {
2828#if MMX_ASMBLIT
2829 if(sf->Rshift % 8 == 0
2830 && sf->Gshift % 8 == 0
2831 && sf->Bshift % 8 == 0
2832 && SDL_HasMMX())
2833 return BlitRGBtoRGBSurfaceAlphaMMX;
2834#endif
2835 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2836 {
2837#if SDL_ALTIVEC_BLITTERS
2838 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2839 && SDL_HasAltiVec())
2840 return BlitRGBtoRGBSurfaceAlphaAltivec;
2841#endif
2842 return BlitRGBtoRGBSurfaceAlpha;
2843 }
2844 }
2845#if SDL_ALTIVEC_BLITTERS
2846 if((sf->BytesPerPixel == 4) &&
2847 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2848 return Blit32to32SurfaceAlphaAltivec;
2849 else
2850#endif
2851 return BlitNtoNSurfaceAlpha;
2852
2853 case 3:
2854 default:
2855 return BlitNtoNSurfaceAlpha;
2856 }
2857 }
2858 } else {
2859 /* Per-pixel alpha blits */
2860 switch(df->BytesPerPixel) {
2861 case 1:
2862 return BlitNto1PixelAlpha;
2863
2864 case 2:
2865#if SDL_ALTIVEC_BLITTERS
2866 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2867 df->Gmask == 0x7e0 &&
2868 df->Bmask == 0x1f && SDL_HasAltiVec())
2869 return Blit32to565PixelAlphaAltivec;
2870 else
2871#endif
2872 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2873 && sf->Gmask == 0xff00
2874 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2875 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2876 if(df->Gmask == 0x7e0)
2877 return BlitARGBto565PixelAlpha;
2878 else if(df->Gmask == 0x3e0)
2879 return BlitARGBto555PixelAlpha;
2880 }
2881 return BlitNtoNPixelAlpha;
2882
2883 case 4:
2884 if(sf->Rmask == df->Rmask
2885 && sf->Gmask == df->Gmask
2886 && sf->Bmask == df->Bmask
2887 && sf->BytesPerPixel == 4)
2888 {
2889#if MMX_ASMBLIT
2890 if(sf->Rshift % 8 == 0
2891 && sf->Gshift % 8 == 0
2892 && sf->Bshift % 8 == 0
2893 && sf->Ashift % 8 == 0
2894 && sf->Aloss == 0)
2895 {
2896 if(SDL_Has3DNow())
2897 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2898 if(SDL_HasMMX())
2899 return BlitRGBtoRGBPixelAlphaMMX;
2900 }
2901#endif
2902 if(sf->Amask == 0xff000000)
2903 {
2904#if SDL_ALTIVEC_BLITTERS
2905 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2906 && SDL_HasAltiVec())
2907 return BlitRGBtoRGBPixelAlphaAltivec;
a1f34081 2908#endif
2909#ifdef __ARM_NEON__
2910 return BlitARGBtoXRGBalpha_neon;
e14743d1 2911#endif
2912 return BlitRGBtoRGBPixelAlpha;
2913 }
2914 }
a1f34081 2915#ifdef __ARM_NEON__
2916 if (sf->Gmask == df->Gmask && sf->Amask == 0xff000000 &&
2917 ((sf->Rmask == 0xff && df->Rmask == 0xff0000 && sf->Bmask == 0xff0000 && df->Bmask == 0xff) ||
2918 (sf->Rmask == 0xff0000 && df->Rmask == 0xff && sf->Bmask == 0xff && df->Bmask == 0xff0000)))
2919 {
2920 return BlitABGRtoXRGBalpha_neon;
2921 }
2922#endif
e14743d1 2923#if SDL_ALTIVEC_BLITTERS
2924 if (sf->Amask && sf->BytesPerPixel == 4 &&
2925 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2926 return Blit32to32PixelAlphaAltivec;
2927 else
2928#endif
2929 return BlitNtoNPixelAlpha;
2930
2931 case 3:
2932 default:
2933 return BlitNtoNPixelAlpha;
2934 }
2935 }
2936}
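/*
 * Summary of the selection logic above: per-surface alpha (Amask == 0)
 * and per-pixel alpha (Amask != 0) are dispatched separately, and
 * within each branch the most specific match wins -- the MMX, 3DNow!
 * and AltiVec variants when the CPU supports them and the formats line
 * up, the NEON callers on ARM for 32bpp sources with an 0xff000000
 * alpha mask (BlitARGBtoXRGBalpha_neon when the channel order matches,
 * BlitABGRtoXRGBalpha_neon when red and blue are swapped between the
 * surfaces), and the generic BlitNtoN* routines as the fallback.
 */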
2937