refactor NEON blit checks
[sdl_omap.git] / src / video / SDL_blit_A.c
CommitLineData
e14743d1 1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35# define MMX_ASMBLIT 1
36# define GCC_ASMBLIT 1
37# elif defined(_MSC_VER) && defined(_M_IX86)
38# if (_MSC_VER <= 1200)
39# include <malloc.h>
40# if defined(_mm_free)
41# define HAVE_MMINTRIN_H 1
42# endif
43# else /* Visual Studio > VC6 always has mmintrin.h */
44# define HAVE_MMINTRIN_H 1
45# endif
46# if HAVE_MMINTRIN_H
47# define MMX_ASMBLIT 1
48# define MSVC_ASMBLIT 1
49# endif
50# endif
51#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
#ifdef __ARM_NEON__

/*
 * Generators for the C entry points of the NEON blitters.
 *
 * Every generated function walks the blit rectangle one scanline at a time
 * and hands each row to an externally defined routine (only declared here).
 * After a row, both pointers advance by width * 4 bytes plus the matching
 * skip value from the SDL_BlitInfo, i.e. rows are stepped as 4-byte pixels.
 */

/* Wraps a routine of the form neon_name(dst, src, count). */
#define make_neon_caller(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count); \
static void name(SDL_BlitInfo *info) \
{ \
	const int npixels = info->d_width; \
	Uint8 *srcrow = info->s_pixels; \
	Uint8 *dstrow = info->d_pixels; \
	const int srcstride = npixels * 4 + info->s_skip; \
	const int dststride = npixels * 4 + info->d_skip; \
	int rows; \
\
	for ( rows = info->d_height; rows != 0; --rows ) { \
		neon_name(dstrow, srcrow, npixels); \
		srcrow += srcstride; \
		dstrow += dststride; \
	} \
}

/*
 * Same as make_neon_caller, but the routine takes a fourth argument: the
 * per-surface alpha read from info->src->alpha.
 */
#define make_neon_callerS(name, neon_name) \
extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
static void name(SDL_BlitInfo *info) \
{ \
	const int npixels = info->d_width; \
	Uint8 *srcrow = info->s_pixels; \
	Uint8 *dstrow = info->d_pixels; \
	const int srcstride = npixels * 4 + info->s_skip; \
	const int dststride = npixels * 4 + info->d_skip; \
	const unsigned surface_alpha = info->src->alpha; \
	int rows; \
\
	for ( rows = info->d_height; rows != 0; --rows ) { \
		neon_name(dstrow, srcrow, npixels, surface_alpha); \
		srcrow += srcstride; \
		dstrow += dststride; \
	} \
}

/* Per-pixel-alpha entry points */
make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
/* Per-surface-alpha entry points */
make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)

#endif /* __ARM_NEON__ */
110
e14743d1 111/* N->1 blending with per-surface alpha */
112static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
113{
114 int width = info->d_width;
115 int height = info->d_height;
116 Uint8 *src = info->s_pixels;
117 int srcskip = info->s_skip;
118 Uint8 *dst = info->d_pixels;
119 int dstskip = info->d_skip;
120 Uint8 *palmap = info->table;
121 SDL_PixelFormat *srcfmt = info->src;
122 SDL_PixelFormat *dstfmt = info->dst;
123 int srcbpp = srcfmt->BytesPerPixel;
124
125 const unsigned A = srcfmt->alpha;
126
127 while ( height-- ) {
128 DUFFS_LOOP4(
129 {
130 Uint32 Pixel;
131 unsigned sR;
132 unsigned sG;
133 unsigned sB;
134 unsigned dR;
135 unsigned dG;
136 unsigned dB;
137 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
138 dR = dstfmt->palette->colors[*dst].r;
139 dG = dstfmt->palette->colors[*dst].g;
140 dB = dstfmt->palette->colors[*dst].b;
141 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
142 dR &= 0xff;
143 dG &= 0xff;
144 dB &= 0xff;
145 /* Pack RGB into 8bit pixel */
146 if ( palmap == NULL ) {
147 *dst =((dR>>5)<<(3+2))|
148 ((dG>>5)<<(2))|
149 ((dB>>6)<<(0));
150 } else {
151 *dst = palmap[((dR>>5)<<(3+2))|
152 ((dG>>5)<<(2)) |
153 ((dB>>6)<<(0))];
154 }
155 dst++;
156 src += srcbpp;
157 },
158 width);
159 src += srcskip;
160 dst += dstskip;
161 }
162}
163
164/* N->1 blending with pixel alpha */
165static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
166{
167 int width = info->d_width;
168 int height = info->d_height;
169 Uint8 *src = info->s_pixels;
170 int srcskip = info->s_skip;
171 Uint8 *dst = info->d_pixels;
172 int dstskip = info->d_skip;
173 Uint8 *palmap = info->table;
174 SDL_PixelFormat *srcfmt = info->src;
175 SDL_PixelFormat *dstfmt = info->dst;
176 int srcbpp = srcfmt->BytesPerPixel;
177
178 /* FIXME: fix alpha bit field expansion here too? */
179 while ( height-- ) {
180 DUFFS_LOOP4(
181 {
182 Uint32 Pixel;
183 unsigned sR;
184 unsigned sG;
185 unsigned sB;
186 unsigned sA;
187 unsigned dR;
188 unsigned dG;
189 unsigned dB;
190 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
191 dR = dstfmt->palette->colors[*dst].r;
192 dG = dstfmt->palette->colors[*dst].g;
193 dB = dstfmt->palette->colors[*dst].b;
194 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
195 dR &= 0xff;
196 dG &= 0xff;
197 dB &= 0xff;
198 /* Pack RGB into 8bit pixel */
199 if ( palmap == NULL ) {
200 *dst =((dR>>5)<<(3+2))|
201 ((dG>>5)<<(2))|
202 ((dB>>6)<<(0));
203 } else {
204 *dst = palmap[((dR>>5)<<(3+2))|
205 ((dG>>5)<<(2)) |
206 ((dB>>6)<<(0)) ];
207 }
208 dst++;
209 src += srcbpp;
210 },
211 width);
212 src += srcskip;
213 dst += dstskip;
214 }
215}
216
217/* colorkeyed N->1 blending with per-surface alpha */
218static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
219{
220 int width = info->d_width;
221 int height = info->d_height;
222 Uint8 *src = info->s_pixels;
223 int srcskip = info->s_skip;
224 Uint8 *dst = info->d_pixels;
225 int dstskip = info->d_skip;
226 Uint8 *palmap = info->table;
227 SDL_PixelFormat *srcfmt = info->src;
228 SDL_PixelFormat *dstfmt = info->dst;
229 int srcbpp = srcfmt->BytesPerPixel;
230 Uint32 ckey = srcfmt->colorkey;
231
232 const int A = srcfmt->alpha;
233
234 while ( height-- ) {
235 DUFFS_LOOP(
236 {
237 Uint32 Pixel;
238 unsigned sR;
239 unsigned sG;
240 unsigned sB;
241 unsigned dR;
242 unsigned dG;
243 unsigned dB;
244 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
245 if ( Pixel != ckey ) {
246 dR = dstfmt->palette->colors[*dst].r;
247 dG = dstfmt->palette->colors[*dst].g;
248 dB = dstfmt->palette->colors[*dst].b;
249 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
250 dR &= 0xff;
251 dG &= 0xff;
252 dB &= 0xff;
253 /* Pack RGB into 8bit pixel */
254 if ( palmap == NULL ) {
255 *dst =((dR>>5)<<(3+2))|
256 ((dG>>5)<<(2)) |
257 ((dB>>6)<<(0));
258 } else {
259 *dst = palmap[((dR>>5)<<(3+2))|
260 ((dG>>5)<<(2)) |
261 ((dB>>6)<<(0)) ];
262 }
263 }
264 dst++;
265 src += srcbpp;
266 },
267 width);
268 src += srcskip;
269 dst += dstskip;
270 }
271}
272
273#if GCC_ASMBLIT
274/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
275static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
276{
277 int width = info->d_width;
278 int height = info->d_height;
279 Uint32 *srcp = (Uint32 *)info->s_pixels;
280 int srcskip = info->s_skip >> 2;
281 Uint32 *dstp = (Uint32 *)info->d_pixels;
282 int dstskip = info->d_skip >> 2;
283 Uint32 dalpha = info->dst->Amask;
284 Uint64 load;
285
286 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
287 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
288 load = 0x0001010100010101ULL;/* !alpha128 mask */
289 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
290 movd_m2r(dalpha, mm7); /* dst alpha mask */
291 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
292 while(height--) {
293 DUFFS_LOOP_DOUBLE2(
294 {
295 Uint32 s = *srcp++;
296 Uint32 d = *dstp;
297 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
298 + (s & d & 0x00010101)) | dalpha;
299 },{
300 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
301 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
302
303 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
304 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
305
306 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
307 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
308 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
309 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
310 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
311 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
312 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
313
314 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
315 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
316 dstp += 2;
317 srcp += 2;
318 }, width);
319 srcp += srcskip;
320 dstp += dstskip;
321 }
322 emms();
323}
324
325/* fast RGB888->(A)RGB888 blending with surface alpha */
326static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
327{
328 SDL_PixelFormat* df = info->dst;
329 unsigned alpha = info->src->alpha;
330
331 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
332 /* only call a128 version when R,G,B occupy lower bits */
333 BlitRGBtoRGBSurfaceAlpha128MMX(info);
334 } else {
335 int width = info->d_width;
336 int height = info->d_height;
337 Uint32 *srcp = (Uint32 *)info->s_pixels;
338 int srcskip = info->s_skip >> 2;
339 Uint32 *dstp = (Uint32 *)info->d_pixels;
340 int dstskip = info->d_skip >> 2;
341
342 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
343 /* form the alpha mult */
344 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
345 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
346 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
347 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
348 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
349 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
350 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
351 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
352 movd_m2r(df->Amask, mm7); /* dst alpha mask */
353 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
354
355 while(height--) {
356 DUFFS_LOOP_DOUBLE2({
357 /* One Pixel Blend */
358 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
359 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
360 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
361 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
362
363 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
364 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
365 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
366 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
367
368 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
369 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
370 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
371 ++srcp;
372 ++dstp;
373 },{
374 /* Two Pixels Blend */
375 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
376 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
377 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
378 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
379
380 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
381 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
382 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
383 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
384
385 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
386 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
387 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
388 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
389
390 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
391 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
392 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
393 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
394
395 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
396 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
397
398 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
399
400 srcp += 2;
401 dstp += 2;
402 }, width);
403 srcp += srcskip;
404 dstp += dstskip;
405 }
406 emms();
407 }
408}
409
410/* fast ARGB888->(A)RGB888 blending with pixel alpha */
411static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
412{
413 int width = info->d_width;
414 int height = info->d_height;
415 Uint32 *srcp = (Uint32 *)info->s_pixels;
416 int srcskip = info->s_skip >> 2;
417 Uint32 *dstp = (Uint32 *)info->d_pixels;
418 int dstskip = info->d_skip >> 2;
419 SDL_PixelFormat* sf = info->src;
420 Uint32 amask = sf->Amask;
421
422 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
423 /* form multiplication mask */
424 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
425 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
426 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
427 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
428 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
429 /* form channel masks */
430 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
431 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
432 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
433 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
434 /* get alpha channel shift */
435 __asm__ __volatile__ (
436 "movd %0, %%mm5"
437 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
438
439 while(height--) {
440 DUFFS_LOOP4({
441 Uint32 alpha = *srcp & amask;
442 /* FIXME: Here we special-case opaque alpha since the
443 compositioning used (>>8 instead of /255) doesn't handle
444 it correctly. Also special-case alpha=0 for speed?
445 Benchmark this! */
446 if(alpha == 0) {
447 /* do nothing */
448 } else if(alpha == amask) {
449 /* opaque alpha -- copy RGB, keep dst alpha */
450 /* using MMX here to free up regular registers for other things */
451 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
452 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
453 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
454 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
455 por_r2r(mm1, mm2); /* src | dst -> mm2 */
456 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
457 } else {
458 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
459 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
460
461 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
462 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
463
464 __asm__ __volatile__ (
465 "movd %0, %%mm4"
466 : : "r" (alpha) ); /* 0000A000 -> mm4 */
467 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
468 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
469 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
470 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
471
472 /* blend */
473 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
474 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
475 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
476 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
477
478 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
479 movd_r2m(mm2, *dstp);/* mm2 -> dst */
480 }
481 ++srcp;
482 ++dstp;
483 }, width);
484 srcp += srcskip;
485 dstp += dstskip;
486 }
487 emms();
488}
489/* End GCC_ASMBLIT */
490
491#elif MSVC_ASMBLIT
492/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
493static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
494{
495 int width = info->d_width;
496 int height = info->d_height;
497 Uint32 *srcp = (Uint32 *)info->s_pixels;
498 int srcskip = info->s_skip >> 2;
499 Uint32 *dstp = (Uint32 *)info->d_pixels;
500 int dstskip = info->d_skip >> 2;
501 Uint32 dalpha = info->dst->Amask;
502
503 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
504
505 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
506 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
507 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
508
509 while (height--) {
510 int n = width;
511 if ( n & 1 ) {
512 Uint32 s = *srcp++;
513 Uint32 d = *dstp;
514 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
515 + (s & d & 0x00010101)) | dalpha;
516 n--;
517 }
518
519 for (n >>= 1; n > 0; --n) {
520 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
521 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
522
523 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
524 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
525
526 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
527 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
528 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
529 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
530
531 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
532 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
533 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
534 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
535
536 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
537 dstp += 2;
538 srcp += 2;
539 }
540
541 srcp += srcskip;
542 dstp += dstskip;
543 }
544 _mm_empty();
545}
546
547/* fast RGB888->(A)RGB888 blending with surface alpha */
548static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
549{
550 SDL_PixelFormat* df = info->dst;
551 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
552 unsigned alpha = info->src->alpha;
553
554 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
555 /* only call a128 version when R,G,B occupy lower bits */
556 BlitRGBtoRGBSurfaceAlpha128MMX(info);
557 } else {
558 int width = info->d_width;
559 int height = info->d_height;
560 Uint32 *srcp = (Uint32 *)info->s_pixels;
561 int srcskip = info->s_skip >> 2;
562 Uint32 *dstp = (Uint32 *)info->d_pixels;
563 int dstskip = info->d_skip >> 2;
564 Uint32 dalpha = df->Amask;
565 Uint32 amult;
566
567 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
568
569 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
570 /* form the alpha mult */
571 amult = alpha | (alpha << 8);
572 amult = amult | (amult << 16);
573 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
574 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
575 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
576 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
577 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
578
579 while (height--) {
580 int n = width;
581 if (n & 1) {
582 /* One Pixel Blend */
583 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
584 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
585
586 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
587 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
588
589 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
590 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
591 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
592 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
593
594 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
595 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
596 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
597
598 ++srcp;
599 ++dstp;
600
601 n--;
602 }
603
604 for (n >>= 1; n > 0; --n) {
605 /* Two Pixels Blend */
606 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
607 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
608 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
609 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
610
611 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
612 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
613 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
614 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
615
616 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
617 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
618 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
619 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
620
621 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
622 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
623 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
624 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
625
626 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
627 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
628
629 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
630
631 srcp += 2;
632 dstp += 2;
633 }
634 srcp += srcskip;
635 dstp += dstskip;
636 }
637 _mm_empty();
638 }
639}
640
641/* fast ARGB888->(A)RGB888 blending with pixel alpha */
642static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
643{
644 int width = info->d_width;
645 int height = info->d_height;
646 Uint32 *srcp = (Uint32 *)info->s_pixels;
647 int srcskip = info->s_skip >> 2;
648 Uint32 *dstp = (Uint32 *)info->d_pixels;
649 int dstskip = info->d_skip >> 2;
650 SDL_PixelFormat* sf = info->src;
651 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
652 Uint32 amask = sf->Amask;
653 Uint32 ashift = sf->Ashift;
654 Uint64 multmask;
655
656 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
657
658 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
659 multmask = ~(0xFFFFi64 << (ashift * 2));
660 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
661
662 while(height--) {
663 DUFFS_LOOP4({
664 Uint32 alpha = *srcp & amask;
665 if (alpha == 0) {
666 /* do nothing */
667 } else if (alpha == amask) {
668 /* opaque alpha -- copy RGB, keep dst alpha */
669 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
670 } else {
671 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
672 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
673
674 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
675 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
676
677 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
678 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
679 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
680 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
681 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
682
683 /* blend */
684 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
685 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
686 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
687 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
688 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
689
690 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
691 }
692 ++srcp;
693 ++dstp;
694 }, width);
695 srcp += srcskip;
696 dstp += dstskip;
697 }
698 _mm_empty();
699}
700/* End MSVC_ASMBLIT */
701
702#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
703
704#if SDL_ALTIVEC_BLITTERS
705#if __MWERKS__
706#pragma altivec_model on
707#endif
708#if HAVE_ALTIVEC_H
709#include <altivec.h>
710#endif
711#include <assert.h>
712
/*
 * Vector literal helpers.  Mac OS X builds with GCC older than 4 get the
 * parenthesised literal form; every other configuration gets the
 * brace-initialised form.
 */
#if !(defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#endif
724
/*
 * Low four bits of a pointer value: non-zero when the pointer is not on a
 * 16-byte boundary.  The argument is parenthesised so that an expression
 * such as UNALIGNED_PTR(p + 1) does the pointer arithmetic before the cast
 * to size_t rather than after it.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)

/* Debug helper: prints msg followed by the four 32-bit words of vector v. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
731
/*
 * VEC_MERGE_PERMUTE: the permutation vector that takes the high bytes out
 * of all the appropriate shorts, i.e.
 *     (vector unsigned char)(
 *         0x00, 0x10, 0x02, 0x12,
 *         0x04, 0x14, 0x06, 0x16,
 *         0x08, 0x18, 0x0A, 0x1A,
 *         0x0C, 0x1C, 0x0E, 0x1E );
 * It is built at run time by adding a 16-bit splat of 0x0F to the result of
 * vec_lvsl(0, NULL).
 */
#define VEC_MERGE_PERMUTE() \
    (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))

/* Sum of two 32-bit splats of 12 (24 in each 32-bit lane). */
#define VEC_U32_24() \
    (vec_add(vec_splat_u32(12), vec_splat_u32(12)))

/* A splat of -1 bytes, shifted left by VEC_U32_24() within each 32-bit lane. */
#define VEC_ALPHA_MASK() \
    ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))

/*
 * Permute vector used to realign data loaded from src: when the low four
 * address bits are zero it is vec_lvsl(8, src) with 8 added to every byte,
 * otherwise it is vec_lvsl(0, src).
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src) == 0) \
    ? vec_add(vec_lvsl(8, src), vec_splat_u8(8)) \
    : vec_lvsl(0, src))
745
746
/*
 * Blend vs over vd with per-byte weights valpha; the result replaces vd.
 *
 * Even and odd bytes are processed separately as 16-bit products:
 *     t = vs * valpha + vd * ~valpha      (~valpha is 255 - alpha)
 *     t = (t + 1) + ((t + 1) >> 8)
 * mergePermute then picks the result bytes out of the even and odd vectors.
 * v1_16 and v8_16 are the 16-bit splats of 1 and 8 used by the formula.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* source * alpha: even bytes (AAGGAAGGAAGGAAGG), odd bytes (RRBBRRBBRRBBRRBB) */ \
    vector unsigned short vma_even = vec_mule(vs, valpha); \
    vector unsigned short vma_odd = vec_mulo(vs, valpha); \
    /* inverted alpha, i.e. 255 - alpha */ \
    vector unsigned char vma_inv = vec_nor(valpha, valpha); \
    /* dest * (255 - alpha): even bytes, odd bytes */ \
    vector unsigned short vma_tmpe = vec_mule(vd, vma_inv); \
    vector unsigned short vma_tmpo = vec_mulo(vd, vma_inv); \
    /* sum the source and dest products */ \
    vma_even = vec_add(vma_even, vma_tmpe); \
    vma_odd = vec_add(vma_odd, vma_tmpo); \
    /* even = (even + 1) + ((even + 1) >> 8) */ \
    vma_even = vec_add(vma_even, v1_16); \
    vma_tmpe = vec_sr(vma_even, v8_16); \
    vma_even = vec_add(vma_even, vma_tmpe); \
    /* odd = (odd + 1) + ((odd + 1) >> 8) */ \
    vma_odd = vec_add(vma_odd, v1_16); \
    vma_tmpo = vec_sr(vma_odd, v8_16); \
    vma_odd = vec_add(vma_odd, vma_tmpo); \
    /* (>>8) and interleave back into ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vma_even, vma_odd, mergePermute); \
} while (0)
772
773/* Calculate the permute vector used for 32->32 swizzling */
774static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
775 const SDL_PixelFormat *dstfmt)
776{
777 /*
778 * We have to assume that the bits that aren't used by other
779 * colors is alpha, and it's one complete byte, since some formats
780 * leave alpha with a zero mask, but we should still swizzle the bits.
781 */
782 /* ARGB */
783 const static struct SDL_PixelFormat default_pixel_format = {
784 NULL, 0, 0,
785 0, 0, 0, 0,
786 16, 8, 0, 24,
787 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
788 0, 0};
789 if (!srcfmt) {
790 srcfmt = &default_pixel_format;
791 }
792 if (!dstfmt) {
793 dstfmt = &default_pixel_format;
794 }
795 const vector unsigned char plus = VECUINT8_LITERAL
796 ( 0x00, 0x00, 0x00, 0x00,
797 0x04, 0x04, 0x04, 0x04,
798 0x08, 0x08, 0x08, 0x08,
799 0x0C, 0x0C, 0x0C, 0x0C );
800 vector unsigned char vswiz;
801 vector unsigned int srcvec;
802#define RESHIFT(X) (3 - ((X) >> 3))
803 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
804 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
805 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
806 Uint32 amask;
807 /* Use zero for alpha if either surface doesn't have alpha */
808 if (dstfmt->Amask) {
809 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
810 } else {
811 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
812 }
813#undef RESHIFT
814 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
815 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
816 return(vswiz);
817}
818
819static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
820{
821 int height = info->d_height;
822 Uint8 *src = (Uint8 *)info->s_pixels;
823 int srcskip = info->s_skip;
824 Uint8 *dst = (Uint8 *)info->d_pixels;
825 int dstskip = info->d_skip;
826 SDL_PixelFormat *srcfmt = info->src;
827
828 vector unsigned char v0 = vec_splat_u8(0);
829 vector unsigned short v8_16 = vec_splat_u16(8);
830 vector unsigned short v1_16 = vec_splat_u16(1);
831 vector unsigned short v2_16 = vec_splat_u16(2);
832 vector unsigned short v3_16 = vec_splat_u16(3);
833 vector unsigned int v8_32 = vec_splat_u32(8);
834 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
835 vector unsigned short v3f = VECUINT16_LITERAL(
836 0x003f, 0x003f, 0x003f, 0x003f,
837 0x003f, 0x003f, 0x003f, 0x003f);
838 vector unsigned short vfc = VECUINT16_LITERAL(
839 0x00fc, 0x00fc, 0x00fc, 0x00fc,
840 0x00fc, 0x00fc, 0x00fc, 0x00fc);
841
842 /*
843 0x10 - 0x1f is the alpha
844 0x00 - 0x0e evens are the red
845 0x01 - 0x0f odds are zero
846 */
847 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
848 0x10, 0x00, 0x01, 0x01,
849 0x10, 0x02, 0x01, 0x01,
850 0x10, 0x04, 0x01, 0x01,
851 0x10, 0x06, 0x01, 0x01
852 );
853 vector unsigned char vredalpha2 = (vector unsigned char)(
854 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
855 );
856 /*
857 0x00 - 0x0f is ARxx ARxx ARxx ARxx
858 0x11 - 0x0f odds are blue
859 */
860 vector unsigned char vblue1 = VECUINT8_LITERAL(
861 0x00, 0x01, 0x02, 0x11,
862 0x04, 0x05, 0x06, 0x13,
863 0x08, 0x09, 0x0a, 0x15,
864 0x0c, 0x0d, 0x0e, 0x17
865 );
866 vector unsigned char vblue2 = (vector unsigned char)(
867 vec_add((vector unsigned int)vblue1, v8_32)
868 );
869 /*
870 0x00 - 0x0f is ARxB ARxB ARxB ARxB
871 0x10 - 0x0e evens are green
872 */
873 vector unsigned char vgreen1 = VECUINT8_LITERAL(
874 0x00, 0x01, 0x10, 0x03,
875 0x04, 0x05, 0x12, 0x07,
876 0x08, 0x09, 0x14, 0x0b,
877 0x0c, 0x0d, 0x16, 0x0f
878 );
879 vector unsigned char vgreen2 = (vector unsigned char)(
880 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
881 );
882 vector unsigned char vgmerge = VECUINT8_LITERAL(
883 0x00, 0x02, 0x00, 0x06,
884 0x00, 0x0a, 0x00, 0x0e,
885 0x00, 0x12, 0x00, 0x16,
886 0x00, 0x1a, 0x00, 0x1e);
887 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
888 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
889 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
890
891 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
892 vf800 = vec_sl(vf800, vec_splat_u16(8));
893
894 while(height--) {
895 int extrawidth;
896 vector unsigned char valigner;
897 vector unsigned char vsrc;
898 vector unsigned char voverflow;
899 int width = info->d_width;
900
901#define ONE_PIXEL_BLEND(condition, widthvar) \
902 while (condition) { \
903 Uint32 Pixel; \
904 unsigned sR, sG, sB, dR, dG, dB, sA; \
905 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
906 if(sA) { \
907 unsigned short dstpixel = *((unsigned short *)dst); \
908 dR = (dstpixel >> 8) & 0xf8; \
909 dG = (dstpixel >> 3) & 0xfc; \
910 dB = (dstpixel << 3) & 0xf8; \
911 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
912 *((unsigned short *)dst) = ( \
913 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
914 ); \
915 } \
916 src += 4; \
917 dst += 2; \
918 widthvar--; \
919 }
920 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
921 extrawidth = (width % 8);
922 valigner = VEC_ALIGNER(src);
923 vsrc = (vector unsigned char)vec_ld(0, src);
924 width -= extrawidth;
925 while (width) {
926 vector unsigned char valpha;
927 vector unsigned char vsrc1, vsrc2;
928 vector unsigned char vdst1, vdst2;
929 vector unsigned short vR, vG, vB;
930 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
931
932 /* Load 8 pixels from src as ARGB */
933 voverflow = (vector unsigned char)vec_ld(15, src);
934 vsrc = vec_perm(vsrc, voverflow, valigner);
935 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
936 src += 16;
937 vsrc = (vector unsigned char)vec_ld(15, src);
938 voverflow = vec_perm(voverflow, vsrc, valigner);
939 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
940 src += 16;
941
942 /* Load 8 pixels from dst as XRGB */
943 voverflow = vec_ld(0, dst);
944 vR = vec_and((vector unsigned short)voverflow, vf800);
945 vB = vec_sl((vector unsigned short)voverflow, v3_16);
946 vG = vec_sl(vB, v2_16);
947 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
948 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
949 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
950 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
951 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
952 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
953
954 /* Alpha blend 8 pixels as ARGB */
955 valpha = vec_perm(vsrc1, v0, valphaPermute);
956 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
957 valpha = vec_perm(vsrc2, v0, valphaPermute);
958 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
959
960 /* Convert 8 pixels to 565 */
961 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
962 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
963 vgpixel = vec_and(vgpixel, vfc);
964 vgpixel = vec_sl(vgpixel, v3_16);
965 vrpixel = vec_sl(vpixel, v1_16);
966 vrpixel = vec_and(vrpixel, vf800);
967 vbpixel = vec_and(vpixel, v3f);
968 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
969 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
970
971 /* Store 8 pixels */
972 vec_st(vdst1, 0, dst);
973
974 width -= 8;
975 dst += 16;
976 }
977 ONE_PIXEL_BLEND((extrawidth), extrawidth);
978#undef ONE_PIXEL_BLEND
979 src += srcskip;
980 dst += dstskip;
981 }
982}
983
984static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
985{
986 unsigned alpha = info->src->alpha;
987 int height = info->d_height;
988 Uint32 *srcp = (Uint32 *)info->s_pixels;
989 int srcskip = info->s_skip >> 2;
990 Uint32 *dstp = (Uint32 *)info->d_pixels;
991 int dstskip = info->d_skip >> 2;
992 SDL_PixelFormat *srcfmt = info->src;
993 SDL_PixelFormat *dstfmt = info->dst;
994 unsigned sA = srcfmt->alpha;
995 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
996 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
997 Uint32 ckey = info->src->colorkey;
998 vector unsigned char mergePermute;
999 vector unsigned char vsrcPermute;
1000 vector unsigned char vdstPermute;
1001 vector unsigned char vsdstPermute;
1002 vector unsigned char valpha;
1003 vector unsigned char valphamask;
1004 vector unsigned char vbits;
1005 vector unsigned char v0;
1006 vector unsigned short v1;
1007 vector unsigned short v8;
1008 vector unsigned int vckey;
1009 vector unsigned int vrgbmask;
1010
1011 mergePermute = VEC_MERGE_PERMUTE();
1012 v0 = vec_splat_u8(0);
1013 v1 = vec_splat_u16(1);
1014 v8 = vec_splat_u16(8);
1015
1016 /* set the alpha to 255 on the destination surf */
1017 valphamask = VEC_ALPHA_MASK();
1018
1019 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1020 vdstPermute = calc_swizzle32(NULL, dstfmt);
1021 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1022
1023 /* set a vector full of alpha and 255-alpha */
1024 ((unsigned char *)&valpha)[0] = alpha;
1025 valpha = vec_splat(valpha, 0);
1026 vbits = (vector unsigned char)vec_splat_s8(-1);
1027
1028 ckey &= rgbmask;
1029 ((unsigned int *)(char*)&vckey)[0] = ckey;
1030 vckey = vec_splat(vckey, 0);
1031 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1032 vrgbmask = vec_splat(vrgbmask, 0);
1033
1034 while(height--) {
1035 int width = info->d_width;
1036#define ONE_PIXEL_BLEND(condition, widthvar) \
1037 while (condition) { \
1038 Uint32 Pixel; \
1039 unsigned sR, sG, sB, dR, dG, dB; \
1040 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1041 if(sA && Pixel != ckey) { \
1042 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1043 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1044 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1045 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1046 } \
1047 dstp++; \
1048 srcp++; \
1049 widthvar--; \
1050 }
1051 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1052 if (width > 0) {
1053 int extrawidth = (width % 4);
1054 vector unsigned char valigner = VEC_ALIGNER(srcp);
1055 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1056 width -= extrawidth;
1057 while (width) {
1058 vector unsigned char vsel;
1059 vector unsigned char voverflow;
1060 vector unsigned char vd;
1061 vector unsigned char vd_orig;
1062
1063 /* s = *srcp */
1064 voverflow = (vector unsigned char)vec_ld(15, srcp);
1065 vs = vec_perm(vs, voverflow, valigner);
1066
1067 /* vsel is set for items that match the key */
1068 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1069 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1070
1071 /* permute to source format */
1072 vs = vec_perm(vs, valpha, vsrcPermute);
1073
1074 /* d = *dstp */
1075 vd = (vector unsigned char)vec_ld(0, dstp);
1076 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1077
1078 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1079
1080 /* set the alpha channel to full on */
1081 vd = vec_or(vd, valphamask);
1082
1083 /* mask out color key */
1084 vd = vec_sel(vd, vd_orig, vsel);
1085
1086 /* permute to dest format */
1087 vd = vec_perm(vd, vbits, vdstPermute);
1088
1089 /* *dstp = res */
1090 vec_st((vector unsigned int)vd, 0, dstp);
1091
1092 srcp += 4;
1093 dstp += 4;
1094 width -= 4;
1095 vs = voverflow;
1096 }
1097 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1098 }
1099#undef ONE_PIXEL_BLEND
1100
1101 srcp += srcskip;
1102 dstp += dstskip;
1103 }
1104}
1105
1106
/*
 * Per-pixel alpha blend of a 32-bit source onto a 32-bit destination
 * (AltiVec version, arbitrary channel layouts on both sides).
 *
 * Each row is processed in three phases: a scalar loop (ONE_PIXEL_BLEND)
 * runs while UNALIGNED_PTR(dstp) is true, a vector loop then blends four
 * pixels per iteration, and a second scalar loop finishes the remaining
 * width % 4 pixels.  In both paths the alpha value read from the
 * destination is written back with the blended colour.
 */
1107static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1108{
1109 int width = info->d_width;
1110 int height = info->d_height;
1111 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1112 int srcskip = info->s_skip >> 2;
1113 Uint32 *dstp = (Uint32 *)info->d_pixels;
1114 int dstskip = info->d_skip >> 2;
1115 SDL_PixelFormat *srcfmt = info->src;
1116 SDL_PixelFormat *dstfmt = info->dst;
1117 vector unsigned char mergePermute;
1118 vector unsigned char valphaPermute;
1119 vector unsigned char vsrcPermute;
1120 vector unsigned char vdstPermute;
1121 vector unsigned char vsdstPermute;
1122 vector unsigned char valphamask;
1123 vector unsigned char vpixelmask;
1124 vector unsigned char v0;
1125 vector unsigned short v1;
1126 vector unsigned short v8;
1127
1128 v0 = vec_splat_u8(0);
1129 v1 = vec_splat_u16(1);
1130 v8 = vec_splat_u16(8);
1131 mergePermute = VEC_MERGE_PERMUTE();
1132 valphamask = VEC_ALPHA_MASK();
     /* byte indices 0..15 AND 0x0C -> 0,0,0,0,4,4,4,4,...: a permute that
        copies byte 0 of every 32-bit pixel over all four of its bytes */
1133 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
     /* complement of valphamask (nor with zero) */
1134 vpixelmask = vec_nor(valphamask, v0);
     /* NOTE(review): presumably source->canonical, canonical->dest and
        dest->canonical byte shuffles; confirm against calc_swizzle32 */
1135 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1136 vdstPermute = calc_swizzle32(NULL, dstfmt);
1137 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1138
1139 while ( height-- ) {
1140 width = info->d_width;
     /* scalar blend of one pixel per iteration; source pixels whose alpha
        is zero leave the destination untouched */
1141#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1142 Uint32 Pixel; \
1143 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1144 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1145 if(sA) { \
1146 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1147 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1148 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1149 } \
1150 ++srcp; \
1151 ++dstp; \
1152 widthvar--; \
1153 }
     /* phase 1: scalar pixels until UNALIGNED_PTR(dstp) becomes false */
1154 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1155 if (width > 0) {
1156 /* vsrcPermute */
1157 /* vdstPermute */
1158 int extrawidth = (width % 4);
1159 vector unsigned char valigner = VEC_ALIGNER(srcp);
1160 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1161 width -= extrawidth;
     /* phase 2: four pixels (16 bytes) per iteration */
1162 while (width) {
1163 vector unsigned char voverflow;
1164 vector unsigned char vd;
1165 vector unsigned char valpha;
1166 vector unsigned char vdstalpha;
1167 /* s = *srcp (two aligned loads merged through valigner) */
1168 voverflow = (vector unsigned char)vec_ld(15, srcp);
1169 vs = vec_perm(vs, voverflow, valigner);
1170 vs = vec_perm(vs, v0, vsrcPermute);
1171
     /* broadcast byte 0 of each swizzled source pixel as its alpha */
1172 valpha = vec_perm(vs, v0, valphaPermute);
1173
1174 /* d = *dstp */
1175 vd = (vector unsigned char)vec_ld(0, dstp);
1176 vd = vec_perm(vd, v0, vsdstPermute);
     /* remember the destination alpha so it can be restored below */
1177 vdstalpha = vec_and(vd, valphamask);
1178
1179 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1180
1181 /* set the alpha to the dest alpha */
1182 vd = vec_and(vd, vpixelmask);
1183 vd = vec_or(vd, vdstalpha);
1184 vd = vec_perm(vd, v0, vdstPermute);
1185
1186 /* *dstp = res */
1187 vec_st((vector unsigned int)vd, 0, dstp);
1188
1189 srcp += 4;
1190 dstp += 4;
1191 width -= 4;
     /* the second load becomes the first half of the next iteration */
1192 vs = voverflow;
1193
1194 }
     /* phase 3: leftover width % 4 pixels */
1195 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1196 }
1197 srcp += srcskip;
1198 dstp += dstskip;
1199#undef ONE_PIXEL_BLEND
1200 }
1201}
1202
1203/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * AltiVec version with no channel swizzling: the scalar path takes the
 * source alpha from bits 24..31 and blends red/blue (0xff00ff) and green
 * (0xff00) in place.  The destination's top byte is preserved in both the
 * scalar and the vector path.
 *
 * Each row runs a scalar loop while UNALIGNED_PTR(dstp) is true, then a
 * vector loop of four pixels per iteration, then a scalar loop for the
 * remaining width % 4 pixels.
 */
1204static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1205{
1206 int width = info->d_width;
1207 int height = info->d_height;
1208 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1209 int srcskip = info->s_skip >> 2;
1210 Uint32 *dstp = (Uint32 *)info->d_pixels;
1211 int dstskip = info->d_skip >> 2;
1212 vector unsigned char mergePermute;
1213 vector unsigned char valphaPermute;
1214 vector unsigned char valphamask;
1215 vector unsigned char vpixelmask;
1216 vector unsigned char v0;
1217 vector unsigned short v1;
1218 vector unsigned short v8;
1219 v0 = vec_splat_u8(0);
1220 v1 = vec_splat_u16(1);
1221 v8 = vec_splat_u16(8);
1222 mergePermute = VEC_MERGE_PERMUTE();
1223 valphamask = VEC_ALPHA_MASK();
     /* byte indices 0..15 AND 0x0C -> 0,0,0,0,4,4,4,4,...: a permute that
        copies byte 0 of every 32-bit pixel over all four of its bytes */
1224 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1225
1226
     /* complement of valphamask (nor with zero) */
1227 vpixelmask = vec_nor(valphamask, v0);
1228 while(height--) {
1229 width = info->d_width;
     /* Scalar blend of one pixel.  alpha == 0 leaves the destination alone,
        alpha == SDL_ALPHA_OPAQUE copies the source colour, anything else
        blends with (s - d) * alpha >> 8 on the 0xff00ff and 0xff00 parts. */
1230#define ONE_PIXEL_BLEND(condition, widthvar) \
1231 while ((condition)) { \
1232 Uint32 dalpha; \
1233 Uint32 d; \
1234 Uint32 s1; \
1235 Uint32 d1; \
1236 Uint32 s = *srcp; \
1237 Uint32 alpha = s >> 24; \
1238 if(alpha) { \
1239 if(alpha == SDL_ALPHA_OPAQUE) { \
1240 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1241 } else { \
1242 d = *dstp; \
1243 dalpha = d & 0xff000000; \
1244 s1 = s & 0xff00ff; \
1245 d1 = d & 0xff00ff; \
1246 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1247 s &= 0xff00; \
1248 d &= 0xff00; \
1249 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1250 *dstp = d1 | d | dalpha; \
1251 } \
1252 } \
1253 ++srcp; \
1254 ++dstp; \
1255 widthvar--; \
1256 }
     /* phase 1: scalar pixels until UNALIGNED_PTR(dstp) becomes false */
1257 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1258 if (width > 0) {
1259 int extrawidth = (width % 4);
1260 vector unsigned char valigner = VEC_ALIGNER(srcp);
1261 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1262 width -= extrawidth;
     /* phase 2: four pixels (16 bytes) per iteration */
1263 while (width) {
1264 vector unsigned char voverflow;
1265 vector unsigned char vd;
1266 vector unsigned char valpha;
1267 vector unsigned char vdstalpha;
1268 /* s = *srcp (two aligned loads merged through valigner) */
1269 voverflow = (vector unsigned char)vec_ld(15, srcp);
1270 vs = vec_perm(vs, voverflow, valigner);
1271
     /* broadcast byte 0 of each source pixel as its alpha */
1272 valpha = vec_perm(vs, v0, valphaPermute);
1273
1274 /* d = *dstp */
1275 vd = (vector unsigned char)vec_ld(0, dstp);
     /* remember the destination alpha so it can be restored below */
1276 vdstalpha = vec_and(vd, valphamask);
1277
1278 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1279
1280 /* set the alpha to the dest alpha */
1281 vd = vec_and(vd, vpixelmask);
1282 vd = vec_or(vd, vdstalpha);
1283
1284 /* *dstp = res */
1285 vec_st((vector unsigned int)vd, 0, dstp);
1286
1287 srcp += 4;
1288 dstp += 4;
1289 width -= 4;
     /* the second load becomes the first half of the next iteration */
1290 vs = voverflow;
1291 }
     /* phase 3: leftover width % 4 pixels */
1292 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1293 }
1294 srcp += srcskip;
1295 dstp += dstskip;
1296 }
1297#undef ONE_PIXEL_BLEND
1298}
1299
/*
 * Blend a 32-bit source onto a 32-bit destination with the per-surface
 * alpha (info->src->alpha), AltiVec version for arbitrary channel layouts.
 *
 * Each row runs a scalar loop while UNALIGNED_PTR(dstp) is true, then a
 * vector loop of four pixels per iteration, then a scalar loop for the
 * remaining width % 4 pixels.  The scalar path writes dA (SDL_ALPHA_OPAQUE
 * when the destination has an alpha mask, otherwise 0) as the result alpha;
 * the vector path ORs valphamask into the result.
 */
1300static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1301{
1302 /* XXX : 6 */
1303 unsigned alpha = info->src->alpha;
1304 int height = info->d_height;
1305 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1306 int srcskip = info->s_skip >> 2;
1307 Uint32 *dstp = (Uint32 *)info->d_pixels;
1308 int dstskip = info->d_skip >> 2;
1309 SDL_PixelFormat *srcfmt = info->src;
1310 SDL_PixelFormat *dstfmt = info->dst;
     /* same value as alpha above; used by the scalar ONE_PIXEL_BLEND path */
1311 unsigned sA = srcfmt->alpha;
1312 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1313 vector unsigned char mergePermute;
1314 vector unsigned char vsrcPermute;
1315 vector unsigned char vdstPermute;
1316 vector unsigned char vsdstPermute;
1317 vector unsigned char valpha;
1318 vector unsigned char valphamask;
1319 vector unsigned char vbits;
1320 vector unsigned short v1;
1321 vector unsigned short v8;
1322
1323 mergePermute = VEC_MERGE_PERMUTE();
1324 v1 = vec_splat_u16(1);
1325 v8 = vec_splat_u16(8);
1326
1327 /* mask OR-ed into the blended result to force its alpha to 255 */
1328 valphamask = VEC_ALPHA_MASK();
1329
     /* NOTE(review): presumably source->canonical, canonical->dest and
        dest->canonical byte shuffles; confirm against calc_swizzle32 */
1330 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1331 vdstPermute = calc_swizzle32(NULL, dstfmt);
1332 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1333
1334 /* replicate the per-surface alpha into every byte of valpha */
1335 ((unsigned char *)&valpha)[0] = alpha;
1336 valpha = vec_splat(valpha, 0);
1337 vbits = (vector unsigned char)vec_splat_s8(-1);
1338
1339 while(height--) {
1340 int width = info->d_width;
     /* scalar blend of one pixel per iteration using the surface alpha sA */
1341#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1342 Uint32 Pixel; \
1343 unsigned sR, sG, sB, dR, dG, dB; \
1344 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1345 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1346 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1347 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1348 ++srcp; \
1349 ++dstp; \
1350 widthvar--; \
1351 }
     /* phase 1: scalar pixels until UNALIGNED_PTR(dstp) becomes false */
1352 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1353 if (width > 0) {
1354 int extrawidth = (width % 4);
1355 vector unsigned char valigner = VEC_ALIGNER(srcp);
1356 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1357 width -= extrawidth;
     /* phase 2: four pixels (16 bytes) per iteration */
1358 while (width) {
1359 vector unsigned char voverflow;
1360 vector unsigned char vd;
1361
1362 /* s = *srcp (two aligned loads merged through valigner) */
1363 voverflow = (vector unsigned char)vec_ld(15, srcp);
1364 vs = vec_perm(vs, voverflow, valigner);
1365 vs = vec_perm(vs, valpha, vsrcPermute);
1366
1367 /* d = *dstp */
1368 vd = (vector unsigned char)vec_ld(0, dstp);
1369 vd = vec_perm(vd, vd, vsdstPermute);
1370
1371 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1372
1373 /* set the alpha channel to full on */
1374 vd = vec_or(vd, valphamask);
1375 vd = vec_perm(vd, vbits, vdstPermute);
1376
1377 /* *dstp = res */
1378 vec_st((vector unsigned int)vd, 0, dstp);
1379
1380 srcp += 4;
1381 dstp += 4;
1382 width -= 4;
     /* the second load becomes the first half of the next iteration */
1383 vs = voverflow;
1384 }
     /* phase 3: leftover width % 4 pixels */
1385 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1386 }
1387#undef ONE_PIXEL_BLEND
1388
1389 srcp += srcskip;
1390 dstp += dstskip;
1391 }
1392
1393}
1394
1395
1396/* fast RGB888->(A)RGB888 blending */
/*
 * AltiVec version with no channel swizzling.  Every pixel is blended with
 * the per-surface alpha (info->src->alpha); the scalar path forces the top
 * byte of the result to 0xff and the vector path ORs in valphamask.
 *
 * Each row runs a scalar loop while UNALIGNED_PTR(dstp) is true, then a
 * vector loop of four pixels per iteration, then a scalar loop for the
 * remaining width % 4 pixels.
 */
1397static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1398{
1399 unsigned alpha = info->src->alpha;
1400 int height = info->d_height;
1401 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1402 int srcskip = info->s_skip >> 2;
1403 Uint32 *dstp = (Uint32 *)info->d_pixels;
1404 int dstskip = info->d_skip >> 2;
1405 vector unsigned char mergePermute;
1406 vector unsigned char valpha;
1407 vector unsigned char valphamask;
1408 vector unsigned short v1;
1409 vector unsigned short v8;
1410
1411 mergePermute = VEC_MERGE_PERMUTE();
1412 v1 = vec_splat_u16(1);
1413 v8 = vec_splat_u16(8);
1414
1415 /* mask OR-ed into the blended result to force its alpha to 255 */
1416 valphamask = VEC_ALPHA_MASK();
1417
1418 /* replicate the per-surface alpha into every byte of valpha */
1419 ((unsigned char *)&valpha)[0] = alpha;
1420 valpha = vec_splat(valpha, 0);
1421
1422 while(height--) {
1423 int width = info->d_width;
     /* Scalar blend of one pixel: red/blue (0xff00ff) are blended together
        in one multiply, green (0xff00) in a second one. */
1424#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1425 Uint32 s = *srcp; \
1426 Uint32 d = *dstp; \
1427 Uint32 s1 = s & 0xff00ff; \
1428 Uint32 d1 = d & 0xff00ff; \
1429 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1430 & 0xff00ff; \
1431 s &= 0xff00; \
1432 d &= 0xff00; \
1433 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1434 *dstp = d1 | d | 0xff000000; \
1435 ++srcp; \
1436 ++dstp; \
1437 widthvar--; \
1438 }
     /* phase 1: scalar pixels until UNALIGNED_PTR(dstp) becomes false */
1439 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1440 if (width > 0) {
1441 int extrawidth = (width % 4);
1442 vector unsigned char valigner = VEC_ALIGNER(srcp);
1443 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1444 width -= extrawidth;
     /* phase 2: four pixels (16 bytes) per iteration */
1445 while (width) {
1446 vector unsigned char voverflow;
1447 vector unsigned char vd;
1448
1449 /* s = *srcp (two aligned loads merged through valigner) */
1450 voverflow = (vector unsigned char)vec_ld(15, srcp);
1451 vs = vec_perm(vs, voverflow, valigner);
1452
1453 /* d = *dstp */
1454 vd = (vector unsigned char)vec_ld(0, dstp);
1455
1456 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1457
1458 /* set the alpha channel to full on */
1459 vd = vec_or(vd, valphamask);
1460
1461 /* *dstp = res */
1462 vec_st((vector unsigned int)vd, 0, dstp);
1463
1464 srcp += 4;
1465 dstp += 4;
1466 width -= 4;
     /* the second load becomes the first half of the next iteration */
1467 vs = voverflow;
1468 }
     /* phase 3: leftover width % 4 pixels */
1469 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1470 }
1471#undef ONE_PIXEL_BLEND
1472
1473 srcp += srcskip;
1474 dstp += dstskip;
1475 }
1476}
1477#if __MWERKS__
1478#pragma altivec_model off
1479#endif
1480#endif /* SDL_ALTIVEC_BLITTERS */
1481
1482/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1483static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1484{
1485 int width = info->d_width;
1486 int height = info->d_height;
1487 Uint32 *srcp = (Uint32 *)info->s_pixels;
1488 int srcskip = info->s_skip >> 2;
1489 Uint32 *dstp = (Uint32 *)info->d_pixels;
1490 int dstskip = info->d_skip >> 2;
1491
1492 while(height--) {
1493 DUFFS_LOOP4({
1494 Uint32 s = *srcp++;
1495 Uint32 d = *dstp;
1496 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1497 + (s & d & 0x00010101)) | 0xff000000;
1498 }, width);
1499 srcp += srcskip;
1500 dstp += dstskip;
1501 }
1502}
1503
1504/* fast RGB888->(A)RGB888 blending with surface alpha */
1505static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1506{
1507 unsigned alpha = info->src->alpha;
1508 if(alpha == 128) {
1509 BlitRGBtoRGBSurfaceAlpha128(info);
1510 } else {
1511 int width = info->d_width;
1512 int height = info->d_height;
1513 Uint32 *srcp = (Uint32 *)info->s_pixels;
1514 int srcskip = info->s_skip >> 2;
1515 Uint32 *dstp = (Uint32 *)info->d_pixels;
1516 int dstskip = info->d_skip >> 2;
1517 Uint32 s;
1518 Uint32 d;
1519 Uint32 s1;
1520 Uint32 d1;
1521
1522 while(height--) {
1523 DUFFS_LOOP_DOUBLE2({
1524 /* One Pixel Blend */
1525 s = *srcp;
1526 d = *dstp;
1527 s1 = s & 0xff00ff;
1528 d1 = d & 0xff00ff;
1529 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1530 & 0xff00ff;
1531 s &= 0xff00;
1532 d &= 0xff00;
1533 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1534 *dstp = d1 | d | 0xff000000;
1535 ++srcp;
1536 ++dstp;
1537 },{
1538 /* Two Pixels Blend */
1539 s = *srcp;
1540 d = *dstp;
1541 s1 = s & 0xff00ff;
1542 d1 = d & 0xff00ff;
1543 d1 += (s1 - d1) * alpha >> 8;
1544 d1 &= 0xff00ff;
1545
1546 s = ((s & 0xff00) >> 8) |
1547 ((srcp[1] & 0xff00) << 8);
1548 d = ((d & 0xff00) >> 8) |
1549 ((dstp[1] & 0xff00) << 8);
1550 d += (s - d) * alpha >> 8;
1551 d &= 0x00ff00ff;
1552
1553 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1554 ++srcp;
1555
1556 s1 = *srcp;
1557 d1 = *dstp;
1558 s1 &= 0xff00ff;
1559 d1 &= 0xff00ff;
1560 d1 += (s1 - d1) * alpha >> 8;
1561 d1 &= 0xff00ff;
1562
1563 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1564 ++srcp;
1565 ++dstp;
1566 }, width);
1567 srcp += srcskip;
1568 dstp += dstskip;
1569 }
1570 }
1571}
1572
1573/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1574static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1575{
1576 int width = info->d_width;
1577 int height = info->d_height;
1578 Uint32 *srcp = (Uint32 *)info->s_pixels;
1579 int srcskip = info->s_skip >> 2;
1580 Uint32 *dstp = (Uint32 *)info->d_pixels;
1581 int dstskip = info->d_skip >> 2;
1582
1583 while(height--) {
1584 DUFFS_LOOP4({
1585 Uint32 dalpha;
1586 Uint32 d;
1587 Uint32 s1;
1588 Uint32 d1;
1589 Uint32 s = *srcp;
1590 Uint32 alpha = s >> 24;
1591 /* FIXME: Here we special-case opaque alpha since the
1592 compositioning used (>>8 instead of /255) doesn't handle
1593 it correctly. Also special-case alpha=0 for speed?
1594 Benchmark this! */
1595 if(alpha) {
1596 if(alpha == SDL_ALPHA_OPAQUE) {
1597 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1598 } else {
1599 /*
1600 * take out the middle component (green), and process
1601 * the other two in parallel. One multiply less.
1602 */
1603 d = *dstp;
1604 dalpha = d & 0xff000000;
1605 s1 = s & 0xff00ff;
1606 d1 = d & 0xff00ff;
1607 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1608 s &= 0xff00;
1609 d &= 0xff00;
1610 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1611 *dstp = d1 | d | dalpha;
1612 }
1613 }
1614 ++srcp;
1615 ++dstp;
1616 }, width);
1617 srcp += srcskip;
1618 dstp += dstskip;
1619 }
1620}
1621
1622#if GCC_ASMBLIT
1623/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC inline-assembly version.  The first asm statement loads constants
 * into mm3..mm7 (channel masks, multiply mask, zero, alpha shift) and the
 * per-pixel asm statements below rely on those registers still holding
 * them.  Per pixel: alpha 0 is skipped, alpha == amask copies the source
 * colour while keeping the destination alpha, anything else is blended as
 * dst + ((src - dst) * alpha >> 8).
 *
 * NOTE(review): none of the asm statements declares clobbers (MMX
 * registers or "memory") and they are not marked volatile; this relies on
 * the compiler neither touching MMX state nor reordering them.
 */
1624static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1625{
1626 int width = info->d_width;
1627 int height = info->d_height;
1628 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1629 int srcskip = info->s_skip >> 2;
1630 Uint32 *dstp = (Uint32 *)info->d_pixels;
1631 int dstskip = info->d_skip >> 2;
1632 SDL_PixelFormat* sf = info->src;
1633 Uint32 amask = sf->Amask;
1634
1635 __asm__ (
1636 /* make mm6 all zeros. */
1637 "pxor %%mm6, %%mm6\n"
1638
1639 /* Make a mask to preserve the alpha. */
1640 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1641 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1642 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1643 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1644 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1645
1646 /* form channel masks */
1647 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1648 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1649 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1650 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1651
1652 /* get alpha channel shift */
1653 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1654
1655 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1656
1657 while(height--) {
1658
1659 DUFFS_LOOP4({
1660 Uint32 alpha;
1661
     /* 3DNow! prefetch of the source and destination 64 bytes ahead */
1662 __asm__ (
1663 "prefetch 64(%0)\n"
1664 "prefetch 64(%1)\n"
1665 : : "r" (srcp), "r" (dstp) );
1666
     /* alpha is kept in place (not shifted down) so it can be compared
        directly with amask */
1667 alpha = *srcp & amask;
1668 /* FIXME: Here we special-case opaque alpha since the
1669 compositioning used (>>8 instead of /255) doesn't handle
1670 it correctly. Also special-case alpha=0 for speed?
1671 Benchmark this! */
1672 if(alpha == 0) {
1673 /* do nothing */
1674 }
1675 else if(alpha == amask) {
1676 /* opaque alpha -- copy RGB, keep dst alpha */
1677 /* using MMX here to free up regular registers for other things */
1678 __asm__ (
1679 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1680 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1681 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1682 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1683 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1684 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1685
1686 : : "r" (srcp), "r" (dstp) );
1687 }
1688
1689 else {
1690 __asm__ (
1691 /* load in the source, and dst. */
1692 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1693 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1694
1695 /* Move the src alpha into mm2 */
1696
1697 /* if supporting pshufw */
1698 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1699 /*"psrlw $8, %%mm2\n" */
1700
1701 /* else: */
1702 "movd %2, %%mm2\n"
1703 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1704 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1705 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1706 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1707
1708 /* move the colors into words. */
1709 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1710 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1711
1712 /* src - dst */
1713 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1714
1715 /* A * (src-dst) */
1716 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1717 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1718 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1719
1720 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1721
1722 "movd %%mm0, (%1)\n" /* result in mm0 */
1723
1724 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1725
1726 }
1727 ++srcp;
1728 ++dstp;
1729 }, width);
1730 srcp += srcskip;
1731 dstp += dstskip;
1732 }
1733
     /* leave MMX state so later x87 floating-point code works */
1734 __asm__ (
1735 "emms\n"
1736 : );
1737}
1738/* End GCC_ASMBLIT*/
1739
1740#elif MSVC_ASMBLIT
1741/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MSVC intrinsics version.  Per pixel: alpha 0 is skipped, alpha == amask
 * copies the source colour channels while keeping the remaining destination
 * bits, anything else is blended as dst + ((src - dst) * alpha >> 8) on
 * 16-bit lanes, with the alpha lane's multiplier zeroed so the destination
 * alpha survives.
 */
1742static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1743{
1744 int width = info->d_width;
1745 int height = info->d_height;
1746 Uint32 *srcp = (Uint32 *)info->s_pixels;
     /* skips are shifted right by 2 so they can be added to Uint32 pointers */
1747 int srcskip = info->s_skip >> 2;
1748 Uint32 *dstp = (Uint32 *)info->d_pixels;
1749 int dstskip = info->d_skip >> 2;
1750 SDL_PixelFormat* sf = info->src;
1751 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1752 Uint32 amask = sf->Amask;
1753 Uint32 ashift = sf->Ashift;
1754 Uint64 multmask;
1755
1756 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1757
1758 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
     /* Clear the 16-bit lane that holds alpha once pixels are unpacked to
        words: ashift is doubled because every byte becomes a word.
        (The i64 literal suffix is MSVC-specific.) */
1759 multmask = ~(0xFFFFi64 << (ashift * 2));
1760 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1761
1762 while(height--) {
1763 DUFFS_LOOP4({
1764 Uint32 alpha;
1765
     /* prefetch 16 pixels (64 bytes) ahead in source and destination */
1766 _m_prefetch(srcp + 16);
1767 _m_prefetch(dstp + 16);
1768
     /* alpha is kept in place (not shifted down) so it can be compared
        directly with amask */
1769 alpha = *srcp & amask;
1770 if (alpha == 0) {
1771 /* do nothing */
1772 } else if (alpha == amask) {
1773 /* copy RGB, keep dst alpha */
1774 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1775 } else {
1776 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1777 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1778
1779 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1780 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1781
1782 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1783 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1784 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1785 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1786 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1787
1788 /* blend */
1789 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1790 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1791 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1792 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1793 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1794
1795 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1796 }
1797 ++srcp;
1798 ++dstp;
1799 }, width);
1800 srcp += srcskip;
1801 dstp += dstskip;
1802 }
     /* leave MMX state so later x87 floating-point code works */
1803 _mm_empty();
1804}
1805/* End MSVC_ASMBLIT */
1806
1807#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1808
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/*
 * blend a single 16 bit pixel at 50%
 *
 * mask selects every channel bit except the lowest bit of each channel
 * (0xf7de for RGB565).  The masked values are added and halved; the low
 * bits that both pixels had set are then added back.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * d and s each hold two packed 16-bit pixels.  mask is widened to Uint32
 * BEFORE it is shifted: a Uint16 mask promotes to int, and shifting a
 * value such as 0xf7de left by 16 overflows a 32-bit int, which is
 * undefined behaviour.
 */
#define BLEND2x16_50(d, s, mask)					\
	((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	\
	 (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	\
	 ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
1819
1820static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1821{
1822 int width = info->d_width;
1823 int height = info->d_height;
1824 Uint16 *srcp = (Uint16 *)info->s_pixels;
1825 int srcskip = info->s_skip >> 1;
1826 Uint16 *dstp = (Uint16 *)info->d_pixels;
1827 int dstskip = info->d_skip >> 1;
1828
1829 while(height--) {
1830 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1831 /*
1832 * Source and destination not aligned, pipeline it.
1833 * This is mostly a win for big blits but no loss for
1834 * small ones
1835 */
1836 Uint32 prev_sw;
1837 int w = width;
1838
1839 /* handle odd destination */
1840 if((uintptr_t)dstp & 2) {
1841 Uint16 d = *dstp, s = *srcp;
1842 *dstp = BLEND16_50(d, s, mask);
1843 dstp++;
1844 srcp++;
1845 w--;
1846 }
1847 srcp++; /* srcp is now 32-bit aligned */
1848
1849 /* bootstrap pipeline with first halfword */
1850 prev_sw = ((Uint32 *)srcp)[-1];
1851
1852 while(w > 1) {
1853 Uint32 sw, dw, s;
1854 sw = *(Uint32 *)srcp;
1855 dw = *(Uint32 *)dstp;
1856#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1857 s = (prev_sw << 16) + (sw >> 16);
1858#else
1859 s = (prev_sw >> 16) + (sw << 16);
1860#endif
1861 prev_sw = sw;
1862 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1863 dstp += 2;
1864 srcp += 2;
1865 w -= 2;
1866 }
1867
1868 /* final pixel if any */
1869 if(w) {
1870 Uint16 d = *dstp, s;
1871#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1872 s = (Uint16)prev_sw;
1873#else
1874 s = (Uint16)(prev_sw >> 16);
1875#endif
1876 *dstp = BLEND16_50(d, s, mask);
1877 srcp++;
1878 dstp++;
1879 }
1880 srcp += srcskip - 1;
1881 dstp += dstskip;
1882 } else {
1883 /* source and destination are aligned */
1884 int w = width;
1885
1886 /* first odd pixel? */
1887 if((uintptr_t)srcp & 2) {
1888 Uint16 d = *dstp, s = *srcp;
1889 *dstp = BLEND16_50(d, s, mask);
1890 srcp++;
1891 dstp++;
1892 w--;
1893 }
1894 /* srcp and dstp are now 32-bit aligned */
1895
1896 while(w > 1) {
1897 Uint32 sw = *(Uint32 *)srcp;
1898 Uint32 dw = *(Uint32 *)dstp;
1899 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1900 srcp += 2;
1901 dstp += 2;
1902 w -= 2;
1903 }
1904
1905 /* last odd pixel? */
1906 if(w) {
1907 Uint16 d = *dstp, s = *srcp;
1908 *dstp = BLEND16_50(d, s, mask);
1909 srcp++;
1910 dstp++;
1911 }
1912 srcp += srcskip;
1913 dstp += dstskip;
1914 }
1915 }
1916}
1917
1918#if GCC_ASMBLIT
1919/* fast RGB565->RGB565 blending with surface alpha */
1920static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1921{
1922 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1923 if(alpha == 128) {
1924 Blit16to16SurfaceAlpha128(info, 0xf7de);
1925 } else {
1926 int width = info->d_width;
1927 int height = info->d_height;
1928 Uint16 *srcp = (Uint16 *)info->s_pixels;
1929 int srcskip = info->s_skip >> 1;
1930 Uint16 *dstp = (Uint16 *)info->d_pixels;
1931 int dstskip = info->d_skip >> 1;
1932 Uint32 s, d;
1933 Uint64 load;
1934
1935 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1936 load = alpha;
1937 alpha >>= 3; /* downscale alpha to 5 bits */
1938
1939 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1940 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1941 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1942 /* position alpha to allow for mullo and mulhi on diff channels
1943 to reduce the number of operations */
1944 psllq_i2r(3, mm0);
1945
1946 /* Setup the 565 color channel masks */
1947 load = 0x07E007E007E007E0ULL;
1948 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1949 load = 0x001F001F001F001FULL;
1950 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1951 while(height--) {
1952 DUFFS_LOOP_QUATRO2(
1953 {
1954 s = *srcp++;
1955 d = *dstp;
1956 /*
1957 * shift out the middle component (green) to
1958 * the high 16 bits, and process all three RGB
1959 * components at the same time.
1960 */
1961 s = (s | s << 16) & 0x07e0f81f;
1962 d = (d | d << 16) & 0x07e0f81f;
1963 d += (s - d) * alpha >> 5;
1964 d &= 0x07e0f81f;
1965 *dstp++ = d | d >> 16;
1966 },{
1967 s = *srcp++;
1968 d = *dstp;
1969 /*
1970 * shift out the middle component (green) to
1971 * the high 16 bits, and process all three RGB
1972 * components at the same time.
1973 */
1974 s = (s | s << 16) & 0x07e0f81f;
1975 d = (d | d << 16) & 0x07e0f81f;
1976 d += (s - d) * alpha >> 5;
1977 d &= 0x07e0f81f;
1978 *dstp++ = d | d >> 16;
1979 s = *srcp++;
1980 d = *dstp;
1981 /*
1982 * shift out the middle component (green) to
1983 * the high 16 bits, and process all three RGB
1984 * components at the same time.
1985 */
1986 s = (s | s << 16) & 0x07e0f81f;
1987 d = (d | d << 16) & 0x07e0f81f;
1988 d += (s - d) * alpha >> 5;
1989 d &= 0x07e0f81f;
1990 *dstp++ = d | d >> 16;
1991 },{
1992 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1993 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1994
1995 /* red -- does not need a mask since the right shift clears
1996 the uninteresting bits */
1997 movq_r2r(mm2, mm5); /* src -> mm5 */
1998 movq_r2r(mm3, mm6); /* dst -> mm6 */
1999 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2000 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2001
2002 /* blend */
2003 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2004 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2005 /* alpha used is actually 11 bits
2006 11 + 5 = 16 bits, so the sign bits are lost */
2007 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2008 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2009 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2010
2011 movq_r2r(mm6, mm1); /* save new reds in dsts */
2012
2013 /* green -- process the bits in place */
2014 movq_r2r(mm2, mm5); /* src -> mm5 */
2015 movq_r2r(mm3, mm6); /* dst -> mm6 */
2016 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2017 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2018
2019 /* blend */
2020 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2021 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2022 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2023 bits are gone and the sign bits present */
2024 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2025 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2026
2027 por_r2r(mm6, mm1); /* save new greens in dsts */
2028
2029 /* blue */
2030 movq_r2r(mm2, mm5); /* src -> mm5 */
2031 movq_r2r(mm3, mm6); /* dst -> mm6 */
2032 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2033 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2034
2035 /* blend */
2036 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2037 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2038 /* 11 + 5 = 16 bits, so the sign bits are lost and
2039 the interesting bits will need to be MASKed */
2040 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2041 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2042 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2043
2044 por_r2r(mm6, mm1); /* save new blues in dsts */
2045
2046 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2047
2048 srcp += 4;
2049 dstp += 4;
2050 }, width);
2051 srcp += srcskip;
2052 dstp += dstskip;
2053 }
2054 emms();
2055 }
2056}
2057
2058/* fast RGB555->RGB555 blending with surface alpha */
2059static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2060{
2061 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2062 if(alpha == 128) {
2063 Blit16to16SurfaceAlpha128(info, 0xfbde);
2064 } else {
2065 int width = info->d_width;
2066 int height = info->d_height;
2067 Uint16 *srcp = (Uint16 *)info->s_pixels;
2068 int srcskip = info->s_skip >> 1;
2069 Uint16 *dstp = (Uint16 *)info->d_pixels;
2070 int dstskip = info->d_skip >> 1;
2071 Uint32 s, d;
2072 Uint64 load;
2073
2074 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2075 load = alpha;
2076 alpha >>= 3; /* downscale alpha to 5 bits */
2077
2078 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2079 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2080 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2081 /* position alpha to allow for mullo and mulhi on diff channels
2082 to reduce the number of operations */
2083 psllq_i2r(3, mm0);
2084
2085 /* Setup the 555 color channel masks */
2086 load = 0x03E003E003E003E0ULL;
2087 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2088 load = 0x001F001F001F001FULL;
2089 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2090 while(height--) {
2091 DUFFS_LOOP_QUATRO2(
2092 {
2093 s = *srcp++;
2094 d = *dstp;
2095 /*
2096 * shift out the middle component (green) to
2097 * the high 16 bits, and process all three RGB
2098 * components at the same time.
2099 */
2100 s = (s | s << 16) & 0x03e07c1f;
2101 d = (d | d << 16) & 0x03e07c1f;
2102 d += (s - d) * alpha >> 5;
2103 d &= 0x03e07c1f;
2104 *dstp++ = d | d >> 16;
2105 },{
2106 s = *srcp++;
2107 d = *dstp;
2108 /*
2109 * shift out the middle component (green) to
2110 * the high 16 bits, and process all three RGB
2111 * components at the same time.
2112 */
2113 s = (s | s << 16) & 0x03e07c1f;
2114 d = (d | d << 16) & 0x03e07c1f;
2115 d += (s - d) * alpha >> 5;
2116 d &= 0x03e07c1f;
2117 *dstp++ = d | d >> 16;
2118 s = *srcp++;
2119 d = *dstp;
2120 /*
2121 * shift out the middle component (green) to
2122 * the high 16 bits, and process all three RGB
2123 * components at the same time.
2124 */
2125 s = (s | s << 16) & 0x03e07c1f;
2126 d = (d | d << 16) & 0x03e07c1f;
2127 d += (s - d) * alpha >> 5;
2128 d &= 0x03e07c1f;
2129 *dstp++ = d | d >> 16;
2130 },{
2131 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2132 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2133
2134 /* red -- process the bits in place */
2135 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2136 /* by reusing the GREEN mask we free up another mmx
2137 register to accumulate the result */
2138
2139 movq_r2r(mm2, mm5); /* src -> mm5 */
2140 movq_r2r(mm3, mm6); /* dst -> mm6 */
2141 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2142 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2143
2144 /* blend */
2145 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2146 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2147 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2148 cleared by a MASK below */
2149 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2150 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2151 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2152
2153 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2154
2155 movq_r2r(mm6, mm1); /* save new reds in dsts */
2156
2157 /* green -- process the bits in place */
2158 movq_r2r(mm2, mm5); /* src -> mm5 */
2159 movq_r2r(mm3, mm6); /* dst -> mm6 */
2160 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2161 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2162
2163 /* blend */
2164 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2165 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2166 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2167 bits are gone and the sign bits present */
2168 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2169 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2170
2171 por_r2r(mm6, mm1); /* save new greens in dsts */
2172
2173 /* blue */
2174 movq_r2r(mm2, mm5); /* src -> mm5 */
2175 movq_r2r(mm3, mm6); /* dst -> mm6 */
2176 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2177 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2178
2179 /* blend */
2180 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2181 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2182 /* 11 + 5 = 16 bits, so the sign bits are lost and
2183 the interesting bits will need to be MASKed */
2184 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2185 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2186 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2187
2188 por_r2r(mm6, mm1); /* save new blues in dsts */
2189
2190 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2191
2192 srcp += 4;
2193 dstp += 4;
2194 }, width);
2195 srcp += srcskip;
2196 dstp += dstskip;
2197 }
2198 emms();
2199 }
2200}
2201/* End GCC_ASMBLIT */
2202
2203#elif MSVC_ASMBLIT
2204/* fast RGB565->RGB565 blending with surface alpha */
2205static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2206{
2207 unsigned alpha = info->src->alpha;
2208 if(alpha == 128) {
2209 Blit16to16SurfaceAlpha128(info, 0xf7de);
2210 } else {
2211 int width = info->d_width;
2212 int height = info->d_height;
2213 Uint16 *srcp = (Uint16 *)info->s_pixels;
2214 int srcskip = info->s_skip >> 1;
2215 Uint16 *dstp = (Uint16 *)info->d_pixels;
2216 int dstskip = info->d_skip >> 1;
2217 Uint32 s, d;
2218
2219 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2220
2221 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2222 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2223 alpha >>= 3; /* downscale alpha to 5 bits */
2224
2225 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2226 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2227 /* position alpha to allow for mullo and mulhi on diff channels
2228 to reduce the number of operations */
2229 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2230
2231 /* Setup the 565 color channel masks */
2232 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2233 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2234
2235 while(height--) {
2236 DUFFS_LOOP_QUATRO2(
2237 {
2238 s = *srcp++;
2239 d = *dstp;
2240 /*
2241 * shift out the middle component (green) to
2242 * the high 16 bits, and process all three RGB
2243 * components at the same time.
2244 */
2245 s = (s | s << 16) & 0x07e0f81f;
2246 d = (d | d << 16) & 0x07e0f81f;
2247 d += (s - d) * alpha >> 5;
2248 d &= 0x07e0f81f;
2249 *dstp++ = (Uint16)(d | d >> 16);
2250 },{
2251 s = *srcp++;
2252 d = *dstp;
2253 /*
2254 * shift out the middle component (green) to
2255 * the high 16 bits, and process all three RGB
2256 * components at the same time.
2257 */
2258 s = (s | s << 16) & 0x07e0f81f;
2259 d = (d | d << 16) & 0x07e0f81f;
2260 d += (s - d) * alpha >> 5;
2261 d &= 0x07e0f81f;
2262 *dstp++ = (Uint16)(d | d >> 16);
2263 s = *srcp++;
2264 d = *dstp;
2265 /*
2266 * shift out the middle component (green) to
2267 * the high 16 bits, and process all three RGB
2268 * components at the same time.
2269 */
2270 s = (s | s << 16) & 0x07e0f81f;
2271 d = (d | d << 16) & 0x07e0f81f;
2272 d += (s - d) * alpha >> 5;
2273 d &= 0x07e0f81f;
2274 *dstp++ = (Uint16)(d | d >> 16);
2275 },{
2276 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2277 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2278
2279 /* red */
2280 src2 = src1;
2281 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2282
2283 dst2 = dst1;
2284 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2285
2286 /* blend */
2287 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2288 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2289 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2290 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2291 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2292
2293 mm_res = dst2; /* RED -> mm_res */
2294
2295 /* green -- process the bits in place */
2296 src2 = src1;
2297 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2298
2299 dst2 = dst1;
2300 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2301
2302 /* blend */
2303 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2304 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2305 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2306 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2307
2308 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2309
2310 /* blue */
2311 src2 = src1;
2312 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2313
2314 dst2 = dst1;
2315 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2316
2317 /* blend */
2318 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2319 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2320 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2321 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2322 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2323
2324 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2325
2326 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2327
2328 srcp += 4;
2329 dstp += 4;
2330 }, width);
2331 srcp += srcskip;
2332 dstp += dstskip;
2333 }
2334 _mm_empty();
2335 }
2336}
2337
2338/* fast RGB555->RGB555 blending with surface alpha */
2339static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2340{
2341 unsigned alpha = info->src->alpha;
2342 if(alpha == 128) {
2343 Blit16to16SurfaceAlpha128(info, 0xfbde);
2344 } else {
2345 int width = info->d_width;
2346 int height = info->d_height;
2347 Uint16 *srcp = (Uint16 *)info->s_pixels;
2348 int srcskip = info->s_skip >> 1;
2349 Uint16 *dstp = (Uint16 *)info->d_pixels;
2350 int dstskip = info->d_skip >> 1;
2351 Uint32 s, d;
2352
2353 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2354
2355 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2356 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2357 alpha >>= 3; /* downscale alpha to 5 bits */
2358
2359 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2360 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2361 /* position alpha to allow for mullo and mulhi on diff channels
2362 to reduce the number of operations */
2363 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2364
2365 /* Setup the 555 color channel masks */
2366 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2367 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2368 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2369
2370 while(height--) {
2371 DUFFS_LOOP_QUATRO2(
2372 {
2373 s = *srcp++;
2374 d = *dstp;
2375 /*
2376 * shift out the middle component (green) to
2377 * the high 16 bits, and process all three RGB
2378 * components at the same time.
2379 */
2380 s = (s | s << 16) & 0x03e07c1f;
2381 d = (d | d << 16) & 0x03e07c1f;
2382 d += (s - d) * alpha >> 5;
2383 d &= 0x03e07c1f;
2384 *dstp++ = (Uint16)(d | d >> 16);
2385 },{
2386 s = *srcp++;
2387 d = *dstp;
2388 /*
2389 * shift out the middle component (green) to
2390 * the high 16 bits, and process all three RGB
2391 * components at the same time.
2392 */
2393 s = (s | s << 16) & 0x03e07c1f;
2394 d = (d | d << 16) & 0x03e07c1f;
2395 d += (s - d) * alpha >> 5;
2396 d &= 0x03e07c1f;
2397 *dstp++ = (Uint16)(d | d >> 16);
2398 s = *srcp++;
2399 d = *dstp;
2400 /*
2401 * shift out the middle component (green) to
2402 * the high 16 bits, and process all three RGB
2403 * components at the same time.
2404 */
2405 s = (s | s << 16) & 0x03e07c1f;
2406 d = (d | d << 16) & 0x03e07c1f;
2407 d += (s - d) * alpha >> 5;
2408 d &= 0x03e07c1f;
2409 *dstp++ = (Uint16)(d | d >> 16);
2410 },{
2411 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2412 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2413
2414 /* red -- process the bits in place */
2415 src2 = src1;
2416 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2417
2418 dst2 = dst1;
2419 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2420
2421 /* blend */
2422 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2423 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2424 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2425 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2426 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2427
2428 mm_res = dst2; /* RED -> mm_res */
2429
2430 /* green -- process the bits in place */
2431 src2 = src1;
2432 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2433
2434 dst2 = dst1;
2435 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2436
2437 /* blend */
2438 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2439 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2440 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2441 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2442
2443 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2444
2445 /* blue */
2446 src2 = src1; /* src -> src2 */
2447 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2448
2449 dst2 = dst1; /* dst -> dst2 */
2450 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2451
2452 /* blend */
2453 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2454 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2455 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2456 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2457 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2458
2459 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2460
2461 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2462
2463 srcp += 4;
2464 dstp += 4;
2465 }, width);
2466 srcp += srcskip;
2467 dstp += dstskip;
2468 }
2469 _mm_empty();
2470 }
2471}
2472#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2473
2474/* fast RGB565->RGB565 blending with surface alpha */
2475static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2476{
2477 unsigned alpha = info->src->alpha;
2478 if(alpha == 128) {
2479 Blit16to16SurfaceAlpha128(info, 0xf7de);
2480 } else {
2481 int width = info->d_width;
2482 int height = info->d_height;
2483 Uint16 *srcp = (Uint16 *)info->s_pixels;
2484 int srcskip = info->s_skip >> 1;
2485 Uint16 *dstp = (Uint16 *)info->d_pixels;
2486 int dstskip = info->d_skip >> 1;
2487 alpha >>= 3; /* downscale alpha to 5 bits */
2488
2489 while(height--) {
2490 DUFFS_LOOP4({
2491 Uint32 s = *srcp++;
2492 Uint32 d = *dstp;
2493 /*
2494 * shift out the middle component (green) to
2495 * the high 16 bits, and process all three RGB
2496 * components at the same time.
2497 */
2498 s = (s | s << 16) & 0x07e0f81f;
2499 d = (d | d << 16) & 0x07e0f81f;
2500 d += (s - d) * alpha >> 5;
2501 d &= 0x07e0f81f;
2502 *dstp++ = (Uint16)(d | d >> 16);
2503 }, width);
2504 srcp += srcskip;
2505 dstp += dstskip;
2506 }
2507 }
2508}
2509
2510/* fast RGB555->RGB555 blending with surface alpha */
2511static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2512{
2513 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2514 if(alpha == 128) {
2515 Blit16to16SurfaceAlpha128(info, 0xfbde);
2516 } else {
2517 int width = info->d_width;
2518 int height = info->d_height;
2519 Uint16 *srcp = (Uint16 *)info->s_pixels;
2520 int srcskip = info->s_skip >> 1;
2521 Uint16 *dstp = (Uint16 *)info->d_pixels;
2522 int dstskip = info->d_skip >> 1;
2523 alpha >>= 3; /* downscale alpha to 5 bits */
2524
2525 while(height--) {
2526 DUFFS_LOOP4({
2527 Uint32 s = *srcp++;
2528 Uint32 d = *dstp;
2529 /*
2530 * shift out the middle component (green) to
2531 * the high 16 bits, and process all three RGB
2532 * components at the same time.
2533 */
2534 s = (s | s << 16) & 0x03e07c1f;
2535 d = (d | d << 16) & 0x03e07c1f;
2536 d += (s - d) * alpha >> 5;
2537 d &= 0x03e07c1f;
2538 *dstp++ = (Uint16)(d | d >> 16);
2539 }, width);
2540 srcp += srcskip;
2541 dstp += dstskip;
2542 }
2543 }
2544}
2545
2546/* fast ARGB8888->RGB565 blending with pixel alpha */
2547static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2548{
2549 int width = info->d_width;
2550 int height = info->d_height;
2551 Uint32 *srcp = (Uint32 *)info->s_pixels;
2552 int srcskip = info->s_skip >> 2;
2553 Uint16 *dstp = (Uint16 *)info->d_pixels;
2554 int dstskip = info->d_skip >> 1;
2555
2556 while(height--) {
2557 DUFFS_LOOP4({
2558 Uint32 s = *srcp;
2559 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2560 /* FIXME: Here we special-case opaque alpha since the
2561 compositioning used (>>8 instead of /255) doesn't handle
2562 it correctly. Also special-case alpha=0 for speed?
2563 Benchmark this! */
2564 if(alpha) {
2565 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2566 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2567 } else {
2568 Uint32 d = *dstp;
2569 /*
2570 * convert source and destination to G0RAB65565
2571 * and blend all components at the same time
2572 */
2573 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2574 + (s >> 3 & 0x1f);
2575 d = (d | d << 16) & 0x07e0f81f;
2576 d += (s - d) * alpha >> 5;
2577 d &= 0x07e0f81f;
2578 *dstp = (Uint16)(d | d >> 16);
2579 }
2580 }
2581 srcp++;
2582 dstp++;
2583 }, width);
2584 srcp += srcskip;
2585 dstp += dstskip;
2586 }
2587}
2588
2589/* fast ARGB8888->RGB555 blending with pixel alpha */
2590static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2591{
2592 int width = info->d_width;
2593 int height = info->d_height;
2594 Uint32 *srcp = (Uint32 *)info->s_pixels;
2595 int srcskip = info->s_skip >> 2;
2596 Uint16 *dstp = (Uint16 *)info->d_pixels;
2597 int dstskip = info->d_skip >> 1;
2598
2599 while(height--) {
2600 DUFFS_LOOP4({
2601 unsigned alpha;
2602 Uint32 s = *srcp;
2603 alpha = s >> 27; /* downscale alpha to 5 bits */
2604 /* FIXME: Here we special-case opaque alpha since the
2605 compositioning used (>>8 instead of /255) doesn't handle
2606 it correctly. Also special-case alpha=0 for speed?
2607 Benchmark this! */
2608 if(alpha) {
2609 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2610 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2611 } else {
2612 Uint32 d = *dstp;
2613 /*
2614 * convert source and destination to G0RAB65565
2615 * and blend all components at the same time
2616 */
2617 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2618 + (s >> 3 & 0x1f);
2619 d = (d | d << 16) & 0x03e07c1f;
2620 d += (s - d) * alpha >> 5;
2621 d &= 0x03e07c1f;
2622 *dstp = (Uint16)(d | d >> 16);
2623 }
2624 }
2625 srcp++;
2626 dstp++;
2627 }, width);
2628 srcp += srcskip;
2629 dstp += dstskip;
2630 }
2631}
2632
2633/* General (slow) N->N blending with per-surface alpha */
2634static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2635{
2636 int width = info->d_width;
2637 int height = info->d_height;
2638 Uint8 *src = info->s_pixels;
2639 int srcskip = info->s_skip;
2640 Uint8 *dst = info->d_pixels;
2641 int dstskip = info->d_skip;
2642 SDL_PixelFormat *srcfmt = info->src;
2643 SDL_PixelFormat *dstfmt = info->dst;
2644 int srcbpp = srcfmt->BytesPerPixel;
2645 int dstbpp = dstfmt->BytesPerPixel;
2646 unsigned sA = srcfmt->alpha;
2647 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2648
2649 if(sA) {
2650 while ( height-- ) {
2651 DUFFS_LOOP4(
2652 {
2653 Uint32 Pixel;
2654 unsigned sR;
2655 unsigned sG;
2656 unsigned sB;
2657 unsigned dR;
2658 unsigned dG;
2659 unsigned dB;
2660 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2661 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2662 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2663 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2664 src += srcbpp;
2665 dst += dstbpp;
2666 },
2667 width);
2668 src += srcskip;
2669 dst += dstskip;
2670 }
2671 }
2672}
2673
2674/* General (slow) colorkeyed N->N blending with per-surface alpha */
2675static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2676{
2677 int width = info->d_width;
2678 int height = info->d_height;
2679 Uint8 *src = info->s_pixels;
2680 int srcskip = info->s_skip;
2681 Uint8 *dst = info->d_pixels;
2682 int dstskip = info->d_skip;
2683 SDL_PixelFormat *srcfmt = info->src;
2684 SDL_PixelFormat *dstfmt = info->dst;
2685 Uint32 ckey = srcfmt->colorkey;
2686 int srcbpp = srcfmt->BytesPerPixel;
2687 int dstbpp = dstfmt->BytesPerPixel;
2688 unsigned sA = srcfmt->alpha;
2689 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2690
211e4bff 2691 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2692 Uint16 *src16 = (Uint16 *)src;
2693 Uint16 *dst16 = (Uint16 *)dst;
2694 sA >>= 3; /* downscale alpha to 5 bits */
2695 while ( height-- ) {
2696 DUFFS_LOOP4(
2697 {
2698 Uint32 s;
2699 Uint32 d;
2700 s = *src16;
2701 if(sA && s != ckey) {
2702 d = *dst16;
2703 s = (s | s << 16) & 0x07e0f81f;
2704 d = (d | d << 16) & 0x07e0f81f;
2705 d += (s - d) * sA >> 5;
2706 d &= 0x07e0f81f;
2707 *dst16 = (Uint16)(d | d >> 16);
2708 }
2709 src16++;
2710 dst16++;
2711 },
2712 width);
2713 src16 += srcskip / 2;
2714 dst16 += dstskip / 2;
2715 }
2716 return;
2717 }
2718
e14743d1 2719 while ( height-- ) {
2720 DUFFS_LOOP4(
2721 {
2722 Uint32 Pixel;
2723 unsigned sR;
2724 unsigned sG;
2725 unsigned sB;
2726 unsigned dR;
2727 unsigned dG;
2728 unsigned dB;
2729 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2730 if(sA && Pixel != ckey) {
2731 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2732 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2733 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2734 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2735 }
2736 src += srcbpp;
2737 dst += dstbpp;
2738 },
2739 width);
2740 src += srcskip;
2741 dst += dstskip;
2742 }
2743}
2744
2745/* General (slow) N->N blending with pixel alpha */
2746static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2747{
2748 int width = info->d_width;
2749 int height = info->d_height;
2750 Uint8 *src = info->s_pixels;
2751 int srcskip = info->s_skip;
2752 Uint8 *dst = info->d_pixels;
2753 int dstskip = info->d_skip;
2754 SDL_PixelFormat *srcfmt = info->src;
2755 SDL_PixelFormat *dstfmt = info->dst;
2756
2757 int srcbpp;
2758 int dstbpp;
2759
2760 /* Set up some basic variables */
2761 srcbpp = srcfmt->BytesPerPixel;
2762 dstbpp = dstfmt->BytesPerPixel;
2763
2764 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2765 quite right. for <8bpp source alpha, it gets them very wrong
2766 (check all macros!)
2767 It is unclear whether there is a good general solution that doesn't
2768 need a branch (or a divide). */
2769 while ( height-- ) {
2770 DUFFS_LOOP4(
2771 {
2772 Uint32 Pixel;
2773 unsigned sR;
2774 unsigned sG;
2775 unsigned sB;
2776 unsigned dR;
2777 unsigned dG;
2778 unsigned dB;
2779 unsigned sA;
2780 unsigned dA;
2781 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2782 if(sA) {
2783 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2784 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2785 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2786 }
2787 src += srcbpp;
2788 dst += dstbpp;
2789 },
2790 width);
2791 src += srcskip;
2792 dst += dstskip;
2793 }
2794}
2795
2796
2797SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2798{
2799 SDL_PixelFormat *sf = surface->format;
2800 SDL_PixelFormat *df = surface->map->dst->format;
2801
2802 if(sf->Amask == 0) {
2803 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2804 if(df->BytesPerPixel == 1)
2805 return BlitNto1SurfaceAlphaKey;
2806 else
2807#if SDL_ALTIVEC_BLITTERS
2808 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2809 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2810 return Blit32to32SurfaceAlphaKeyAltivec;
2811 else
2812#endif
2813 return BlitNtoNSurfaceAlphaKey;
2814 } else {
2815 /* Per-surface alpha blits */
2816 switch(df->BytesPerPixel) {
2817 case 1:
2818 return BlitNto1SurfaceAlpha;
2819
2820 case 2:
2821 if(surface->map->identity) {
2822 if(df->Gmask == 0x7e0)
2823 {
2824#if MMX_ASMBLIT
2825 if(SDL_HasMMX())
2826 return Blit565to565SurfaceAlphaMMX;
2827 else
2828#endif
2829 return Blit565to565SurfaceAlpha;
2830 }
2831 else if(df->Gmask == 0x3e0)
2832 {
2833#if MMX_ASMBLIT
2834 if(SDL_HasMMX())
2835 return Blit555to555SurfaceAlphaMMX;
2836 else
2837#endif
2838 return Blit555to555SurfaceAlpha;
2839 }
2840 }
2841 return BlitNtoNSurfaceAlpha;
2842
2843 case 4:
2844 if(sf->Rmask == df->Rmask
2845 && sf->Gmask == df->Gmask
2846 && sf->Bmask == df->Bmask
2847 && sf->BytesPerPixel == 4)
2848 {
2849#if MMX_ASMBLIT
2850 if(sf->Rshift % 8 == 0
2851 && sf->Gshift % 8 == 0
2852 && sf->Bshift % 8 == 0
2853 && SDL_HasMMX())
2854 return BlitRGBtoRGBSurfaceAlphaMMX;
bdfa6989 2855#endif
2856#ifdef __ARM_NEON__
2857 if(sf->Rshift % 8 == 0
2858 && sf->Gshift % 8 == 0
2859 && sf->Bshift % 8 == 0)
c85a5291 2860 {
bdfa6989 2861 return BlitARGBtoXRGBalphaS_neon;
c85a5291 2862 }
e14743d1 2863#endif
2864 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2865 {
2866#if SDL_ALTIVEC_BLITTERS
2867 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2868 && SDL_HasAltiVec())
2869 return BlitRGBtoRGBSurfaceAlphaAltivec;
2870#endif
2871 return BlitRGBtoRGBSurfaceAlpha;
2872 }
2873 }
c85a5291 2874#ifdef __ARM_NEON__
2875 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2876 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2877 {
2878 return BlitABGRtoXRGBalphaS_neon;
2879 }
2880#endif
e14743d1 2881#if SDL_ALTIVEC_BLITTERS
2882 if((sf->BytesPerPixel == 4) &&
2883 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2884 return Blit32to32SurfaceAlphaAltivec;
2885 else
2886#endif
2887 return BlitNtoNSurfaceAlpha;
2888
2889 case 3:
2890 default:
2891 return BlitNtoNSurfaceAlpha;
2892 }
2893 }
2894 } else {
2895 /* Per-pixel alpha blits */
2896 switch(df->BytesPerPixel) {
2897 case 1:
2898 return BlitNto1PixelAlpha;
2899
2900 case 2:
2901#if SDL_ALTIVEC_BLITTERS
2902 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2903 df->Gmask == 0x7e0 &&
2904 df->Bmask == 0x1f && SDL_HasAltiVec())
2905 return Blit32to565PixelAlphaAltivec;
2906 else
2907#endif
2908 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2909 && sf->Gmask == 0xff00
2910 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2911 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2912 if(df->Gmask == 0x7e0)
2913 return BlitARGBto565PixelAlpha;
2914 else if(df->Gmask == 0x3e0)
2915 return BlitARGBto555PixelAlpha;
2916 }
2917 return BlitNtoNPixelAlpha;
2918
2919 case 4:
2920 if(sf->Rmask == df->Rmask
2921 && sf->Gmask == df->Gmask
2922 && sf->Bmask == df->Bmask
2923 && sf->BytesPerPixel == 4)
2924 {
2925#if MMX_ASMBLIT
2926 if(sf->Rshift % 8 == 0
2927 && sf->Gshift % 8 == 0
2928 && sf->Bshift % 8 == 0
2929 && sf->Ashift % 8 == 0
2930 && sf->Aloss == 0)
2931 {
2932 if(SDL_Has3DNow())
2933 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2934 if(SDL_HasMMX())
2935 return BlitRGBtoRGBPixelAlphaMMX;
2936 }
c85a5291 2937#endif
2938#ifdef __ARM_NEON__
2939 if(sf->Rshift % 8 == 0
2940 && sf->Gshift % 8 == 0
2941 && sf->Bshift % 8 == 0
2942 && sf->Ashift % 8 == 0)
2943 {
2944 return BlitARGBtoXRGBalpha_neon;
2945 }
e14743d1 2946#endif
2947 if(sf->Amask == 0xff000000)
2948 {
2949#if SDL_ALTIVEC_BLITTERS
2950 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2951 && SDL_HasAltiVec())
2952 return BlitRGBtoRGBPixelAlphaAltivec;
2953#endif
2954 return BlitRGBtoRGBPixelAlpha;
2955 }
2956 }
a1f34081 2957#ifdef __ARM_NEON__
c85a5291 2958 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2959 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2960 && sf->Amask == 0xff000000)
a1f34081 2961 {
2962 return BlitABGRtoXRGBalpha_neon;
2963 }
2964#endif
e14743d1 2965#if SDL_ALTIVEC_BLITTERS
2966 if (sf->Amask && sf->BytesPerPixel == 4 &&
2967 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2968 return Blit32to32PixelAlphaAltivec;
2969 else
2970#endif
2971 return BlitNtoNPixelAlpha;
2972
2973 case 3:
2974 default:
2975 return BlitNtoNPixelAlpha;
2976 }
2977 }
2978}
2979