NEONize a few more blit types
[sdl_omap.git] src/video/SDL_blit_A.c
 1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
 29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35# define MMX_ASMBLIT 1
36# define GCC_ASMBLIT 1
37# elif defined(_MSC_VER) && defined(_M_IX86)
38# if (_MSC_VER <= 1200)
39# include <malloc.h>
40# if defined(_mm_free)
41# define HAVE_MMINTRIN_H 1
42# endif
43# else /* Visual Studio > VC6 always has mmintrin.h */
44# define HAVE_MMINTRIN_H 1
45# endif
46# if HAVE_MMINTRIN_H
47# define MMX_ASMBLIT 1
48# define MSVC_ASMBLIT 1
49# endif
50# endif
51#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
 64#ifdef __ARM_NEON__
65
66/* NEON optimized blitter callers */
67#define make_neon_caller(name, neon_name) \
68extern void neon_name(void *dst, const void *src, int count); \
69static void name(SDL_BlitInfo *info) \
70{ \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
 75	int dstBpp = info->dst->BytesPerPixel; \
 76	int srcstride = width * 4 + info->s_skip; \
 77	int dststride = width * dstBpp + info->d_skip; \
 78\
 79	while ( height-- ) { \
 80		neon_name(dst, src, width); \
 81		src += srcstride; \
 82		dst += dststride; \
 83	} \
84}
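/* Note: the generated wrapper assumes a 32bpp (4 bytes/pixel) source
   (srcstride = width * 4), while the destination stride honours
   dst->BytesPerPixel, which is how the RGB565 destination variants
   below share this same caller. */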
85
 86#define make_neon_callerS(name, neon_name) \
87extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
88static void name(SDL_BlitInfo *info) \
89{ \
90 int width = info->d_width; \
91 int height = info->d_height; \
92 Uint8 *src = info->s_pixels; \
93 Uint8 *dst = info->d_pixels; \
94 int srcskip = info->s_skip; \
95 int dstskip = info->d_skip; \
96 unsigned alpha = info->src->alpha;\
97\
98 while ( height-- ) { \
99 neon_name(dst, src, width, alpha); \
100 src += width * 4 + srcskip; \
101 dst += width * 4 + dstskip; \
102 } \
103}
104
105make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
106make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
107make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
108make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
109make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
110make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
111
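/* Illustrative scalar reference (not compiled): a sketch of the contract the
   wrappers above are assumed to rely on for the external neon_* routines --
   blend `count' 32bpp source pixels with per-pixel alpha onto a 32bpp XRGB
   destination, using the same >>8 approximation as the C blitters later in
   this file.  The real routines are NEON assembly provided elsewhere in the
   tree; the name below is hypothetical. */
#if 0
static void scalar_ARGBtoXRGBalpha_ref(void *dst, const void *src, int count)
{
	const Uint32 *s = (const Uint32 *)src;
	Uint32 *d = (Uint32 *)dst;
	while (count--) {
		Uint32 sp = *s++;
		Uint32 dp = *d;
		Uint32 alpha = sp >> 24;
		Uint32 s1 = sp & 0x00ff00ff, d1 = dp & 0x00ff00ff;	/* R and B */
		Uint32 s2 = sp & 0x0000ff00, d2 = dp & 0x0000ff00;	/* G */
		d1 = (d1 + (((s1 - d1) * alpha) >> 8)) & 0x00ff00ff;
		d2 = (d2 + (((s2 - d2) * alpha) >> 8)) & 0x0000ff00;
		*d++ = d1 | d2 | (dp & 0xff000000);	/* keep dst alpha byte */
	}
}
#endif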
112#endif /* __ARM_NEON__ */
113
114/* N->1 blending with per-surface alpha */
115static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
116{
117 int width = info->d_width;
118 int height = info->d_height;
119 Uint8 *src = info->s_pixels;
120 int srcskip = info->s_skip;
121 Uint8 *dst = info->d_pixels;
122 int dstskip = info->d_skip;
123 Uint8 *palmap = info->table;
124 SDL_PixelFormat *srcfmt = info->src;
125 SDL_PixelFormat *dstfmt = info->dst;
126 int srcbpp = srcfmt->BytesPerPixel;
127
128 const unsigned A = srcfmt->alpha;
129
130 while ( height-- ) {
131 DUFFS_LOOP4(
132 {
133 Uint32 Pixel;
134 unsigned sR;
135 unsigned sG;
136 unsigned sB;
137 unsigned dR;
138 unsigned dG;
139 unsigned dB;
140 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
141 dR = dstfmt->palette->colors[*dst].r;
142 dG = dstfmt->palette->colors[*dst].g;
143 dB = dstfmt->palette->colors[*dst].b;
144 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
145 dR &= 0xff;
146 dG &= 0xff;
147 dB &= 0xff;
148 /* Pack RGB into 8bit pixel */
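				/* i.e. RGB332: top 3 bits of R, 3 of G, 2 of B */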
149 if ( palmap == NULL ) {
150 *dst =((dR>>5)<<(3+2))|
151 ((dG>>5)<<(2))|
152 ((dB>>6)<<(0));
153 } else {
154 *dst = palmap[((dR>>5)<<(3+2))|
155 ((dG>>5)<<(2)) |
156 ((dB>>6)<<(0))];
157 }
158 dst++;
159 src += srcbpp;
160 },
161 width);
162 src += srcskip;
163 dst += dstskip;
164 }
165}
166
167/* N->1 blending with pixel alpha */
168static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
169{
170 int width = info->d_width;
171 int height = info->d_height;
172 Uint8 *src = info->s_pixels;
173 int srcskip = info->s_skip;
174 Uint8 *dst = info->d_pixels;
175 int dstskip = info->d_skip;
176 Uint8 *palmap = info->table;
177 SDL_PixelFormat *srcfmt = info->src;
178 SDL_PixelFormat *dstfmt = info->dst;
179 int srcbpp = srcfmt->BytesPerPixel;
180
181 /* FIXME: fix alpha bit field expansion here too? */
182 while ( height-- ) {
183 DUFFS_LOOP4(
184 {
185 Uint32 Pixel;
186 unsigned sR;
187 unsigned sG;
188 unsigned sB;
189 unsigned sA;
190 unsigned dR;
191 unsigned dG;
192 unsigned dB;
193 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
194 dR = dstfmt->palette->colors[*dst].r;
195 dG = dstfmt->palette->colors[*dst].g;
196 dB = dstfmt->palette->colors[*dst].b;
197 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
198 dR &= 0xff;
199 dG &= 0xff;
200 dB &= 0xff;
201 /* Pack RGB into 8bit pixel */
202 if ( palmap == NULL ) {
203 *dst =((dR>>5)<<(3+2))|
204 ((dG>>5)<<(2))|
205 ((dB>>6)<<(0));
206 } else {
207 *dst = palmap[((dR>>5)<<(3+2))|
208 ((dG>>5)<<(2)) |
209 ((dB>>6)<<(0)) ];
210 }
211 dst++;
212 src += srcbpp;
213 },
214 width);
215 src += srcskip;
216 dst += dstskip;
217 }
218}
219
220/* colorkeyed N->1 blending with per-surface alpha */
221static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
222{
223 int width = info->d_width;
224 int height = info->d_height;
225 Uint8 *src = info->s_pixels;
226 int srcskip = info->s_skip;
227 Uint8 *dst = info->d_pixels;
228 int dstskip = info->d_skip;
229 Uint8 *palmap = info->table;
230 SDL_PixelFormat *srcfmt = info->src;
231 SDL_PixelFormat *dstfmt = info->dst;
232 int srcbpp = srcfmt->BytesPerPixel;
233 Uint32 ckey = srcfmt->colorkey;
234
235 const int A = srcfmt->alpha;
236
237 while ( height-- ) {
238 DUFFS_LOOP(
239 {
240 Uint32 Pixel;
241 unsigned sR;
242 unsigned sG;
243 unsigned sB;
244 unsigned dR;
245 unsigned dG;
246 unsigned dB;
247 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
248 if ( Pixel != ckey ) {
249 dR = dstfmt->palette->colors[*dst].r;
250 dG = dstfmt->palette->colors[*dst].g;
251 dB = dstfmt->palette->colors[*dst].b;
252 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
253 dR &= 0xff;
254 dG &= 0xff;
255 dB &= 0xff;
256 /* Pack RGB into 8bit pixel */
257 if ( palmap == NULL ) {
258 *dst =((dR>>5)<<(3+2))|
259 ((dG>>5)<<(2)) |
260 ((dB>>6)<<(0));
261 } else {
262 *dst = palmap[((dR>>5)<<(3+2))|
263 ((dG>>5)<<(2)) |
264 ((dB>>6)<<(0)) ];
265 }
266 }
267 dst++;
268 src += srcbpp;
269 },
270 width);
271 src += srcskip;
272 dst += dstskip;
273 }
274}
275
276#if GCC_ASMBLIT
277/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
278static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
279{
280 int width = info->d_width;
281 int height = info->d_height;
282 Uint32 *srcp = (Uint32 *)info->s_pixels;
283 int srcskip = info->s_skip >> 2;
284 Uint32 *dstp = (Uint32 *)info->d_pixels;
285 int dstskip = info->d_skip >> 2;
286 Uint32 dalpha = info->dst->Amask;
287 Uint64 load;
288
289 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
290 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
291 load = 0x0001010100010101ULL;/* !alpha128 mask */
292 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
293 movd_m2r(dalpha, mm7); /* dst alpha mask */
294 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
295 while(height--) {
296 DUFFS_LOOP_DOUBLE2(
297 {
298 Uint32 s = *srcp++;
299 Uint32 d = *dstp;
300 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
301 + (s & d & 0x00010101)) | dalpha;
302 },{
303 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
304 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
305
306 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
307 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
308
309 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
310 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
311 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
312 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
313 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
314 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
315 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
316
317 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
318 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
319 dstp += 2;
320 srcp += 2;
321 }, width);
322 srcp += srcskip;
323 dstp += dstskip;
324 }
325 emms();
326}
327
328/* fast RGB888->(A)RGB888 blending with surface alpha */
329static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
330{
331 SDL_PixelFormat* df = info->dst;
332 unsigned alpha = info->src->alpha;
333
334 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
335 /* only call a128 version when R,G,B occupy lower bits */
336 BlitRGBtoRGBSurfaceAlpha128MMX(info);
337 } else {
338 int width = info->d_width;
339 int height = info->d_height;
340 Uint32 *srcp = (Uint32 *)info->s_pixels;
341 int srcskip = info->s_skip >> 2;
342 Uint32 *dstp = (Uint32 *)info->d_pixels;
343 int dstskip = info->d_skip >> 2;
344
345 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
346 /* form the alpha mult */
347 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
348 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
349 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
350 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
351 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
352 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
353 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
354 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
355 movd_m2r(df->Amask, mm7); /* dst alpha mask */
356 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
357
358 while(height--) {
359 DUFFS_LOOP_DOUBLE2({
360 /* One Pixel Blend */
361 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
362 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
363 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
364 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
365
366 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
367 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
368 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
369 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
370
371 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
372 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
373 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
374 ++srcp;
375 ++dstp;
376 },{
377 /* Two Pixels Blend */
378 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
379 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
380 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
381 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
382
383 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
384 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
385 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
386 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
387
388 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
389 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
 390 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
391 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
392
393 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
394 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
395 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
396 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
397
398 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
399 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
400
401 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
402
403 srcp += 2;
404 dstp += 2;
405 }, width);
406 srcp += srcskip;
407 dstp += dstskip;
408 }
409 emms();
410 }
411}
412
413/* fast ARGB888->(A)RGB888 blending with pixel alpha */
414static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
415{
416 int width = info->d_width;
417 int height = info->d_height;
418 Uint32 *srcp = (Uint32 *)info->s_pixels;
419 int srcskip = info->s_skip >> 2;
420 Uint32 *dstp = (Uint32 *)info->d_pixels;
421 int dstskip = info->d_skip >> 2;
422 SDL_PixelFormat* sf = info->src;
423 Uint32 amask = sf->Amask;
424
425 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
426 /* form multiplication mask */
427 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
428 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
429 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
430 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
431 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
432 /* form channel masks */
433 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
434 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
435 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
436 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
437 /* get alpha channel shift */
438 __asm__ __volatile__ (
439 "movd %0, %%mm5"
440 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
441
442 while(height--) {
443 DUFFS_LOOP4({
444 Uint32 alpha = *srcp & amask;
445 /* FIXME: Here we special-case opaque alpha since the
 446 compositing used (>>8 instead of /255) doesn't handle
447 it correctly. Also special-case alpha=0 for speed?
448 Benchmark this! */
449 if(alpha == 0) {
450 /* do nothing */
451 } else if(alpha == amask) {
452 /* opaque alpha -- copy RGB, keep dst alpha */
453 /* using MMX here to free up regular registers for other things */
454 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
455 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
456 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
457 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
458 por_r2r(mm1, mm2); /* src | dst -> mm2 */
459 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
460 } else {
461 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
462 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
463
464 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
465 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
466
467 __asm__ __volatile__ (
468 "movd %0, %%mm4"
469 : : "r" (alpha) ); /* 0000A000 -> mm4 */
470 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
471 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
472 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
473 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
474
475 /* blend */
476 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
477 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
478 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
479 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
480
481 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
482 movd_r2m(mm2, *dstp);/* mm2 -> dst */
483 }
484 ++srcp;
485 ++dstp;
486 }, width);
487 srcp += srcskip;
488 dstp += dstskip;
489 }
490 emms();
491}
492/* End GCC_ASMBLIT */
493
494#elif MSVC_ASMBLIT
495/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
496static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
497{
498 int width = info->d_width;
499 int height = info->d_height;
500 Uint32 *srcp = (Uint32 *)info->s_pixels;
501 int srcskip = info->s_skip >> 2;
502 Uint32 *dstp = (Uint32 *)info->d_pixels;
503 int dstskip = info->d_skip >> 2;
504 Uint32 dalpha = info->dst->Amask;
505
506 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
507
508 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
509 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
510 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
511
512 while (height--) {
513 int n = width;
514 if ( n & 1 ) {
515 Uint32 s = *srcp++;
516 Uint32 d = *dstp;
517 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
518 + (s & d & 0x00010101)) | dalpha;
519 n--;
520 }
521
522 for (n >>= 1; n > 0; --n) {
523 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
524 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
525
526 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
527 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
528
529 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
530 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
531 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
532 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
533
534 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
535 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
536 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
537 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
538
539 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
540 dstp += 2;
541 srcp += 2;
542 }
543
544 srcp += srcskip;
545 dstp += dstskip;
546 }
547 _mm_empty();
548}
549
550/* fast RGB888->(A)RGB888 blending with surface alpha */
551static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
552{
553 SDL_PixelFormat* df = info->dst;
554 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
555 unsigned alpha = info->src->alpha;
556
557 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
558 /* only call a128 version when R,G,B occupy lower bits */
559 BlitRGBtoRGBSurfaceAlpha128MMX(info);
560 } else {
561 int width = info->d_width;
562 int height = info->d_height;
563 Uint32 *srcp = (Uint32 *)info->s_pixels;
564 int srcskip = info->s_skip >> 2;
565 Uint32 *dstp = (Uint32 *)info->d_pixels;
566 int dstskip = info->d_skip >> 2;
567 Uint32 dalpha = df->Amask;
568 Uint32 amult;
569
570 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
571
572 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
573 /* form the alpha mult */
574 amult = alpha | (alpha << 8);
575 amult = amult | (amult << 16);
576 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
577 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
578 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
579 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
580 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
581
582 while (height--) {
583 int n = width;
584 if (n & 1) {
585 /* One Pixel Blend */
586 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
587 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
588
589 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
590 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
591
592 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
593 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
594 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
595 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
596
597 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
598 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
599 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
600
601 ++srcp;
602 ++dstp;
603
604 n--;
605 }
606
607 for (n >>= 1; n > 0; --n) {
608 /* Two Pixels Blend */
609 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
610 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
611 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
612 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
613
614 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
615 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
616 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
617 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
618
619 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
620 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
621 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
622 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
623
624 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
625 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
626 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
627 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
628
629 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
630 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
631
632 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
633
634 srcp += 2;
635 dstp += 2;
636 }
637 srcp += srcskip;
638 dstp += dstskip;
639 }
640 _mm_empty();
641 }
642}
643
644/* fast ARGB888->(A)RGB888 blending with pixel alpha */
645static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
646{
647 int width = info->d_width;
648 int height = info->d_height;
649 Uint32 *srcp = (Uint32 *)info->s_pixels;
650 int srcskip = info->s_skip >> 2;
651 Uint32 *dstp = (Uint32 *)info->d_pixels;
652 int dstskip = info->d_skip >> 2;
653 SDL_PixelFormat* sf = info->src;
654 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
655 Uint32 amask = sf->Amask;
656 Uint32 ashift = sf->Ashift;
657 Uint64 multmask;
658
659 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
660
661 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
662 multmask = ~(0xFFFFi64 << (ashift * 2));
663 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
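	/* Unpacking bytes to 16-bit lanes doubles each channel's bit offset,
	   so the alpha lane sits at bit (ashift * 2); clearing those 16 bits
	   in dmask zeroes the alpha multiplier and leaves dst alpha intact. */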
664
665 while(height--) {
666 DUFFS_LOOP4({
667 Uint32 alpha = *srcp & amask;
668 if (alpha == 0) {
669 /* do nothing */
670 } else if (alpha == amask) {
671 /* opaque alpha -- copy RGB, keep dst alpha */
672 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
673 } else {
674 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
675 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
676
677 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
678 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
679
680 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
681 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
682 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
683 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
684 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
685
686 /* blend */
687 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
688 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
689 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
690 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
691 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
692
693 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
694 }
695 ++srcp;
696 ++dstp;
697 }, width);
698 srcp += srcskip;
699 dstp += dstskip;
700 }
701 _mm_empty();
702}
703/* End MSVC_ASMBLIT */
704
705#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
706
707#if SDL_ALTIVEC_BLITTERS
708#if __MWERKS__
709#pragma altivec_model on
710#endif
711#if HAVE_ALTIVEC_H
712#include <altivec.h>
713#endif
714#include <assert.h>
715
716#if (defined(__MACOSX__) && (__GNUC__ < 4))
717 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
718 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
719 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
720 (vector unsigned short) ( a,b,c,d,e,f,g,h )
721#else
722 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
723 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
724 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
725 (vector unsigned short) { a,b,c,d,e,f,g,h }
726#endif
727
728#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
729#define VECPRINT(msg, v) do { \
730 vector unsigned int tmpvec = (vector unsigned int)(v); \
731 unsigned int *vp = (unsigned int *)&tmpvec; \
732 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
733} while (0)
734
735/* the permutation vector that takes the high bytes out of all the appropriate shorts
736 (vector unsigned char)(
737 0x00, 0x10, 0x02, 0x12,
738 0x04, 0x14, 0x06, 0x16,
739 0x08, 0x18, 0x0A, 0x1A,
740 0x0C, 0x1C, 0x0E, 0x1E );
741*/
742#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
743#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
744#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
745#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
746 ? vec_lvsl(0, src) \
747 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
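/* vec_splat_u32() only takes immediates in -16..15, so VEC_U32_24 builds the
   shift count 24 as 12+12.  VEC_ALPHA_MASK shifts all-ones words left by 24
   bits, yielding 0xFF000000 in every element (the ARGB alpha byte).
   VEC_ALIGNER returns a vec_perm selector that realigns source data whether
   or not the source pointer is 16-byte aligned. */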
748
749
750#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
751 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
752 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
753 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
754 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
755 /* valpha2 is 255-alpha */ \
756 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
757 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
758 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
759 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
760 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
761 /* add source and dest */ \
762 vtemp1 = vec_add(vtemp1, vtemp3); \
763 vtemp2 = vec_add(vtemp2, vtemp4); \
764 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
765 vtemp1 = vec_add(vtemp1, v1_16); \
766 vtemp3 = vec_sr(vtemp1, v8_16); \
767 vtemp1 = vec_add(vtemp1, vtemp3); \
768 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
769 vtemp2 = vec_add(vtemp2, v1_16); \
770 vtemp4 = vec_sr(vtemp2, v8_16); \
771 vtemp2 = vec_add(vtemp2, vtemp4); \
772 /* (>>8) and get ARGBARGBARGBARGB */ \
773 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
774} while (0)
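/* In other words: for each channel this computes t = s*a + d*(255 - a) and
   approximates t/255 as ((t + 1) + ((t + 1) >> 8)) >> 8 (0 maps to 0 and
   255*255 maps to 255); mergePermute supplies the final >>8 by taking the
   high byte of every 16-bit lane while re-interleaving the even (A,G) and
   odd (R,B) products back into ARGB byte order. */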
775
776/* Calculate the permute vector used for 32->32 swizzling */
777static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
778 const SDL_PixelFormat *dstfmt)
779{
780 /*
781 * We have to assume that the bits that aren't used by other
 782 * colors are alpha, and that it's one complete byte, since some formats
783 * leave alpha with a zero mask, but we should still swizzle the bits.
784 */
785 /* ARGB */
786 const static struct SDL_PixelFormat default_pixel_format = {
787 NULL, 0, 0,
788 0, 0, 0, 0,
789 16, 8, 0, 24,
790 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
791 0, 0};
792 if (!srcfmt) {
793 srcfmt = &default_pixel_format;
794 }
795 if (!dstfmt) {
796 dstfmt = &default_pixel_format;
797 }
798 const vector unsigned char plus = VECUINT8_LITERAL
799 ( 0x00, 0x00, 0x00, 0x00,
800 0x04, 0x04, 0x04, 0x04,
801 0x08, 0x08, 0x08, 0x08,
802 0x0C, 0x0C, 0x0C, 0x0C );
803 vector unsigned char vswiz;
804 vector unsigned int srcvec;
805#define RESHIFT(X) (3 - ((X) >> 3))
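/* RESHIFT maps a channel shift (0, 8, 16, 24) to its big-endian byte index
   (3, 2, 1, 0) within the 32-bit pixel, as used to build the vec_perm selector. */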
806 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
807 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
808 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
809 Uint32 amask;
810 /* Use zero for alpha if either surface doesn't have alpha */
811 if (dstfmt->Amask) {
812 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
813 } else {
814 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
815 }
816#undef RESHIFT
817 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
818 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
819 return(vswiz);
820}
821
822static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
823{
824 int height = info->d_height;
825 Uint8 *src = (Uint8 *)info->s_pixels;
826 int srcskip = info->s_skip;
827 Uint8 *dst = (Uint8 *)info->d_pixels;
828 int dstskip = info->d_skip;
829 SDL_PixelFormat *srcfmt = info->src;
830
831 vector unsigned char v0 = vec_splat_u8(0);
832 vector unsigned short v8_16 = vec_splat_u16(8);
833 vector unsigned short v1_16 = vec_splat_u16(1);
834 vector unsigned short v2_16 = vec_splat_u16(2);
835 vector unsigned short v3_16 = vec_splat_u16(3);
836 vector unsigned int v8_32 = vec_splat_u32(8);
837 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
838 vector unsigned short v3f = VECUINT16_LITERAL(
839 0x003f, 0x003f, 0x003f, 0x003f,
840 0x003f, 0x003f, 0x003f, 0x003f);
841 vector unsigned short vfc = VECUINT16_LITERAL(
842 0x00fc, 0x00fc, 0x00fc, 0x00fc,
843 0x00fc, 0x00fc, 0x00fc, 0x00fc);
844
845 /*
846 0x10 - 0x1f is the alpha
847 0x00 - 0x0e evens are the red
848 0x01 - 0x0f odds are zero
849 */
850 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
851 0x10, 0x00, 0x01, 0x01,
852 0x10, 0x02, 0x01, 0x01,
853 0x10, 0x04, 0x01, 0x01,
854 0x10, 0x06, 0x01, 0x01
855 );
856 vector unsigned char vredalpha2 = (vector unsigned char)(
857 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
858 );
859 /*
860 0x00 - 0x0f is ARxx ARxx ARxx ARxx
 861 0x11 - 0x1f odds are blue
862 */
863 vector unsigned char vblue1 = VECUINT8_LITERAL(
864 0x00, 0x01, 0x02, 0x11,
865 0x04, 0x05, 0x06, 0x13,
866 0x08, 0x09, 0x0a, 0x15,
867 0x0c, 0x0d, 0x0e, 0x17
868 );
869 vector unsigned char vblue2 = (vector unsigned char)(
870 vec_add((vector unsigned int)vblue1, v8_32)
871 );
872 /*
873 0x00 - 0x0f is ARxB ARxB ARxB ARxB
 874 0x10 - 0x1e evens are green
875 */
876 vector unsigned char vgreen1 = VECUINT8_LITERAL(
877 0x00, 0x01, 0x10, 0x03,
878 0x04, 0x05, 0x12, 0x07,
879 0x08, 0x09, 0x14, 0x0b,
880 0x0c, 0x0d, 0x16, 0x0f
881 );
882 vector unsigned char vgreen2 = (vector unsigned char)(
883 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
884 );
885 vector unsigned char vgmerge = VECUINT8_LITERAL(
886 0x00, 0x02, 0x00, 0x06,
887 0x00, 0x0a, 0x00, 0x0e,
888 0x00, 0x12, 0x00, 0x16,
889 0x00, 0x1a, 0x00, 0x1e);
890 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
891 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
892 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
893
894 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
895 vf800 = vec_sl(vf800, vec_splat_u16(8));
896
897 while(height--) {
898 int extrawidth;
899 vector unsigned char valigner;
900 vector unsigned char vsrc;
901 vector unsigned char voverflow;
902 int width = info->d_width;
903
904#define ONE_PIXEL_BLEND(condition, widthvar) \
905 while (condition) { \
906 Uint32 Pixel; \
907 unsigned sR, sG, sB, dR, dG, dB, sA; \
908 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
909 if(sA) { \
910 unsigned short dstpixel = *((unsigned short *)dst); \
911 dR = (dstpixel >> 8) & 0xf8; \
912 dG = (dstpixel >> 3) & 0xfc; \
913 dB = (dstpixel << 3) & 0xf8; \
914 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
915 *((unsigned short *)dst) = ( \
916 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
917 ); \
918 } \
919 src += 4; \
920 dst += 2; \
921 widthvar--; \
922 }
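	/* Blend leading pixels one at a time until dst is 16-byte aligned,
	   run the vector loop on groups of 8 source pixels (two 16-byte ARGB
	   loads -> one 16-byte 565 store), then finish the remainder
	   (extrawidth) with the same scalar macro. */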
923 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
924 extrawidth = (width % 8);
925 valigner = VEC_ALIGNER(src);
926 vsrc = (vector unsigned char)vec_ld(0, src);
927 width -= extrawidth;
928 while (width) {
929 vector unsigned char valpha;
930 vector unsigned char vsrc1, vsrc2;
931 vector unsigned char vdst1, vdst2;
932 vector unsigned short vR, vG, vB;
933 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
934
935 /* Load 8 pixels from src as ARGB */
936 voverflow = (vector unsigned char)vec_ld(15, src);
937 vsrc = vec_perm(vsrc, voverflow, valigner);
938 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
939 src += 16;
940 vsrc = (vector unsigned char)vec_ld(15, src);
941 voverflow = vec_perm(voverflow, vsrc, valigner);
942 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
943 src += 16;
944
945 /* Load 8 pixels from dst as XRGB */
946 voverflow = vec_ld(0, dst);
947 vR = vec_and((vector unsigned short)voverflow, vf800);
948 vB = vec_sl((vector unsigned short)voverflow, v3_16);
949 vG = vec_sl(vB, v2_16);
950 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
951 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
952 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
953 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
954 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
955 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
956
957 /* Alpha blend 8 pixels as ARGB */
958 valpha = vec_perm(vsrc1, v0, valphaPermute);
959 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
960 valpha = vec_perm(vsrc2, v0, valphaPermute);
961 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
962
963 /* Convert 8 pixels to 565 */
964 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
965 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
966 vgpixel = vec_and(vgpixel, vfc);
967 vgpixel = vec_sl(vgpixel, v3_16);
968 vrpixel = vec_sl(vpixel, v1_16);
969 vrpixel = vec_and(vrpixel, vf800);
970 vbpixel = vec_and(vpixel, v3f);
971 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
972 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
973
974 /* Store 8 pixels */
975 vec_st(vdst1, 0, dst);
976
977 width -= 8;
978 dst += 16;
979 }
980 ONE_PIXEL_BLEND((extrawidth), extrawidth);
981#undef ONE_PIXEL_BLEND
982 src += srcskip;
983 dst += dstskip;
984 }
985}
986
987static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
988{
989 unsigned alpha = info->src->alpha;
990 int height = info->d_height;
991 Uint32 *srcp = (Uint32 *)info->s_pixels;
992 int srcskip = info->s_skip >> 2;
993 Uint32 *dstp = (Uint32 *)info->d_pixels;
994 int dstskip = info->d_skip >> 2;
995 SDL_PixelFormat *srcfmt = info->src;
996 SDL_PixelFormat *dstfmt = info->dst;
997 unsigned sA = srcfmt->alpha;
998 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
999 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1000 Uint32 ckey = info->src->colorkey;
1001 vector unsigned char mergePermute;
1002 vector unsigned char vsrcPermute;
1003 vector unsigned char vdstPermute;
1004 vector unsigned char vsdstPermute;
1005 vector unsigned char valpha;
1006 vector unsigned char valphamask;
1007 vector unsigned char vbits;
1008 vector unsigned char v0;
1009 vector unsigned short v1;
1010 vector unsigned short v8;
1011 vector unsigned int vckey;
1012 vector unsigned int vrgbmask;
1013
1014 mergePermute = VEC_MERGE_PERMUTE();
1015 v0 = vec_splat_u8(0);
1016 v1 = vec_splat_u16(1);
1017 v8 = vec_splat_u16(8);
1018
1019 /* set the alpha to 255 on the destination surf */
1020 valphamask = VEC_ALPHA_MASK();
1021
1022 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1023 vdstPermute = calc_swizzle32(NULL, dstfmt);
1024 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1025
1026 /* set a vector full of alpha and 255-alpha */
1027 ((unsigned char *)&valpha)[0] = alpha;
1028 valpha = vec_splat(valpha, 0);
1029 vbits = (vector unsigned char)vec_splat_s8(-1);
1030
1031 ckey &= rgbmask;
1032 ((unsigned int *)(char*)&vckey)[0] = ckey;
1033 vckey = vec_splat(vckey, 0);
1034 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1035 vrgbmask = vec_splat(vrgbmask, 0);
1036
1037 while(height--) {
1038 int width = info->d_width;
1039#define ONE_PIXEL_BLEND(condition, widthvar) \
1040 while (condition) { \
1041 Uint32 Pixel; \
1042 unsigned sR, sG, sB, dR, dG, dB; \
1043 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1044 if(sA && Pixel != ckey) { \
1045 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1046 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1047 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1048 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1049 } \
1050 dstp++; \
1051 srcp++; \
1052 widthvar--; \
1053 }
1054 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1055 if (width > 0) {
1056 int extrawidth = (width % 4);
1057 vector unsigned char valigner = VEC_ALIGNER(srcp);
1058 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1059 width -= extrawidth;
1060 while (width) {
1061 vector unsigned char vsel;
1062 vector unsigned char voverflow;
1063 vector unsigned char vd;
1064 vector unsigned char vd_orig;
1065
1066 /* s = *srcp */
1067 voverflow = (vector unsigned char)vec_ld(15, srcp);
1068 vs = vec_perm(vs, voverflow, valigner);
1069
1070 /* vsel is set for items that match the key */
1071 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1072 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1073
1074 /* permute to source format */
1075 vs = vec_perm(vs, valpha, vsrcPermute);
1076
1077 /* d = *dstp */
1078 vd = (vector unsigned char)vec_ld(0, dstp);
1079 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1080
1081 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1082
1083 /* set the alpha channel to full on */
1084 vd = vec_or(vd, valphamask);
1085
1086 /* mask out color key */
1087 vd = vec_sel(vd, vd_orig, vsel);
1088
1089 /* permute to dest format */
1090 vd = vec_perm(vd, vbits, vdstPermute);
1091
1092 /* *dstp = res */
1093 vec_st((vector unsigned int)vd, 0, dstp);
1094
1095 srcp += 4;
1096 dstp += 4;
1097 width -= 4;
1098 vs = voverflow;
1099 }
1100 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1101 }
1102#undef ONE_PIXEL_BLEND
1103
1104 srcp += srcskip;
1105 dstp += dstskip;
1106 }
1107}
1108
1109
1110static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1111{
1112 int width = info->d_width;
1113 int height = info->d_height;
1114 Uint32 *srcp = (Uint32 *)info->s_pixels;
1115 int srcskip = info->s_skip >> 2;
1116 Uint32 *dstp = (Uint32 *)info->d_pixels;
1117 int dstskip = info->d_skip >> 2;
1118 SDL_PixelFormat *srcfmt = info->src;
1119 SDL_PixelFormat *dstfmt = info->dst;
1120 vector unsigned char mergePermute;
1121 vector unsigned char valphaPermute;
1122 vector unsigned char vsrcPermute;
1123 vector unsigned char vdstPermute;
1124 vector unsigned char vsdstPermute;
1125 vector unsigned char valphamask;
1126 vector unsigned char vpixelmask;
1127 vector unsigned char v0;
1128 vector unsigned short v1;
1129 vector unsigned short v8;
1130
1131 v0 = vec_splat_u8(0);
1132 v1 = vec_splat_u16(1);
1133 v8 = vec_splat_u16(8);
1134 mergePermute = VEC_MERGE_PERMUTE();
1135 valphamask = VEC_ALPHA_MASK();
1136 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1137 vpixelmask = vec_nor(valphamask, v0);
1138 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1139 vdstPermute = calc_swizzle32(NULL, dstfmt);
1140 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1141
1142 while ( height-- ) {
1143 width = info->d_width;
1144#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1145 Uint32 Pixel; \
1146 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1147 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1148 if(sA) { \
1149 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1150 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1151 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1152 } \
1153 ++srcp; \
1154 ++dstp; \
1155 widthvar--; \
1156 }
1157 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1158 if (width > 0) {
1159 /* vsrcPermute */
1160 /* vdstPermute */
1161 int extrawidth = (width % 4);
1162 vector unsigned char valigner = VEC_ALIGNER(srcp);
1163 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1164 width -= extrawidth;
1165 while (width) {
1166 vector unsigned char voverflow;
1167 vector unsigned char vd;
1168 vector unsigned char valpha;
1169 vector unsigned char vdstalpha;
1170 /* s = *srcp */
1171 voverflow = (vector unsigned char)vec_ld(15, srcp);
1172 vs = vec_perm(vs, voverflow, valigner);
1173 vs = vec_perm(vs, v0, vsrcPermute);
1174
1175 valpha = vec_perm(vs, v0, valphaPermute);
1176
1177 /* d = *dstp */
1178 vd = (vector unsigned char)vec_ld(0, dstp);
1179 vd = vec_perm(vd, v0, vsdstPermute);
1180 vdstalpha = vec_and(vd, valphamask);
1181
1182 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1183
1184 /* set the alpha to the dest alpha */
1185 vd = vec_and(vd, vpixelmask);
1186 vd = vec_or(vd, vdstalpha);
1187 vd = vec_perm(vd, v0, vdstPermute);
1188
1189 /* *dstp = res */
1190 vec_st((vector unsigned int)vd, 0, dstp);
1191
1192 srcp += 4;
1193 dstp += 4;
1194 width -= 4;
1195 vs = voverflow;
1196
1197 }
1198 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1199 }
1200 srcp += srcskip;
1201 dstp += dstskip;
1202#undef ONE_PIXEL_BLEND
1203 }
1204}
1205
1206/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1207static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1208{
1209 int width = info->d_width;
1210 int height = info->d_height;
1211 Uint32 *srcp = (Uint32 *)info->s_pixels;
1212 int srcskip = info->s_skip >> 2;
1213 Uint32 *dstp = (Uint32 *)info->d_pixels;
1214 int dstskip = info->d_skip >> 2;
1215 vector unsigned char mergePermute;
1216 vector unsigned char valphaPermute;
1217 vector unsigned char valphamask;
1218 vector unsigned char vpixelmask;
1219 vector unsigned char v0;
1220 vector unsigned short v1;
1221 vector unsigned short v8;
1222 v0 = vec_splat_u8(0);
1223 v1 = vec_splat_u16(1);
1224 v8 = vec_splat_u16(8);
1225 mergePermute = VEC_MERGE_PERMUTE();
1226 valphamask = VEC_ALPHA_MASK();
1227 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1228
1229
1230 vpixelmask = vec_nor(valphamask, v0);
1231 while(height--) {
1232 width = info->d_width;
1233#define ONE_PIXEL_BLEND(condition, widthvar) \
1234 while ((condition)) { \
1235 Uint32 dalpha; \
1236 Uint32 d; \
1237 Uint32 s1; \
1238 Uint32 d1; \
1239 Uint32 s = *srcp; \
1240 Uint32 alpha = s >> 24; \
1241 if(alpha) { \
1242 if(alpha == SDL_ALPHA_OPAQUE) { \
1243 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1244 } else { \
1245 d = *dstp; \
1246 dalpha = d & 0xff000000; \
1247 s1 = s & 0xff00ff; \
1248 d1 = d & 0xff00ff; \
1249 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1250 s &= 0xff00; \
1251 d &= 0xff00; \
1252 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1253 *dstp = d1 | d | dalpha; \
1254 } \
1255 } \
1256 ++srcp; \
1257 ++dstp; \
1258 widthvar--; \
1259 }
1260 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1261 if (width > 0) {
1262 int extrawidth = (width % 4);
1263 vector unsigned char valigner = VEC_ALIGNER(srcp);
1264 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1265 width -= extrawidth;
1266 while (width) {
1267 vector unsigned char voverflow;
1268 vector unsigned char vd;
1269 vector unsigned char valpha;
1270 vector unsigned char vdstalpha;
1271 /* s = *srcp */
1272 voverflow = (vector unsigned char)vec_ld(15, srcp);
1273 vs = vec_perm(vs, voverflow, valigner);
1274
1275 valpha = vec_perm(vs, v0, valphaPermute);
1276
1277 /* d = *dstp */
1278 vd = (vector unsigned char)vec_ld(0, dstp);
1279 vdstalpha = vec_and(vd, valphamask);
1280
1281 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1282
1283 /* set the alpha to the dest alpha */
1284 vd = vec_and(vd, vpixelmask);
1285 vd = vec_or(vd, vdstalpha);
1286
1287 /* *dstp = res */
1288 vec_st((vector unsigned int)vd, 0, dstp);
1289
1290 srcp += 4;
1291 dstp += 4;
1292 width -= 4;
1293 vs = voverflow;
1294 }
1295 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1296 }
1297 srcp += srcskip;
1298 dstp += dstskip;
1299 }
1300#undef ONE_PIXEL_BLEND
1301}
1302
1303static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1304{
1305 /* XXX : 6 */
1306 unsigned alpha = info->src->alpha;
1307 int height = info->d_height;
1308 Uint32 *srcp = (Uint32 *)info->s_pixels;
1309 int srcskip = info->s_skip >> 2;
1310 Uint32 *dstp = (Uint32 *)info->d_pixels;
1311 int dstskip = info->d_skip >> 2;
1312 SDL_PixelFormat *srcfmt = info->src;
1313 SDL_PixelFormat *dstfmt = info->dst;
1314 unsigned sA = srcfmt->alpha;
1315 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1316 vector unsigned char mergePermute;
1317 vector unsigned char vsrcPermute;
1318 vector unsigned char vdstPermute;
1319 vector unsigned char vsdstPermute;
1320 vector unsigned char valpha;
1321 vector unsigned char valphamask;
1322 vector unsigned char vbits;
1323 vector unsigned short v1;
1324 vector unsigned short v8;
1325
1326 mergePermute = VEC_MERGE_PERMUTE();
1327 v1 = vec_splat_u16(1);
1328 v8 = vec_splat_u16(8);
1329
1330 /* set the alpha to 255 on the destination surf */
1331 valphamask = VEC_ALPHA_MASK();
1332
1333 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1334 vdstPermute = calc_swizzle32(NULL, dstfmt);
1335 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1336
1337 /* set a vector full of alpha and 255-alpha */
1338 ((unsigned char *)&valpha)[0] = alpha;
1339 valpha = vec_splat(valpha, 0);
1340 vbits = (vector unsigned char)vec_splat_s8(-1);
1341
1342 while(height--) {
1343 int width = info->d_width;
1344#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1345 Uint32 Pixel; \
1346 unsigned sR, sG, sB, dR, dG, dB; \
1347 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1348 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1349 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1350 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1351 ++srcp; \
1352 ++dstp; \
1353 widthvar--; \
1354 }
1355 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1356 if (width > 0) {
1357 int extrawidth = (width % 4);
1358 vector unsigned char valigner = VEC_ALIGNER(srcp);
1359 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1360 width -= extrawidth;
1361 while (width) {
1362 vector unsigned char voverflow;
1363 vector unsigned char vd;
1364
1365 /* s = *srcp */
1366 voverflow = (vector unsigned char)vec_ld(15, srcp);
1367 vs = vec_perm(vs, voverflow, valigner);
1368 vs = vec_perm(vs, valpha, vsrcPermute);
1369
1370 /* d = *dstp */
1371 vd = (vector unsigned char)vec_ld(0, dstp);
1372 vd = vec_perm(vd, vd, vsdstPermute);
1373
1374 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1375
1376 /* set the alpha channel to full on */
1377 vd = vec_or(vd, valphamask);
1378 vd = vec_perm(vd, vbits, vdstPermute);
1379
1380 /* *dstp = res */
1381 vec_st((vector unsigned int)vd, 0, dstp);
1382
1383 srcp += 4;
1384 dstp += 4;
1385 width -= 4;
1386 vs = voverflow;
1387 }
1388 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1389 }
1390#undef ONE_PIXEL_BLEND
1391
1392 srcp += srcskip;
1393 dstp += dstskip;
1394 }
1395
1396}
1397
1398
1399/* fast RGB888->(A)RGB888 blending */
1400static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1401{
1402 unsigned alpha = info->src->alpha;
1403 int height = info->d_height;
1404 Uint32 *srcp = (Uint32 *)info->s_pixels;
1405 int srcskip = info->s_skip >> 2;
1406 Uint32 *dstp = (Uint32 *)info->d_pixels;
1407 int dstskip = info->d_skip >> 2;
1408 vector unsigned char mergePermute;
1409 vector unsigned char valpha;
1410 vector unsigned char valphamask;
1411 vector unsigned short v1;
1412 vector unsigned short v8;
1413
1414 mergePermute = VEC_MERGE_PERMUTE();
1415 v1 = vec_splat_u16(1);
1416 v8 = vec_splat_u16(8);
1417
1418 /* set the alpha to 255 on the destination surf */
1419 valphamask = VEC_ALPHA_MASK();
1420
1421 /* set a vector full of alpha and 255-alpha */
1422 ((unsigned char *)&valpha)[0] = alpha;
1423 valpha = vec_splat(valpha, 0);
1424
1425 while(height--) {
1426 int width = info->d_width;
1427#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1428 Uint32 s = *srcp; \
1429 Uint32 d = *dstp; \
1430 Uint32 s1 = s & 0xff00ff; \
1431 Uint32 d1 = d & 0xff00ff; \
1432 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1433 & 0xff00ff; \
1434 s &= 0xff00; \
1435 d &= 0xff00; \
1436 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1437 *dstp = d1 | d | 0xff000000; \
1438 ++srcp; \
1439 ++dstp; \
1440 widthvar--; \
1441 }
1442 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1443 if (width > 0) {
1444 int extrawidth = (width % 4);
1445 vector unsigned char valigner = VEC_ALIGNER(srcp);
1446 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1447 width -= extrawidth;
1448 while (width) {
1449 vector unsigned char voverflow;
1450 vector unsigned char vd;
1451
1452 /* s = *srcp */
1453 voverflow = (vector unsigned char)vec_ld(15, srcp);
1454 vs = vec_perm(vs, voverflow, valigner);
1455
1456 /* d = *dstp */
1457 vd = (vector unsigned char)vec_ld(0, dstp);
1458
1459 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1460
1461 /* set the alpha channel to full on */
1462 vd = vec_or(vd, valphamask);
1463
1464 /* *dstp = res */
1465 vec_st((vector unsigned int)vd, 0, dstp);
1466
1467 srcp += 4;
1468 dstp += 4;
1469 width -= 4;
1470 vs = voverflow;
1471 }
1472 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1473 }
1474#undef ONE_PIXEL_BLEND
1475
1476 srcp += srcskip;
1477 dstp += dstskip;
1478 }
1479}
1480#if __MWERKS__
1481#pragma altivec_model off
1482#endif
1483#endif /* SDL_ALTIVEC_BLITTERS */
1484
1485/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1486static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1487{
1488 int width = info->d_width;
1489 int height = info->d_height;
1490 Uint32 *srcp = (Uint32 *)info->s_pixels;
1491 int srcskip = info->s_skip >> 2;
1492 Uint32 *dstp = (Uint32 *)info->d_pixels;
1493 int dstskip = info->d_skip >> 2;
1494
1495 while(height--) {
1496 DUFFS_LOOP4({
1497 Uint32 s = *srcp++;
1498 Uint32 d = *dstp;
1499 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1500 + (s & d & 0x00010101)) | 0xff000000;
1501 }, width);
1502 srcp += srcskip;
1503 dstp += dstskip;
1504 }
1505}
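/* The blend above averages src and dst one channel at a time: masking with
   0x00fefefe drops the low bit of each byte so the >>1 cannot borrow across
   channel boundaries, and (s & d & 0x00010101) restores the rounding bit that
   both pixels share.  Worked example for one channel: s=0x81, d=0x43 ->
   ((0x80 + 0x42) >> 1) + (0x81 & 0x43 & 1) = 0x61 + 1 = 0x62 = (0x81+0x43)/2. */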
1506
1507/* fast RGB888->(A)RGB888 blending with surface alpha */
1508static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1509{
1510 unsigned alpha = info->src->alpha;
1511 if(alpha == 128) {
1512 BlitRGBtoRGBSurfaceAlpha128(info);
1513 } else {
1514 int width = info->d_width;
1515 int height = info->d_height;
1516 Uint32 *srcp = (Uint32 *)info->s_pixels;
1517 int srcskip = info->s_skip >> 2;
1518 Uint32 *dstp = (Uint32 *)info->d_pixels;
1519 int dstskip = info->d_skip >> 2;
1520 Uint32 s;
1521 Uint32 d;
1522 Uint32 s1;
1523 Uint32 d1;
1524
1525 while(height--) {
1526 DUFFS_LOOP_DOUBLE2({
1527 /* One Pixel Blend */
1528 s = *srcp;
1529 d = *dstp;
1530 s1 = s & 0xff00ff;
1531 d1 = d & 0xff00ff;
1532 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1533 & 0xff00ff;
1534 s &= 0xff00;
1535 d &= 0xff00;
1536 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1537 *dstp = d1 | d | 0xff000000;
1538 ++srcp;
1539 ++dstp;
1540 },{
1541 /* Two Pixels Blend */
1542 s = *srcp;
1543 d = *dstp;
1544 s1 = s & 0xff00ff;
1545 d1 = d & 0xff00ff;
1546 d1 += (s1 - d1) * alpha >> 8;
1547 d1 &= 0xff00ff;
1548
1549 s = ((s & 0xff00) >> 8) |
1550 ((srcp[1] & 0xff00) << 8);
1551 d = ((d & 0xff00) >> 8) |
1552 ((dstp[1] & 0xff00) << 8);
1553 d += (s - d) * alpha >> 8;
1554 d &= 0x00ff00ff;
1555
1556 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1557 ++srcp;
1558
1559 s1 = *srcp;
1560 d1 = *dstp;
1561 s1 &= 0xff00ff;
1562 d1 &= 0xff00ff;
1563 d1 += (s1 - d1) * alpha >> 8;
1564 d1 &= 0xff00ff;
1565
1566 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1567 ++srcp;
1568 ++dstp;
1569 }, width);
1570 srcp += srcskip;
1571 dstp += dstskip;
1572 }
1573 }
1574}
1575
1576/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1577static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1578{
1579 int width = info->d_width;
1580 int height = info->d_height;
1581 Uint32 *srcp = (Uint32 *)info->s_pixels;
1582 int srcskip = info->s_skip >> 2;
1583 Uint32 *dstp = (Uint32 *)info->d_pixels;
1584 int dstskip = info->d_skip >> 2;
1585
1586 while(height--) {
1587 DUFFS_LOOP4({
1588 Uint32 dalpha;
1589 Uint32 d;
1590 Uint32 s1;
1591 Uint32 d1;
1592 Uint32 s = *srcp;
1593 Uint32 alpha = s >> 24;
1594 /* FIXME: Here we special-case opaque alpha since the
 1595 compositing used (>>8 instead of /255) doesn't handle
1596 it correctly. Also special-case alpha=0 for speed?
1597 Benchmark this! */
1598 if(alpha) {
1599 if(alpha == SDL_ALPHA_OPAQUE) {
1600 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1601 } else {
1602 /*
1603 * take out the middle component (green), and process
1604 * the other two in parallel. One multiply less.
1605 */
1606 d = *dstp;
1607 dalpha = d & 0xff000000;
1608 s1 = s & 0xff00ff;
1609 d1 = d & 0xff00ff;
1610 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1611 s &= 0xff00;
1612 d &= 0xff00;
1613 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1614 *dstp = d1 | d | dalpha;
1615 }
1616 }
1617 ++srcp;
1618 ++dstp;
1619 }, width);
1620 srcp += srcskip;
1621 dstp += dstskip;
1622 }
1623}
1624
1625#if GCC_ASMBLIT
1626/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1627static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1628{
1629 int width = info->d_width;
1630 int height = info->d_height;
1631 Uint32 *srcp = (Uint32 *)info->s_pixels;
1632 int srcskip = info->s_skip >> 2;
1633 Uint32 *dstp = (Uint32 *)info->d_pixels;
1634 int dstskip = info->d_skip >> 2;
1635 SDL_PixelFormat* sf = info->src;
1636 Uint32 amask = sf->Amask;
1637
1638 __asm__ (
1639 /* make mm6 all zeros. */
1640 "pxor %%mm6, %%mm6\n"
1641
1642 /* Make a mask to preserve the alpha. */
1643 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1644 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1645 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1646 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1647 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1648
1649 /* form channel masks */
1650 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1651 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1652 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1653 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1654
1655 /* get alpha channel shift */
1656 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1657
1658 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1659
1660 while(height--) {
1661
1662 DUFFS_LOOP4({
1663 Uint32 alpha;
1664
1665 __asm__ (
1666 "prefetch 64(%0)\n"
1667 "prefetch 64(%1)\n"
1668 : : "r" (srcp), "r" (dstp) );
1669
1670 alpha = *srcp & amask;
1671 /* FIXME: Here we special-case opaque alpha since the
 1672 compositing used (>>8 instead of /255) doesn't handle
1673 it correctly. Also special-case alpha=0 for speed?
1674 Benchmark this! */
1675 if(alpha == 0) {
1676 /* do nothing */
1677 }
1678 else if(alpha == amask) {
1679 /* opaque alpha -- copy RGB, keep dst alpha */
1680 /* using MMX here to free up regular registers for other things */
1681 __asm__ (
1682 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1683 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1684 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
 1685 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1686 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1687 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1688
1689 : : "r" (srcp), "r" (dstp) );
1690 }
1691
1692 else {
1693 __asm__ (
1694 /* load in the source, and dst. */
1695 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1696 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1697
1698 /* Move the src alpha into mm2 */
1699
1700 /* if supporting pshufw */
1701 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1702 /*"psrlw $8, %%mm2\n" */
1703
1704 /* else: */
1705 "movd %2, %%mm2\n"
1706 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1707 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1708 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1709 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1710
1711 /* move the colors into words. */
1712 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1713 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1714
1715 /* src - dst */
1716 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1717
1718 /* A * (src-dst) */
1719 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1720 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1721 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1722
1723 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1724
1725 "movd %%mm0, (%1)\n" /* result in mm0 */
1726
1727 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1728
1729 }
1730 ++srcp;
1731 ++dstp;
1732 }, width);
1733 srcp += srcskip;
1734 dstp += dstskip;
1735 }
1736
1737 __asm__ (
1738 "emms\n"
1739 : );
1740}
1741/* End GCC_ASMBLIT */
1742
1743#elif MSVC_ASMBLIT
1744/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1745static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1746{
1747 int width = info->d_width;
1748 int height = info->d_height;
1749 Uint32 *srcp = (Uint32 *)info->s_pixels;
1750 int srcskip = info->s_skip >> 2;
1751 Uint32 *dstp = (Uint32 *)info->d_pixels;
1752 int dstskip = info->d_skip >> 2;
1753 SDL_PixelFormat* sf = info->src;
1754 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1755 Uint32 amask = sf->Amask;
1756 Uint32 ashift = sf->Ashift;
1757 Uint64 multmask;
1758
1759 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1760
1761 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1762 multmask = ~(0xFFFFi64 << (ashift * 2));
1763 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
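	/* unpacking to 16-bit lanes doubles each channel's bit offset, so the
	   alpha lane starts at ashift*2 (e.g. ashift == 24 -> bits 48..63);
	   dmask clears exactly that lane so the add below keeps the dst alpha */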
1764
1765 while(height--) {
1766 DUFFS_LOOP4({
1767 Uint32 alpha;
1768
1769 _m_prefetch(srcp + 16);
1770 _m_prefetch(dstp + 16);
1771
1772 alpha = *srcp & amask;
1773 if (alpha == 0) {
1774 /* do nothing */
1775 } else if (alpha == amask) {
1776 /* copy RGB, keep dst alpha */
1777 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1778 } else {
1779 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1780 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1781
1782 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1783 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1784
1785 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1786 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1787 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1788 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1789 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1790
1791 /* blend */
1792 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1793 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1794 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1795 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1796 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1797
1798 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1799 }
1800 ++srcp;
1801 ++dstp;
1802 }, width);
1803 srcp += srcskip;
1804 dstp += dstskip;
1805 }
1806 _mm_empty();
1807}
1808/* End MSVC_ASMBLIT */
1809
1810#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1811
1812/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1813
1814/* blend a single 16 bit pixel at 50% */
1815#define BLEND16_50(d, s, mask) \
1816 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1817
1818/* blend two 16 bit pixels at 50% */
1819#define BLEND2x16_50(d, s, mask) \
1820 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1821 + (s & d & (~(mask | mask << 16))))
1822
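/*
 * The mask argument clears the least significant bit of every color channel
 * (0xf7de for 565, 0xfbde for 555).  Those cleared bits give each per-channel
 * sum one bit of carry room, the >>1 shifts the carries back into their own
 * channel, and the final term restores the rounding bit that is lost when
 * both LSBs are set.  An illustrative scalar sketch of the same 50% average
 * for one pair of RGB565 pixels (not used by the blitters below):
 */
#if 0
static Uint16 Average565(Uint16 a, Uint16 b)
{
	const Uint16 mask = 0xf7de;	/* 565 with each channel's LSB cleared */

	return (Uint16)((((a & mask) + (b & mask)) >> 1) +
	                (a & b & (Uint16)~mask));
}
#endif
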
1823static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1824{
1825 int width = info->d_width;
1826 int height = info->d_height;
1827 Uint16 *srcp = (Uint16 *)info->s_pixels;
1828 int srcskip = info->s_skip >> 1;
1829 Uint16 *dstp = (Uint16 *)info->d_pixels;
1830 int dstskip = info->d_skip >> 1;
1831
1832 while(height--) {
1833 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1834 /*
1835 * Source and destination not aligned, pipeline it.
1836 * This is mostly a win for big blits but no loss for
1837 * small ones
1838 */
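			/*
			 * Each pass of the loop below reads one aligned 32-bit
			 * source word and splices it with the halfword left over
			 * from the previous read (prev_sw), so every load and
			 * store stays naturally aligned even though src and dst
			 * start on different halfword boundaries.
			 */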
1839 Uint32 prev_sw;
1840 int w = width;
1841
1842 /* handle odd destination */
1843 if((uintptr_t)dstp & 2) {
1844 Uint16 d = *dstp, s = *srcp;
1845 *dstp = BLEND16_50(d, s, mask);
1846 dstp++;
1847 srcp++;
1848 w--;
1849 }
1850 srcp++; /* srcp is now 32-bit aligned */
1851
1852 /* bootstrap pipeline with first halfword */
1853 prev_sw = ((Uint32 *)srcp)[-1];
1854
1855 while(w > 1) {
1856 Uint32 sw, dw, s;
1857 sw = *(Uint32 *)srcp;
1858 dw = *(Uint32 *)dstp;
1859#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1860 s = (prev_sw << 16) + (sw >> 16);
1861#else
1862 s = (prev_sw >> 16) + (sw << 16);
1863#endif
1864 prev_sw = sw;
1865 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1866 dstp += 2;
1867 srcp += 2;
1868 w -= 2;
1869 }
1870
1871 /* final pixel if any */
1872 if(w) {
1873 Uint16 d = *dstp, s;
1874#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1875 s = (Uint16)prev_sw;
1876#else
1877 s = (Uint16)(prev_sw >> 16);
1878#endif
1879 *dstp = BLEND16_50(d, s, mask);
1880 srcp++;
1881 dstp++;
1882 }
1883 srcp += srcskip - 1;
1884 dstp += dstskip;
1885 } else {
1886 /* source and destination are aligned */
1887 int w = width;
1888
1889 /* first odd pixel? */
1890 if((uintptr_t)srcp & 2) {
1891 Uint16 d = *dstp, s = *srcp;
1892 *dstp = BLEND16_50(d, s, mask);
1893 srcp++;
1894 dstp++;
1895 w--;
1896 }
1897 /* srcp and dstp are now 32-bit aligned */
1898
1899 while(w > 1) {
1900 Uint32 sw = *(Uint32 *)srcp;
1901 Uint32 dw = *(Uint32 *)dstp;
1902 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1903 srcp += 2;
1904 dstp += 2;
1905 w -= 2;
1906 }
1907
1908 /* last odd pixel? */
1909 if(w) {
1910 Uint16 d = *dstp, s = *srcp;
1911 *dstp = BLEND16_50(d, s, mask);
1912 srcp++;
1913 dstp++;
1914 }
1915 srcp += srcskip;
1916 dstp += dstskip;
1917 }
1918 }
1919}
1920
1921#if GCC_ASMBLIT
1922/* fast RGB565->RGB565 blending with surface alpha */
1923static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1924{
1925	unsigned alpha = info->src->alpha;
1926 if(alpha == 128) {
1927 Blit16to16SurfaceAlpha128(info, 0xf7de);
1928 } else {
1929 int width = info->d_width;
1930 int height = info->d_height;
1931 Uint16 *srcp = (Uint16 *)info->s_pixels;
1932 int srcskip = info->s_skip >> 1;
1933 Uint16 *dstp = (Uint16 *)info->d_pixels;
1934 int dstskip = info->d_skip >> 1;
1935 Uint32 s, d;
1936 Uint64 load;
1937
1938		alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path rounds exactly like the scalar 5-bit blend */
1939 load = alpha;
1940 alpha >>= 3; /* downscale alpha to 5 bits */
1941
1942 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1943 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1944 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1945 /* position alpha to allow for mullo and mulhi on diff channels
1946 to reduce the number of operations */
1947 psllq_i2r(3, mm0);
1948
1949 /* Setup the 565 color channel masks */
1950 load = 0x07E007E007E007E0ULL;
1951 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1952 load = 0x001F001F001F001FULL;
1953 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1954 while(height--) {
1955 DUFFS_LOOP_QUATRO2(
1956 {
1957 s = *srcp++;
1958 d = *dstp;
1959 /*
1960 * shift out the middle component (green) to
1961 * the high 16 bits, and process all three RGB
1962 * components at the same time.
1963 */
1964 s = (s | s << 16) & 0x07e0f81f;
1965 d = (d | d << 16) & 0x07e0f81f;
1966 d += (s - d) * alpha >> 5;
1967 d &= 0x07e0f81f;
1968 *dstp++ = d | d >> 16;
1969 },{
1970 s = *srcp++;
1971 d = *dstp;
1972 /*
1973 * shift out the middle component (green) to
1974 * the high 16 bits, and process all three RGB
1975 * components at the same time.
1976 */
1977 s = (s | s << 16) & 0x07e0f81f;
1978 d = (d | d << 16) & 0x07e0f81f;
1979 d += (s - d) * alpha >> 5;
1980 d &= 0x07e0f81f;
1981 *dstp++ = d | d >> 16;
1982 s = *srcp++;
1983 d = *dstp;
1984 /*
1985 * shift out the middle component (green) to
1986 * the high 16 bits, and process all three RGB
1987 * components at the same time.
1988 */
1989 s = (s | s << 16) & 0x07e0f81f;
1990 d = (d | d << 16) & 0x07e0f81f;
1991 d += (s - d) * alpha >> 5;
1992 d &= 0x07e0f81f;
1993 *dstp++ = d | d >> 16;
1994 },{
1995 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1996 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1997
1998 /* red -- does not need a mask since the right shift clears
1999 the uninteresting bits */
2000 movq_r2r(mm2, mm5); /* src -> mm5 */
2001 movq_r2r(mm3, mm6); /* dst -> mm6 */
2002 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2003 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2004
2005 /* blend */
2006 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2007 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2008 /* alpha used is actually 11 bits
2009 11 + 5 = 16 bits, so the sign bits are lost */
2010 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2011 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2012 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2013
2014 movq_r2r(mm6, mm1); /* save new reds in dsts */
2015
2016 /* green -- process the bits in place */
2017 movq_r2r(mm2, mm5); /* src -> mm5 */
2018 movq_r2r(mm3, mm6); /* dst -> mm6 */
2019 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2020 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2021
2022 /* blend */
2023 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2024 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2025 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2026 bits are gone and the sign bits present */
2027 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2028 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2029
2030 por_r2r(mm6, mm1); /* save new greens in dsts */
2031
2032 /* blue */
2033 movq_r2r(mm2, mm5); /* src -> mm5 */
2034 movq_r2r(mm3, mm6); /* dst -> mm6 */
2035 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2036 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2037
2038 /* blend */
2039 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2040 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2041 /* 11 + 5 = 16 bits, so the sign bits are lost and
2042 the interesting bits will need to be MASKed */
2043 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2044 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2045 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2046
2047 por_r2r(mm6, mm1); /* save new blues in dsts */
2048
2049 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2050
2051 srcp += 4;
2052 dstp += 4;
2053 }, width);
2054 srcp += srcskip;
2055 dstp += dstskip;
2056 }
2057 emms();
2058 }
2059}
2060
2061/* fast RGB555->RGB555 blending with surface alpha */
2062static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2063{
2064	unsigned alpha = info->src->alpha;
2065 if(alpha == 128) {
2066 Blit16to16SurfaceAlpha128(info, 0xfbde);
2067 } else {
2068 int width = info->d_width;
2069 int height = info->d_height;
2070 Uint16 *srcp = (Uint16 *)info->s_pixels;
2071 int srcskip = info->s_skip >> 1;
2072 Uint16 *dstp = (Uint16 *)info->d_pixels;
2073 int dstskip = info->d_skip >> 1;
2074 Uint32 s, d;
2075 Uint64 load;
2076
2077		alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path rounds exactly like the scalar 5-bit blend */
2078 load = alpha;
2079 alpha >>= 3; /* downscale alpha to 5 bits */
2080
2081 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2082 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2083 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2084 /* position alpha to allow for mullo and mulhi on diff channels
2085 to reduce the number of operations */
2086 psllq_i2r(3, mm0);
2087
2088 /* Setup the 555 color channel masks */
2089 load = 0x03E003E003E003E0ULL;
2090 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2091 load = 0x001F001F001F001FULL;
2092 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2093 while(height--) {
2094 DUFFS_LOOP_QUATRO2(
2095 {
2096 s = *srcp++;
2097 d = *dstp;
2098 /*
2099 * shift out the middle component (green) to
2100 * the high 16 bits, and process all three RGB
2101 * components at the same time.
2102 */
2103 s = (s | s << 16) & 0x03e07c1f;
2104 d = (d | d << 16) & 0x03e07c1f;
2105 d += (s - d) * alpha >> 5;
2106 d &= 0x03e07c1f;
2107 *dstp++ = d | d >> 16;
2108 },{
2109 s = *srcp++;
2110 d = *dstp;
2111 /*
2112 * shift out the middle component (green) to
2113 * the high 16 bits, and process all three RGB
2114 * components at the same time.
2115 */
2116 s = (s | s << 16) & 0x03e07c1f;
2117 d = (d | d << 16) & 0x03e07c1f;
2118 d += (s - d) * alpha >> 5;
2119 d &= 0x03e07c1f;
2120 *dstp++ = d | d >> 16;
2121 s = *srcp++;
2122 d = *dstp;
2123 /*
2124 * shift out the middle component (green) to
2125 * the high 16 bits, and process all three RGB
2126 * components at the same time.
2127 */
2128 s = (s | s << 16) & 0x03e07c1f;
2129 d = (d | d << 16) & 0x03e07c1f;
2130 d += (s - d) * alpha >> 5;
2131 d &= 0x03e07c1f;
2132 *dstp++ = d | d >> 16;
2133 },{
2134 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2135 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2136
2137 /* red -- process the bits in place */
2138 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2139 /* by reusing the GREEN mask we free up another mmx
2140 register to accumulate the result */
2141
2142 movq_r2r(mm2, mm5); /* src -> mm5 */
2143 movq_r2r(mm3, mm6); /* dst -> mm6 */
2144 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2145 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2146
2147 /* blend */
2148 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2149 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2150 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2151 cleared by a MASK below */
2152 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2153 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2154 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2155
2156 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2157
2158 movq_r2r(mm6, mm1); /* save new reds in dsts */
2159
2160 /* green -- process the bits in place */
2161 movq_r2r(mm2, mm5); /* src -> mm5 */
2162 movq_r2r(mm3, mm6); /* dst -> mm6 */
2163 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2164 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2165
2166 /* blend */
2167 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2168 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2169 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2170 bits are gone and the sign bits present */
2171 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2172 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2173
2174 por_r2r(mm6, mm1); /* save new greens in dsts */
2175
2176 /* blue */
2177 movq_r2r(mm2, mm5); /* src -> mm5 */
2178 movq_r2r(mm3, mm6); /* dst -> mm6 */
2179 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2180 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2181
2182 /* blend */
2183 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2184 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2185 /* 11 + 5 = 16 bits, so the sign bits are lost and
2186 the interesting bits will need to be MASKed */
2187 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2188 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2189 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2190
2191 por_r2r(mm6, mm1); /* save new blues in dsts */
2192
2193 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2194
2195 srcp += 4;
2196 dstp += 4;
2197 }, width);
2198 srcp += srcskip;
2199 dstp += dstskip;
2200 }
2201 emms();
2202 }
2203}
2204/* End GCC_ASMBLIT */
2205
2206#elif MSVC_ASMBLIT
2207/* fast RGB565->RGB565 blending with surface alpha */
2208static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2209{
2210 unsigned alpha = info->src->alpha;
2211 if(alpha == 128) {
2212 Blit16to16SurfaceAlpha128(info, 0xf7de);
2213 } else {
2214 int width = info->d_width;
2215 int height = info->d_height;
2216 Uint16 *srcp = (Uint16 *)info->s_pixels;
2217 int srcskip = info->s_skip >> 1;
2218 Uint16 *dstp = (Uint16 *)info->d_pixels;
2219 int dstskip = info->d_skip >> 1;
2220 Uint32 s, d;
2221
2222 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2223
2224		alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path rounds exactly like the scalar 5-bit blend */
2225 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2226 alpha >>= 3; /* downscale alpha to 5 bits */
2227
2228 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2229 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2230 /* position alpha to allow for mullo and mulhi on diff channels
2231 to reduce the number of operations */
2232 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2233
2234 /* Setup the 565 color channel masks */
2235 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2236 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2237
2238 while(height--) {
2239 DUFFS_LOOP_QUATRO2(
2240 {
2241 s = *srcp++;
2242 d = *dstp;
2243 /*
2244 * shift out the middle component (green) to
2245 * the high 16 bits, and process all three RGB
2246 * components at the same time.
2247 */
2248 s = (s | s << 16) & 0x07e0f81f;
2249 d = (d | d << 16) & 0x07e0f81f;
2250 d += (s - d) * alpha >> 5;
2251 d &= 0x07e0f81f;
2252 *dstp++ = (Uint16)(d | d >> 16);
2253 },{
2254 s = *srcp++;
2255 d = *dstp;
2256 /*
2257 * shift out the middle component (green) to
2258 * the high 16 bits, and process all three RGB
2259 * components at the same time.
2260 */
2261 s = (s | s << 16) & 0x07e0f81f;
2262 d = (d | d << 16) & 0x07e0f81f;
2263 d += (s - d) * alpha >> 5;
2264 d &= 0x07e0f81f;
2265 *dstp++ = (Uint16)(d | d >> 16);
2266 s = *srcp++;
2267 d = *dstp;
2268 /*
2269 * shift out the middle component (green) to
2270 * the high 16 bits, and process all three RGB
2271 * components at the same time.
2272 */
2273 s = (s | s << 16) & 0x07e0f81f;
2274 d = (d | d << 16) & 0x07e0f81f;
2275 d += (s - d) * alpha >> 5;
2276 d &= 0x07e0f81f;
2277 *dstp++ = (Uint16)(d | d >> 16);
2278 },{
2279 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2280 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2281
2282 /* red */
2283 src2 = src1;
2284 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2285
2286 dst2 = dst1;
2287 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2288
2289 /* blend */
2290 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2291 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2292 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2293 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2294 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2295
2296 mm_res = dst2; /* RED -> mm_res */
2297
2298 /* green -- process the bits in place */
2299 src2 = src1;
2300 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2301
2302 dst2 = dst1;
2303 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2304
2305 /* blend */
2306 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2307 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2308 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2309 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2310
2311 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2312
2313 /* blue */
2314 src2 = src1;
2315 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2316
2317 dst2 = dst1;
2318 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2319
2320 /* blend */
2321 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2322 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2323 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2324 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2325 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2326
2327 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2328
2329 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2330
2331 srcp += 4;
2332 dstp += 4;
2333 }, width);
2334 srcp += srcskip;
2335 dstp += dstskip;
2336 }
2337 _mm_empty();
2338 }
2339}
2340
2341/* fast RGB555->RGB555 blending with surface alpha */
2342static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2343{
2344 unsigned alpha = info->src->alpha;
2345 if(alpha == 128) {
2346 Blit16to16SurfaceAlpha128(info, 0xfbde);
2347 } else {
2348 int width = info->d_width;
2349 int height = info->d_height;
2350 Uint16 *srcp = (Uint16 *)info->s_pixels;
2351 int srcskip = info->s_skip >> 1;
2352 Uint16 *dstp = (Uint16 *)info->d_pixels;
2353 int dstskip = info->d_skip >> 1;
2354 Uint32 s, d;
2355
2356 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2357
2358		alpha &= ~(1+2+4); /* clear the low 3 bits so the MMX path rounds exactly like the scalar 5-bit blend */
2359 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2360 alpha >>= 3; /* downscale alpha to 5 bits */
2361
2362 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2363 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2364 /* position alpha to allow for mullo and mulhi on diff channels
2365 to reduce the number of operations */
2366 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2367
2368 /* Setup the 555 color channel masks */
2369 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2370 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2371 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2372
2373 while(height--) {
2374 DUFFS_LOOP_QUATRO2(
2375 {
2376 s = *srcp++;
2377 d = *dstp;
2378 /*
2379 * shift out the middle component (green) to
2380 * the high 16 bits, and process all three RGB
2381 * components at the same time.
2382 */
2383 s = (s | s << 16) & 0x03e07c1f;
2384 d = (d | d << 16) & 0x03e07c1f;
2385 d += (s - d) * alpha >> 5;
2386 d &= 0x03e07c1f;
2387 *dstp++ = (Uint16)(d | d >> 16);
2388 },{
2389 s = *srcp++;
2390 d = *dstp;
2391 /*
2392 * shift out the middle component (green) to
2393 * the high 16 bits, and process all three RGB
2394 * components at the same time.
2395 */
2396 s = (s | s << 16) & 0x03e07c1f;
2397 d = (d | d << 16) & 0x03e07c1f;
2398 d += (s - d) * alpha >> 5;
2399 d &= 0x03e07c1f;
2400 *dstp++ = (Uint16)(d | d >> 16);
2401 s = *srcp++;
2402 d = *dstp;
2403 /*
2404 * shift out the middle component (green) to
2405 * the high 16 bits, and process all three RGB
2406 * components at the same time.
2407 */
2408 s = (s | s << 16) & 0x03e07c1f;
2409 d = (d | d << 16) & 0x03e07c1f;
2410 d += (s - d) * alpha >> 5;
2411 d &= 0x03e07c1f;
2412 *dstp++ = (Uint16)(d | d >> 16);
2413 },{
2414 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2415 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2416
2417 /* red -- process the bits in place */
2418 src2 = src1;
2419 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2420
2421 dst2 = dst1;
2422 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2423
2424 /* blend */
2425 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2426 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2427 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2428 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2429 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2430
2431 mm_res = dst2; /* RED -> mm_res */
2432
2433 /* green -- process the bits in place */
2434 src2 = src1;
2435 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2436
2437 dst2 = dst1;
2438 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2439
2440 /* blend */
2441 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2442 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2443 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2444 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2445
2446 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2447
2448 /* blue */
2449 src2 = src1; /* src -> src2 */
2450 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2451
2452 dst2 = dst1; /* dst -> dst2 */
2453 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2454
2455 /* blend */
2456 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2457 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2458 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2459 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2460 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2461
2462 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2463
2464 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2465
2466 srcp += 4;
2467 dstp += 4;
2468 }, width);
2469 srcp += srcskip;
2470 dstp += dstskip;
2471 }
2472 _mm_empty();
2473 }
2474}
2475#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2476
2477/* fast RGB565->RGB565 blending with surface alpha */
2478static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2479{
2480 unsigned alpha = info->src->alpha;
2481 if(alpha == 128) {
2482 Blit16to16SurfaceAlpha128(info, 0xf7de);
2483 } else {
2484 int width = info->d_width;
2485 int height = info->d_height;
2486 Uint16 *srcp = (Uint16 *)info->s_pixels;
2487 int srcskip = info->s_skip >> 1;
2488 Uint16 *dstp = (Uint16 *)info->d_pixels;
2489 int dstskip = info->d_skip >> 1;
2490 alpha >>= 3; /* downscale alpha to 5 bits */
2491
2492 while(height--) {
2493 DUFFS_LOOP4({
2494 Uint32 s = *srcp++;
2495 Uint32 d = *dstp;
2496 /*
2497 * shift out the middle component (green) to
2498 * the high 16 bits, and process all three RGB
2499 * components at the same time.
2500 */
2501 s = (s | s << 16) & 0x07e0f81f;
2502 d = (d | d << 16) & 0x07e0f81f;
2503 d += (s - d) * alpha >> 5;
2504 d &= 0x07e0f81f;
2505 *dstp++ = (Uint16)(d | d >> 16);
2506 }, width);
2507 srcp += srcskip;
2508 dstp += dstskip;
2509 }
2510 }
2511}
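
/*
 * The 0x07e0f81f trick used throughout the 16-bit blitters: duplicating a
 * 565 pixel into both halves of a 32-bit word and masking with 0x07e0f81f
 * moves green into the high halfword while red and blue stay in the low one,
 * so every field has empty guard bits above it and all three channels can be
 * blended with a single multiply.  An illustrative scalar sketch for one
 * pixel (alpha already downscaled to 5 bits; not used by the code here):
 */
#if 0
static Uint16 Blend565(Uint16 src, Uint16 dst, unsigned alpha)
{
	Uint32 s = src, d = dst;

	s = (s | s << 16) & 0x07e0f81f;	/* 00000GGG GGG00000 RRRRR000 000BBBBB */
	d = (d | d << 16) & 0x07e0f81f;
	d += (s - d) * alpha >> 5;	/* blend R, G and B at once */
	d &= 0x07e0f81f;
	return (Uint16)(d | d >> 16);	/* fold green back down */
}
#endif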
2512
2513/* fast RGB555->RGB555 blending with surface alpha */
2514static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2515{
2516	unsigned alpha = info->src->alpha;
2517 if(alpha == 128) {
2518 Blit16to16SurfaceAlpha128(info, 0xfbde);
2519 } else {
2520 int width = info->d_width;
2521 int height = info->d_height;
2522 Uint16 *srcp = (Uint16 *)info->s_pixels;
2523 int srcskip = info->s_skip >> 1;
2524 Uint16 *dstp = (Uint16 *)info->d_pixels;
2525 int dstskip = info->d_skip >> 1;
2526 alpha >>= 3; /* downscale alpha to 5 bits */
2527
2528 while(height--) {
2529 DUFFS_LOOP4({
2530 Uint32 s = *srcp++;
2531 Uint32 d = *dstp;
2532 /*
2533 * shift out the middle component (green) to
2534 * the high 16 bits, and process all three RGB
2535 * components at the same time.
2536 */
2537 s = (s | s << 16) & 0x03e07c1f;
2538 d = (d | d << 16) & 0x03e07c1f;
2539 d += (s - d) * alpha >> 5;
2540 d &= 0x03e07c1f;
2541 *dstp++ = (Uint16)(d | d >> 16);
2542 }, width);
2543 srcp += srcskip;
2544 dstp += dstskip;
2545 }
2546 }
2547}
2548
2549/* fast ARGB8888->RGB565 blending with pixel alpha */
2550static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2551{
2552 int width = info->d_width;
2553 int height = info->d_height;
2554 Uint32 *srcp = (Uint32 *)info->s_pixels;
2555 int srcskip = info->s_skip >> 2;
2556 Uint16 *dstp = (Uint16 *)info->d_pixels;
2557 int dstskip = info->d_skip >> 1;
2558
2559 while(height--) {
2560 DUFFS_LOOP4({
2561 Uint32 s = *srcp;
2562 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2563 /* FIXME: Here we special-case opaque alpha since the
2564			   compositing used (>>8 instead of /255) doesn't handle
2565 it correctly. Also special-case alpha=0 for speed?
2566 Benchmark this! */
2567 if(alpha) {
2568 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2569 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2570 } else {
2571 Uint32 d = *dstp;
2572 /*
2573				 * interleave source and destination into the packed
2574				 * 0x07e0f81f layout and blend all components at once
2575 */
2576 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2577 + (s >> 3 & 0x1f);
2578 d = (d | d << 16) & 0x07e0f81f;
2579 d += (s - d) * alpha >> 5;
2580 d &= 0x07e0f81f;
2581 *dstp = (Uint16)(d | d >> 16);
2582 }
2583 }
2584 srcp++;
2585 dstp++;
2586 }, width);
2587 srcp += srcskip;
2588 dstp += dstskip;
2589 }
2590}
2591
2592/* fast ARGB8888->RGB555 blending with pixel alpha */
2593static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2594{
2595 int width = info->d_width;
2596 int height = info->d_height;
2597 Uint32 *srcp = (Uint32 *)info->s_pixels;
2598 int srcskip = info->s_skip >> 2;
2599 Uint16 *dstp = (Uint16 *)info->d_pixels;
2600 int dstskip = info->d_skip >> 1;
2601
2602 while(height--) {
2603 DUFFS_LOOP4({
2604 unsigned alpha;
2605 Uint32 s = *srcp;
2606 alpha = s >> 27; /* downscale alpha to 5 bits */
2607 /* FIXME: Here we special-case opaque alpha since the
2608			   compositing used (>>8 instead of /255) doesn't handle
2609 it correctly. Also special-case alpha=0 for speed?
2610 Benchmark this! */
2611 if(alpha) {
2612 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2613 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2614 } else {
2615 Uint32 d = *dstp;
2616 /*
2617				 * interleave source and destination into the packed
2618				 * 0x03e07c1f layout and blend all components at once
2619 */
2620 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2621 + (s >> 3 & 0x1f);
2622 d = (d | d << 16) & 0x03e07c1f;
2623 d += (s - d) * alpha >> 5;
2624 d &= 0x03e07c1f;
2625 *dstp = (Uint16)(d | d >> 16);
2626 }
2627 }
2628 srcp++;
2629 dstp++;
2630 }, width);
2631 srcp += srcskip;
2632 dstp += dstskip;
2633 }
2634}
2635
2636/* General (slow) N->N blending with per-surface alpha */
2637static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2638{
2639 int width = info->d_width;
2640 int height = info->d_height;
2641 Uint8 *src = info->s_pixels;
2642 int srcskip = info->s_skip;
2643 Uint8 *dst = info->d_pixels;
2644 int dstskip = info->d_skip;
2645 SDL_PixelFormat *srcfmt = info->src;
2646 SDL_PixelFormat *dstfmt = info->dst;
2647 int srcbpp = srcfmt->BytesPerPixel;
2648 int dstbpp = dstfmt->BytesPerPixel;
2649 unsigned sA = srcfmt->alpha;
2650 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2651
2652 if(sA) {
2653 while ( height-- ) {
2654 DUFFS_LOOP4(
2655 {
2656 Uint32 Pixel;
2657 unsigned sR;
2658 unsigned sG;
2659 unsigned sB;
2660 unsigned dR;
2661 unsigned dG;
2662 unsigned dB;
2663 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2664 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2665 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2666 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2667 src += srcbpp;
2668 dst += dstbpp;
2669 },
2670 width);
2671 src += srcskip;
2672 dst += dstskip;
2673 }
2674 }
2675}
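
/*
 * Per channel the ALPHA_BLEND step above is the same linear interpolation
 * the specialized blitters use; assuming the usual SDL_blit.h definition it
 * amounts to roughly
 *
 *	dR = dR + ((sR - dR) * sA >> 8);
 *
 * (likewise for G and B), i.e. the >>8 approximation the FIXME comments in
 * this file refer to, traded for avoiding a divide per channel.
 */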
2676
2677/* General (slow) colorkeyed N->N blending with per-surface alpha */
2678static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2679{
2680 int width = info->d_width;
2681 int height = info->d_height;
2682 Uint8 *src = info->s_pixels;
2683 int srcskip = info->s_skip;
2684 Uint8 *dst = info->d_pixels;
2685 int dstskip = info->d_skip;
2686 SDL_PixelFormat *srcfmt = info->src;
2687 SDL_PixelFormat *dstfmt = info->dst;
2688 Uint32 ckey = srcfmt->colorkey;
2689 int srcbpp = srcfmt->BytesPerPixel;
2690 int dstbpp = dstfmt->BytesPerPixel;
2691 unsigned sA = srcfmt->alpha;
2692 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2693
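	/* fast path: 565 -> 565 with a colorkey reuses the packed 0x07e0f81f
	   blend from Blit565to565SurfaceAlpha, skipping pixels that match the key */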
211e4bff 2694 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2695 Uint16 *src16 = (Uint16 *)src;
2696 Uint16 *dst16 = (Uint16 *)dst;
2697 sA >>= 3; /* downscale alpha to 5 bits */
2698 while ( height-- ) {
2699 DUFFS_LOOP4(
2700 {
2701 Uint32 s;
2702 Uint32 d;
2703 s = *src16;
2704 if(sA && s != ckey) {
2705 d = *dst16;
2706 s = (s | s << 16) & 0x07e0f81f;
2707 d = (d | d << 16) & 0x07e0f81f;
2708 d += (s - d) * sA >> 5;
2709 d &= 0x07e0f81f;
2710 *dst16 = (Uint16)(d | d >> 16);
2711 }
2712 src16++;
2713 dst16++;
2714 },
2715 width);
2716 src16 += srcskip / 2;
2717 dst16 += dstskip / 2;
2718 }
2719 return;
2720 }
2721
e14743d1 2722 while ( height-- ) {
2723 DUFFS_LOOP4(
2724 {
2725 Uint32 Pixel;
2726 unsigned sR;
2727 unsigned sG;
2728 unsigned sB;
2729 unsigned dR;
2730 unsigned dG;
2731 unsigned dB;
2732 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2733 if(sA && Pixel != ckey) {
2734 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2735 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2736 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2737 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2738 }
2739 src += srcbpp;
2740 dst += dstbpp;
2741 },
2742 width);
2743 src += srcskip;
2744 dst += dstskip;
2745 }
2746}
2747
2748/* General (slow) N->N blending with pixel alpha */
2749static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2750{
2751 int width = info->d_width;
2752 int height = info->d_height;
2753 Uint8 *src = info->s_pixels;
2754 int srcskip = info->s_skip;
2755 Uint8 *dst = info->d_pixels;
2756 int dstskip = info->d_skip;
2757 SDL_PixelFormat *srcfmt = info->src;
2758 SDL_PixelFormat *dstfmt = info->dst;
2759
2760 int srcbpp;
2761 int dstbpp;
2762
2763 /* Set up some basic variables */
2764 srcbpp = srcfmt->BytesPerPixel;
2765 dstbpp = dstfmt->BytesPerPixel;
2766
2767 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2768 quite right. for <8bpp source alpha, it gets them very wrong
2769 (check all macros!)
2770 It is unclear whether there is a good general solution that doesn't
2771 need a branch (or a divide). */
2772 while ( height-- ) {
2773 DUFFS_LOOP4(
2774 {
2775 Uint32 Pixel;
2776 unsigned sR;
2777 unsigned sG;
2778 unsigned sB;
2779 unsigned dR;
2780 unsigned dG;
2781 unsigned dB;
2782 unsigned sA;
2783 unsigned dA;
2784 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2785 if(sA) {
2786 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2787 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2788 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2789 }
2790 src += srcbpp;
2791 dst += dstbpp;
2792 },
2793 width);
2794 src += srcskip;
2795 dst += dstskip;
2796 }
2797}
2798
2799
2800SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2801{
2802 SDL_PixelFormat *sf = surface->format;
2803 SDL_PixelFormat *df = surface->map->dst->format;
2804
2805 if(sf->Amask == 0) {
2806 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2807 if(df->BytesPerPixel == 1)
2808 return BlitNto1SurfaceAlphaKey;
2809 else
2810#if SDL_ALTIVEC_BLITTERS
2811 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2812 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2813 return Blit32to32SurfaceAlphaKeyAltivec;
2814 else
2815#endif
2816 return BlitNtoNSurfaceAlphaKey;
2817 } else {
2818 /* Per-surface alpha blits */
2819 switch(df->BytesPerPixel) {
2820 case 1:
2821 return BlitNto1SurfaceAlpha;
2822
2823 case 2:
2824 if(surface->map->identity) {
2825 if(df->Gmask == 0x7e0)
2826 {
2827#if MMX_ASMBLIT
2828 if(SDL_HasMMX())
2829 return Blit565to565SurfaceAlphaMMX;
2830 else
2831#endif
2832 return Blit565to565SurfaceAlpha;
2833 }
2834 else if(df->Gmask == 0x3e0)
2835 {
2836#if MMX_ASMBLIT
2837 if(SDL_HasMMX())
2838 return Blit555to555SurfaceAlphaMMX;
2839 else
2840#endif
2841 return Blit555to555SurfaceAlpha;
2842 }
2843 }
2844 return BlitNtoNSurfaceAlpha;
2845
2846 case 4:
2847 if(sf->Rmask == df->Rmask
2848 && sf->Gmask == df->Gmask
2849 && sf->Bmask == df->Bmask
2850 && sf->BytesPerPixel == 4)
2851 {
2852#if MMX_ASMBLIT
2853 if(sf->Rshift % 8 == 0
2854 && sf->Gshift % 8 == 0
2855 && sf->Bshift % 8 == 0
2856 && SDL_HasMMX())
2857 return BlitRGBtoRGBSurfaceAlphaMMX;
bdfa6989 2858#endif
2859#ifdef __ARM_NEON__
2860 if(sf->Rshift % 8 == 0
2861 && sf->Gshift % 8 == 0
2862 && sf->Bshift % 8 == 0)
c85a5291 2863 {
bdfa6989 2864 return BlitARGBtoXRGBalphaS_neon;
c85a5291 2865 }
e14743d1 2866#endif
2867 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2868 {
2869#if SDL_ALTIVEC_BLITTERS
2870 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2871 && SDL_HasAltiVec())
2872 return BlitRGBtoRGBSurfaceAlphaAltivec;
2873#endif
2874 return BlitRGBtoRGBSurfaceAlpha;
2875 }
2876 }
c85a5291 2877#ifdef __ARM_NEON__
2878 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2879 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2880 {
2881 return BlitABGRtoXRGBalphaS_neon;
2882 }
2883#endif
e14743d1 2884#if SDL_ALTIVEC_BLITTERS
2885 if((sf->BytesPerPixel == 4) &&
2886 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2887 return Blit32to32SurfaceAlphaAltivec;
2888 else
2889#endif
2890 return BlitNtoNSurfaceAlpha;
2891
2892 case 3:
2893 default:
2894 return BlitNtoNSurfaceAlpha;
2895 }
2896 }
2897 } else {
2898 /* Per-pixel alpha blits */
2899 switch(df->BytesPerPixel) {
2900 case 1:
2901 return BlitNto1PixelAlpha;
2902
2903 case 2:
2904#if SDL_ALTIVEC_BLITTERS
2905 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2906 df->Gmask == 0x7e0 &&
2907 df->Bmask == 0x1f && SDL_HasAltiVec())
2908 return Blit32to565PixelAlphaAltivec;
2909 else
2c4e54dd 2910#endif
2911#ifdef __ARM_NEON__
2912 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2913 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2914 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2915 return BlitARGBtoRGB565alpha_neon;
2916 else
2917 return BlitABGRtoRGB565alpha_neon;
2918 }
2919 else
e14743d1 2920#endif
2921 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2922 && sf->Gmask == 0xff00
2923 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2924 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2925 if(df->Gmask == 0x7e0)
2926 return BlitARGBto565PixelAlpha;
2927 else if(df->Gmask == 0x3e0)
2928 return BlitARGBto555PixelAlpha;
2929 }
2930 return BlitNtoNPixelAlpha;
2931
2932 case 4:
2933 if(sf->Rmask == df->Rmask
2934 && sf->Gmask == df->Gmask
2935 && sf->Bmask == df->Bmask
2936 && sf->BytesPerPixel == 4)
2937 {
2938#if MMX_ASMBLIT
2939 if(sf->Rshift % 8 == 0
2940 && sf->Gshift % 8 == 0
2941 && sf->Bshift % 8 == 0
2942 && sf->Ashift % 8 == 0
2943 && sf->Aloss == 0)
2944 {
2945 if(SDL_Has3DNow())
2946 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2947 if(SDL_HasMMX())
2948 return BlitRGBtoRGBPixelAlphaMMX;
2949 }
c85a5291 2950#endif
2951#ifdef __ARM_NEON__
2952 if(sf->Rshift % 8 == 0
2953 && sf->Gshift % 8 == 0
2954 && sf->Bshift % 8 == 0
2955 && sf->Ashift % 8 == 0)
2956 {
2957 return BlitARGBtoXRGBalpha_neon;
2958 }
e14743d1 2959#endif
2960 if(sf->Amask == 0xff000000)
2961 {
2962#if SDL_ALTIVEC_BLITTERS
2963 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2964 && SDL_HasAltiVec())
2965 return BlitRGBtoRGBPixelAlphaAltivec;
2966#endif
2967 return BlitRGBtoRGBPixelAlpha;
2968 }
2969 }
a1f34081 2970#ifdef __ARM_NEON__
c85a5291 2971 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2972 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2973 && sf->Amask == 0xff000000)
a1f34081 2974 {
2975 return BlitABGRtoXRGBalpha_neon;
2976 }
2977#endif
e14743d1 2978#if SDL_ALTIVEC_BLITTERS
2979 if (sf->Amask && sf->BytesPerPixel == 4 &&
2980 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2981 return Blit32to32PixelAlphaAltivec;
2982 else
2983#endif
2984 return BlitNtoNPixelAlpha;
2985
2986 case 3:
2987 default:
2988 return BlitNtoNPixelAlpha;
2989 }
2990 }
2991}
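
/*
 * Dispatch example (a sketch of how the selection above plays out): an
 * ARGB8888 source surface with per-pixel alpha (Amask == 0xff000000) blitted
 * onto an RGB565 destination falls into the per-pixel case 2; with neither
 * AltiVec nor NEON compiled in, the Bmask/Gmask checks pick
 * BlitARGBto565PixelAlpha, while a NEON build takes
 * BlitARGBtoRGB565alpha_neon instead.
 */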
2992