tune the preloads a bit
[sdl_omap.git] src/video/SDL_blit_A.c
e14743d1 1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
 29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35# define MMX_ASMBLIT 1
36# define GCC_ASMBLIT 1
37# elif defined(_MSC_VER) && defined(_M_IX86)
38# if (_MSC_VER <= 1200)
39# include <malloc.h>
40# if defined(_mm_free)
41# define HAVE_MMINTRIN_H 1
42# endif
43# else /* Visual Studio > VC6 always has mmintrin.h */
44# define HAVE_MMINTRIN_H 1
45# endif
46# if HAVE_MMINTRIN_H
47# define MMX_ASMBLIT 1
48# define MSVC_ASMBLIT 1
49# endif
50# endif
51#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
a1f34081 64#ifdef __ARM_NEON__
65
66/* NEON optimized blitter callers */
67#define make_neon_caller(name, neon_name) \
68extern void neon_name(void *dst, const void *src, int count); \
69static void name(SDL_BlitInfo *info) \
70{ \
71 int width = info->d_width; \
72 int height = info->d_height; \
73 Uint8 *src = info->s_pixels; \
74 Uint8 *dst = info->d_pixels; \
2c4e54dd 75 int dstBpp = info->dst->BytesPerPixel; \
76 int srcstride = width * 4 + info->s_skip; \
77 int dststride = width * dstBpp + info->d_skip; \
a1f34081 78\
79 while ( height-- ) { \
199f36ec 80 __builtin_prefetch(dst + dststride); \
2c4e54dd 81 neon_name(dst, src, width); \
82 src += srcstride; \
83 dst += dststride; \
a1f34081 84 } \
85}
86
bdfa6989 87#define make_neon_callerS(name, neon_name) \
88extern void neon_name(void *dst, const void *src, int count, unsigned int alpha); \
89static void name(SDL_BlitInfo *info) \
90{ \
91 int width = info->d_width; \
92 int height = info->d_height; \
93 Uint8 *src = info->s_pixels; \
94 Uint8 *dst = info->d_pixels; \
95 int srcskip = info->s_skip; \
96 int dstskip = info->d_skip; \
97 unsigned alpha = info->src->alpha;\
98\
99 while ( height-- ) { \
100 neon_name(dst, src, width, alpha); \
101 src += width * 4 + srcskip; \
102 dst += width * 4 + dstskip; \
103 } \
104}
105
a1f34081 106make_neon_caller(BlitABGRtoXRGBalpha_neon, neon_ABGRtoXRGBalpha)
107make_neon_caller(BlitARGBtoXRGBalpha_neon, neon_ARGBtoXRGBalpha)
2c4e54dd 108make_neon_caller(BlitABGRtoRGB565alpha_neon, neon_ABGRtoRGB565alpha)
109make_neon_caller(BlitARGBtoRGB565alpha_neon, neon_ARGBtoRGB565alpha)
bdfa6989 110make_neon_callerS(BlitABGRtoXRGBalphaS_neon, neon_ABGRtoXRGBalphaS)
111make_neon_callerS(BlitARGBtoXRGBalphaS_neon, neon_ARGBtoXRGBalphaS)
a1f34081 112
113#endif /* __ARM_NEON__ */
114
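/* Editor's note: the neon_* routines declared above are implemented in
   external assembly, so their per-pixel semantics are not visible in this
   file.  Assuming they use the same ">>8 instead of /255" compositing as
   the C blitters further down, a scalar equivalent of the ARGB to XRGB
   caller would look roughly like the sketch below (function name is
   hypothetical, not part of the source). */
#if 0
static void scalar_ARGBtoXRGBalpha(Uint32 *dst, const Uint32 *src, int count)
{
    while ( count-- ) {
        Uint32 s = *src++;
        Uint32 d = *dst;
        Uint32 a = s >> 24;
        Uint32 s1 = s & 0x00ff00ff, d1 = d & 0x00ff00ff; /* R and B */
        Uint32 s2 = s & 0x0000ff00, d2 = d & 0x0000ff00; /* G */
        d1 = (d1 + (((s1 - d1) * a) >> 8)) & 0x00ff00ff;
        d2 = (d2 + (((s2 - d2) * a) >> 8)) & 0x0000ff00;
        *dst++ = d1 | d2 | (d & 0xff000000);
    }
}
#endif
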
e14743d1 115/* N->1 blending with per-surface alpha */
116static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
117{
118 int width = info->d_width;
119 int height = info->d_height;
120 Uint8 *src = info->s_pixels;
121 int srcskip = info->s_skip;
122 Uint8 *dst = info->d_pixels;
123 int dstskip = info->d_skip;
124 Uint8 *palmap = info->table;
125 SDL_PixelFormat *srcfmt = info->src;
126 SDL_PixelFormat *dstfmt = info->dst;
127 int srcbpp = srcfmt->BytesPerPixel;
128
129 const unsigned A = srcfmt->alpha;
130
131 while ( height-- ) {
132 DUFFS_LOOP4(
133 {
134 Uint32 Pixel;
135 unsigned sR;
136 unsigned sG;
137 unsigned sB;
138 unsigned dR;
139 unsigned dG;
140 unsigned dB;
141 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
142 dR = dstfmt->palette->colors[*dst].r;
143 dG = dstfmt->palette->colors[*dst].g;
144 dB = dstfmt->palette->colors[*dst].b;
145 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
146 dR &= 0xff;
147 dG &= 0xff;
148 dB &= 0xff;
149 /* Pack RGB into 8bit pixel */
150 if ( palmap == NULL ) {
151 *dst =((dR>>5)<<(3+2))|
152 ((dG>>5)<<(2))|
153 ((dB>>6)<<(0));
154 } else {
155 *dst = palmap[((dR>>5)<<(3+2))|
156 ((dG>>5)<<(2)) |
157 ((dB>>6)<<(0))];
158 }
159 dst++;
160 src += srcbpp;
161 },
162 width);
163 src += srcskip;
164 dst += dstskip;
165 }
166}
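
/* Editor's note: ALPHA_BLEND, along with the DISEMBLE_RGB and DUFFS_LOOP
   helpers used throughout this file, comes from SDL_blit.h.  For reference,
   ALPHA_BLEND is expected to expand per channel to roughly the form below.
   This is a sketch only; check SDL_blit.h for the exact definition. */
#if 0
#define ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB)  \
do {                                            \
    dR = (((sR - dR) * (A)) >> 8) + dR;         \
    dG = (((sG - dG) * (A)) >> 8) + dG;         \
    dB = (((sB - dB) * (A)) >> 8) + dB;         \
} while (0)
#endif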
167
168/* N->1 blending with pixel alpha */
169static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
170{
171 int width = info->d_width;
172 int height = info->d_height;
173 Uint8 *src = info->s_pixels;
174 int srcskip = info->s_skip;
175 Uint8 *dst = info->d_pixels;
176 int dstskip = info->d_skip;
177 Uint8 *palmap = info->table;
178 SDL_PixelFormat *srcfmt = info->src;
179 SDL_PixelFormat *dstfmt = info->dst;
180 int srcbpp = srcfmt->BytesPerPixel;
181
182 /* FIXME: fix alpha bit field expansion here too? */
183 while ( height-- ) {
184 DUFFS_LOOP4(
185 {
186 Uint32 Pixel;
187 unsigned sR;
188 unsigned sG;
189 unsigned sB;
190 unsigned sA;
191 unsigned dR;
192 unsigned dG;
193 unsigned dB;
194 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
195 dR = dstfmt->palette->colors[*dst].r;
196 dG = dstfmt->palette->colors[*dst].g;
197 dB = dstfmt->palette->colors[*dst].b;
198 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
199 dR &= 0xff;
200 dG &= 0xff;
201 dB &= 0xff;
202 /* Pack RGB into 8bit pixel */
203 if ( palmap == NULL ) {
204 *dst =((dR>>5)<<(3+2))|
205 ((dG>>5)<<(2))|
206 ((dB>>6)<<(0));
207 } else {
208 *dst = palmap[((dR>>5)<<(3+2))|
209 ((dG>>5)<<(2)) |
210 ((dB>>6)<<(0)) ];
211 }
212 dst++;
213 src += srcbpp;
214 },
215 width);
216 src += srcskip;
217 dst += dstskip;
218 }
219}
220
221/* colorkeyed N->1 blending with per-surface alpha */
222static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
223{
224 int width = info->d_width;
225 int height = info->d_height;
226 Uint8 *src = info->s_pixels;
227 int srcskip = info->s_skip;
228 Uint8 *dst = info->d_pixels;
229 int dstskip = info->d_skip;
230 Uint8 *palmap = info->table;
231 SDL_PixelFormat *srcfmt = info->src;
232 SDL_PixelFormat *dstfmt = info->dst;
233 int srcbpp = srcfmt->BytesPerPixel;
234 Uint32 ckey = srcfmt->colorkey;
235
236 const int A = srcfmt->alpha;
237
238 while ( height-- ) {
239 DUFFS_LOOP(
240 {
241 Uint32 Pixel;
242 unsigned sR;
243 unsigned sG;
244 unsigned sB;
245 unsigned dR;
246 unsigned dG;
247 unsigned dB;
248 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
249 if ( Pixel != ckey ) {
250 dR = dstfmt->palette->colors[*dst].r;
251 dG = dstfmt->palette->colors[*dst].g;
252 dB = dstfmt->palette->colors[*dst].b;
253 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
254 dR &= 0xff;
255 dG &= 0xff;
256 dB &= 0xff;
257 /* Pack RGB into 8bit pixel */
258 if ( palmap == NULL ) {
259 *dst =((dR>>5)<<(3+2))|
260 ((dG>>5)<<(2)) |
261 ((dB>>6)<<(0));
262 } else {
263 *dst = palmap[((dR>>5)<<(3+2))|
264 ((dG>>5)<<(2)) |
265 ((dB>>6)<<(0)) ];
266 }
267 }
268 dst++;
269 src += srcbpp;
270 },
271 width);
272 src += srcskip;
273 dst += dstskip;
274 }
275}
276
277#if GCC_ASMBLIT
278/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
279static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
280{
281 int width = info->d_width;
282 int height = info->d_height;
283 Uint32 *srcp = (Uint32 *)info->s_pixels;
284 int srcskip = info->s_skip >> 2;
285 Uint32 *dstp = (Uint32 *)info->d_pixels;
286 int dstskip = info->d_skip >> 2;
287 Uint32 dalpha = info->dst->Amask;
288 Uint64 load;
289
290 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
291 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
292 load = 0x0001010100010101ULL;/* !alpha128 mask */
293 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
294 movd_m2r(dalpha, mm7); /* dst alpha mask */
295 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
296 while(height--) {
297 DUFFS_LOOP_DOUBLE2(
298 {
299 Uint32 s = *srcp++;
300 Uint32 d = *dstp;
301 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
302 + (s & d & 0x00010101)) | dalpha;
303 },{
304 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
305 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
306
307 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
308 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
309
310 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
311 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
312 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
313 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
314 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
315 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
316 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
317
318 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
319 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
320 dstp += 2;
321 srcp += 2;
322 }, width);
323 srcp += srcskip;
324 dstp += dstskip;
325 }
326 emms();
327}
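
/* Editor's note on the alpha==128 fast path above (and its MSVC and plain C
   twins further down): per byte, floor((s + d) / 2) equals
   (((s & 0xfe) + (d & 0xfe)) >> 1) + (s & d & 0x01), so masking with
   0x00fefefe lets all three channels be averaged in a single 32-bit add
   without carries crossing channel boundaries.  Scalar restatement (sketch,
   not part of the source): */
#if 0
static Uint32 average_argb8888(Uint32 s, Uint32 d)
{
    return ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
            + (s & d & 0x00010101)) | 0xff000000; /* force full alpha */
}
#endif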
328
329/* fast RGB888->(A)RGB888 blending with surface alpha */
330static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
331{
332 SDL_PixelFormat* df = info->dst;
333 unsigned alpha = info->src->alpha;
334
335 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
336 /* only call a128 version when R,G,B occupy lower bits */
337 BlitRGBtoRGBSurfaceAlpha128MMX(info);
338 } else {
339 int width = info->d_width;
340 int height = info->d_height;
341 Uint32 *srcp = (Uint32 *)info->s_pixels;
342 int srcskip = info->s_skip >> 2;
343 Uint32 *dstp = (Uint32 *)info->d_pixels;
344 int dstskip = info->d_skip >> 2;
345
346 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
347 /* form the alpha mult */
348 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
349 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
350 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
351 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
352 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
353 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
354 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
355 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
356 movd_m2r(df->Amask, mm7); /* dst alpha mask */
357 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
358
359 while(height--) {
360 DUFFS_LOOP_DOUBLE2({
361 /* One Pixel Blend */
362 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
363 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
364 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
365 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
366
367 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
368 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
369 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
370 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
371
372 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
373 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
374 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
375 ++srcp;
376 ++dstp;
377 },{
378 /* Two Pixels Blend */
379 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
380 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
381 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
382 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
383
384 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
385 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
386 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
387 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
388
389 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
390 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
 391 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
392 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
393
394 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
395 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
396 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
397 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
398
399 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
400 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
401
402 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
403
404 srcp += 2;
405 dstp += 2;
406 }, width);
407 srcp += srcskip;
408 dstp += dstskip;
409 }
410 emms();
411 }
412}
413
414/* fast ARGB888->(A)RGB888 blending with pixel alpha */
415static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
416{
417 int width = info->d_width;
418 int height = info->d_height;
419 Uint32 *srcp = (Uint32 *)info->s_pixels;
420 int srcskip = info->s_skip >> 2;
421 Uint32 *dstp = (Uint32 *)info->d_pixels;
422 int dstskip = info->d_skip >> 2;
423 SDL_PixelFormat* sf = info->src;
424 Uint32 amask = sf->Amask;
425
426 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
427 /* form multiplication mask */
428 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
429 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
430 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
431 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
432 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
433 /* form channel masks */
434 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
435 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
436 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
437 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
438 /* get alpha channel shift */
439 __asm__ __volatile__ (
440 "movd %0, %%mm5"
441 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
442
443 while(height--) {
444 DUFFS_LOOP4({
445 Uint32 alpha = *srcp & amask;
446 /* FIXME: Here we special-case opaque alpha since the
 447 compositing used (>>8 instead of /255) doesn't handle
448 it correctly. Also special-case alpha=0 for speed?
449 Benchmark this! */
450 if(alpha == 0) {
451 /* do nothing */
452 } else if(alpha == amask) {
453 /* opaque alpha -- copy RGB, keep dst alpha */
454 /* using MMX here to free up regular registers for other things */
455 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
456 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
457 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
458 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
459 por_r2r(mm1, mm2); /* src | dst -> mm2 */
460 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
461 } else {
462 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
463 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
464
465 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
466 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
467
468 __asm__ __volatile__ (
469 "movd %0, %%mm4"
470 : : "r" (alpha) ); /* 0000A000 -> mm4 */
471 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
472 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
473 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
474 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
475
476 /* blend */
477 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
478 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
479 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
480 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
481
482 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
483 movd_r2m(mm2, *dstp);/* mm2 -> dst */
484 }
485 ++srcp;
486 ++dstp;
487 }, width);
488 srcp += srcskip;
489 dstp += dstskip;
490 }
491 emms();
492}
493/* End GCC_ASMBLIT */
494
495#elif MSVC_ASMBLIT
496/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
497static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
498{
499 int width = info->d_width;
500 int height = info->d_height;
501 Uint32 *srcp = (Uint32 *)info->s_pixels;
502 int srcskip = info->s_skip >> 2;
503 Uint32 *dstp = (Uint32 *)info->d_pixels;
504 int dstskip = info->d_skip >> 2;
505 Uint32 dalpha = info->dst->Amask;
506
507 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
508
509 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
510 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
511 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
512
513 while (height--) {
514 int n = width;
515 if ( n & 1 ) {
516 Uint32 s = *srcp++;
517 Uint32 d = *dstp;
518 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
519 + (s & d & 0x00010101)) | dalpha;
520 n--;
521 }
522
523 for (n >>= 1; n > 0; --n) {
524 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
525 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
526
527 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
528 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
529
530 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
531 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
532 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
533 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
534
535 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
536 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
537 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
538 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
539
540 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
541 dstp += 2;
542 srcp += 2;
543 }
544
545 srcp += srcskip;
546 dstp += dstskip;
547 }
548 _mm_empty();
549}
550
551/* fast RGB888->(A)RGB888 blending with surface alpha */
552static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
553{
554 SDL_PixelFormat* df = info->dst;
555 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
556 unsigned alpha = info->src->alpha;
557
558 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
559 /* only call a128 version when R,G,B occupy lower bits */
560 BlitRGBtoRGBSurfaceAlpha128MMX(info);
561 } else {
562 int width = info->d_width;
563 int height = info->d_height;
564 Uint32 *srcp = (Uint32 *)info->s_pixels;
565 int srcskip = info->s_skip >> 2;
566 Uint32 *dstp = (Uint32 *)info->d_pixels;
567 int dstskip = info->d_skip >> 2;
568 Uint32 dalpha = df->Amask;
569 Uint32 amult;
570
571 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
572
573 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
574 /* form the alpha mult */
575 amult = alpha | (alpha << 8);
576 amult = amult | (amult << 16);
577 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
578 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
579 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
580 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
581 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
582
583 while (height--) {
584 int n = width;
585 if (n & 1) {
586 /* One Pixel Blend */
587 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
588 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
589
590 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
591 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
592
 593 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
594 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
595 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
596 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
597
598 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
599 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
600 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
601
602 ++srcp;
603 ++dstp;
604
605 n--;
606 }
607
608 for (n >>= 1; n > 0; --n) {
609 /* Two Pixels Blend */
610 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
611 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
612 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
613 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
614
615 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
616 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
617 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
618 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
619
620 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
621 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
622 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
623 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
624
625 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
626 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
627 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
628 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
629
630 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
631 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
632
633 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
634
635 srcp += 2;
636 dstp += 2;
637 }
638 srcp += srcskip;
639 dstp += dstskip;
640 }
641 _mm_empty();
642 }
643}
644
645/* fast ARGB888->(A)RGB888 blending with pixel alpha */
646static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
647{
648 int width = info->d_width;
649 int height = info->d_height;
650 Uint32 *srcp = (Uint32 *)info->s_pixels;
651 int srcskip = info->s_skip >> 2;
652 Uint32 *dstp = (Uint32 *)info->d_pixels;
653 int dstskip = info->d_skip >> 2;
654 SDL_PixelFormat* sf = info->src;
655 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
656 Uint32 amask = sf->Amask;
657 Uint32 ashift = sf->Ashift;
658 Uint64 multmask;
659
660 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
661
662 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
663 multmask = ~(0xFFFFi64 << (ashift * 2));
664 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
665
666 while(height--) {
667 DUFFS_LOOP4({
668 Uint32 alpha = *srcp & amask;
669 if (alpha == 0) {
670 /* do nothing */
671 } else if (alpha == amask) {
672 /* opaque alpha -- copy RGB, keep dst alpha */
673 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
674 } else {
675 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
676 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
677
678 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
679 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
680
681 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
682 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
683 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
684 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
685 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
686
687 /* blend */
688 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
689 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
690 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
691 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
692 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
693
694 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
695 }
696 ++srcp;
697 ++dstp;
698 }, width);
699 srcp += srcskip;
700 dstp += dstskip;
701 }
702 _mm_empty();
703}
704/* End MSVC_ASMBLIT */
705
706#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
707
708#if SDL_ALTIVEC_BLITTERS
709#if __MWERKS__
710#pragma altivec_model on
711#endif
712#if HAVE_ALTIVEC_H
713#include <altivec.h>
714#endif
715#include <assert.h>
716
717#if (defined(__MACOSX__) && (__GNUC__ < 4))
718 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
719 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
720 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
721 (vector unsigned short) ( a,b,c,d,e,f,g,h )
722#else
723 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
724 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
725 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
726 (vector unsigned short) { a,b,c,d,e,f,g,h }
727#endif
728
729#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
730#define VECPRINT(msg, v) do { \
731 vector unsigned int tmpvec = (vector unsigned int)(v); \
732 unsigned int *vp = (unsigned int *)&tmpvec; \
733 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
734} while (0)
735
 736/* the permutation vector that takes the high bytes out of all the appropriate shorts
737 (vector unsigned char)(
738 0x00, 0x10, 0x02, 0x12,
739 0x04, 0x14, 0x06, 0x16,
740 0x08, 0x18, 0x0A, 0x1A,
741 0x0C, 0x1C, 0x0E, 0x1E );
742*/
743#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
744#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
745#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
746#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
747 ? vec_lvsl(0, src) \
748 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
749
750
751#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
752 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
753 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
754 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
755 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
756 /* valpha2 is 255-alpha */ \
757 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
758 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
759 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
760 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
761 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
762 /* add source and dest */ \
763 vtemp1 = vec_add(vtemp1, vtemp3); \
764 vtemp2 = vec_add(vtemp2, vtemp4); \
765 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
766 vtemp1 = vec_add(vtemp1, v1_16); \
767 vtemp3 = vec_sr(vtemp1, v8_16); \
768 vtemp1 = vec_add(vtemp1, vtemp3); \
769 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
770 vtemp2 = vec_add(vtemp2, v1_16); \
771 vtemp4 = vec_sr(vtemp2, v8_16); \
772 vtemp2 = vec_add(vtemp2, vtemp4); \
773 /* (>>8) and get ARGBARGBARGBARGB */ \
774 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
775} while (0)
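
/* Editor's note: per 8-bit channel, VEC_MULTIPLY_ALPHA computes
   t = s*a + d*(255 - a) and then approximates t / 255 as
   ((t + 1) + ((t + 1) >> 8)) >> 8; the merge permute supplies the final
   >> 8 by picking the high byte of every 16-bit lane.  Scalar sketch of
   one channel (not part of the source): */
#if 0
static Uint8 vec_multiply_alpha_scalar(Uint8 s, Uint8 d, Uint8 a)
{
    unsigned t = s * a + d * (255 - a); /* at most 255*255 */
    t += 1;
    t += t >> 8;
    return (Uint8)(t >> 8);
}
#endif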
776
777/* Calculate the permute vector used for 32->32 swizzling */
778static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
779 const SDL_PixelFormat *dstfmt)
780{
781 /*
782 * We have to assume that the bits that aren't used by other
 783 * colors are alpha, and it's one complete byte, since some formats
784 * leave alpha with a zero mask, but we should still swizzle the bits.
785 */
786 /* ARGB */
787 const static struct SDL_PixelFormat default_pixel_format = {
788 NULL, 0, 0,
789 0, 0, 0, 0,
790 16, 8, 0, 24,
791 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
792 0, 0};
793 if (!srcfmt) {
794 srcfmt = &default_pixel_format;
795 }
796 if (!dstfmt) {
797 dstfmt = &default_pixel_format;
798 }
799 const vector unsigned char plus = VECUINT8_LITERAL
800 ( 0x00, 0x00, 0x00, 0x00,
801 0x04, 0x04, 0x04, 0x04,
802 0x08, 0x08, 0x08, 0x08,
803 0x0C, 0x0C, 0x0C, 0x0C );
804 vector unsigned char vswiz;
805 vector unsigned int srcvec;
806#define RESHIFT(X) (3 - ((X) >> 3))
807 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
808 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
809 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
810 Uint32 amask;
811 /* Use zero for alpha if either surface doesn't have alpha */
812 if (dstfmt->Amask) {
813 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
814 } else {
815 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
816 }
817#undef RESHIFT
818 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
819 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
820 return(vswiz);
821}
822
823static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
824{
825 int height = info->d_height;
826 Uint8 *src = (Uint8 *)info->s_pixels;
827 int srcskip = info->s_skip;
828 Uint8 *dst = (Uint8 *)info->d_pixels;
829 int dstskip = info->d_skip;
830 SDL_PixelFormat *srcfmt = info->src;
831
832 vector unsigned char v0 = vec_splat_u8(0);
833 vector unsigned short v8_16 = vec_splat_u16(8);
834 vector unsigned short v1_16 = vec_splat_u16(1);
835 vector unsigned short v2_16 = vec_splat_u16(2);
836 vector unsigned short v3_16 = vec_splat_u16(3);
837 vector unsigned int v8_32 = vec_splat_u32(8);
838 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
839 vector unsigned short v3f = VECUINT16_LITERAL(
840 0x003f, 0x003f, 0x003f, 0x003f,
841 0x003f, 0x003f, 0x003f, 0x003f);
842 vector unsigned short vfc = VECUINT16_LITERAL(
843 0x00fc, 0x00fc, 0x00fc, 0x00fc,
844 0x00fc, 0x00fc, 0x00fc, 0x00fc);
845
846 /*
847 0x10 - 0x1f is the alpha
848 0x00 - 0x0e evens are the red
849 0x01 - 0x0f odds are zero
850 */
851 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
852 0x10, 0x00, 0x01, 0x01,
853 0x10, 0x02, 0x01, 0x01,
854 0x10, 0x04, 0x01, 0x01,
855 0x10, 0x06, 0x01, 0x01
856 );
857 vector unsigned char vredalpha2 = (vector unsigned char)(
858 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
859 );
860 /*
861 0x00 - 0x0f is ARxx ARxx ARxx ARxx
 862 0x11 - 0x1f odds are blue
863 */
864 vector unsigned char vblue1 = VECUINT8_LITERAL(
865 0x00, 0x01, 0x02, 0x11,
866 0x04, 0x05, 0x06, 0x13,
867 0x08, 0x09, 0x0a, 0x15,
868 0x0c, 0x0d, 0x0e, 0x17
869 );
870 vector unsigned char vblue2 = (vector unsigned char)(
871 vec_add((vector unsigned int)vblue1, v8_32)
872 );
873 /*
874 0x00 - 0x0f is ARxB ARxB ARxB ARxB
 875 0x10 - 0x1e evens are green
876 */
877 vector unsigned char vgreen1 = VECUINT8_LITERAL(
878 0x00, 0x01, 0x10, 0x03,
879 0x04, 0x05, 0x12, 0x07,
880 0x08, 0x09, 0x14, 0x0b,
881 0x0c, 0x0d, 0x16, 0x0f
882 );
883 vector unsigned char vgreen2 = (vector unsigned char)(
884 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
885 );
886 vector unsigned char vgmerge = VECUINT8_LITERAL(
887 0x00, 0x02, 0x00, 0x06,
888 0x00, 0x0a, 0x00, 0x0e,
889 0x00, 0x12, 0x00, 0x16,
890 0x00, 0x1a, 0x00, 0x1e);
891 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
892 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
893 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
894
895 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
896 vf800 = vec_sl(vf800, vec_splat_u16(8));
897
898 while(height--) {
899 int extrawidth;
900 vector unsigned char valigner;
901 vector unsigned char vsrc;
902 vector unsigned char voverflow;
903 int width = info->d_width;
904
905#define ONE_PIXEL_BLEND(condition, widthvar) \
906 while (condition) { \
907 Uint32 Pixel; \
908 unsigned sR, sG, sB, dR, dG, dB, sA; \
909 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
910 if(sA) { \
911 unsigned short dstpixel = *((unsigned short *)dst); \
912 dR = (dstpixel >> 8) & 0xf8; \
913 dG = (dstpixel >> 3) & 0xfc; \
914 dB = (dstpixel << 3) & 0xf8; \
915 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
916 *((unsigned short *)dst) = ( \
917 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
918 ); \
919 } \
920 src += 4; \
921 dst += 2; \
922 widthvar--; \
923 }
924 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
925 extrawidth = (width % 8);
926 valigner = VEC_ALIGNER(src);
927 vsrc = (vector unsigned char)vec_ld(0, src);
928 width -= extrawidth;
929 while (width) {
930 vector unsigned char valpha;
931 vector unsigned char vsrc1, vsrc2;
932 vector unsigned char vdst1, vdst2;
933 vector unsigned short vR, vG, vB;
934 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
935
936 /* Load 8 pixels from src as ARGB */
937 voverflow = (vector unsigned char)vec_ld(15, src);
938 vsrc = vec_perm(vsrc, voverflow, valigner);
939 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
940 src += 16;
941 vsrc = (vector unsigned char)vec_ld(15, src);
942 voverflow = vec_perm(voverflow, vsrc, valigner);
943 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
944 src += 16;
945
946 /* Load 8 pixels from dst as XRGB */
947 voverflow = vec_ld(0, dst);
948 vR = vec_and((vector unsigned short)voverflow, vf800);
949 vB = vec_sl((vector unsigned short)voverflow, v3_16);
950 vG = vec_sl(vB, v2_16);
951 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
952 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
953 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
954 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
955 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
956 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
957
958 /* Alpha blend 8 pixels as ARGB */
959 valpha = vec_perm(vsrc1, v0, valphaPermute);
960 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
961 valpha = vec_perm(vsrc2, v0, valphaPermute);
962 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
963
964 /* Convert 8 pixels to 565 */
965 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
966 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
967 vgpixel = vec_and(vgpixel, vfc);
968 vgpixel = vec_sl(vgpixel, v3_16);
969 vrpixel = vec_sl(vpixel, v1_16);
970 vrpixel = vec_and(vrpixel, vf800);
971 vbpixel = vec_and(vpixel, v3f);
972 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
973 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
974
975 /* Store 8 pixels */
976 vec_st(vdst1, 0, dst);
977
978 width -= 8;
979 dst += 16;
980 }
981 ONE_PIXEL_BLEND((extrawidth), extrawidth);
982#undef ONE_PIXEL_BLEND
983 src += srcskip;
984 dst += dstskip;
985 }
986}
987
988static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
989{
990 unsigned alpha = info->src->alpha;
991 int height = info->d_height;
992 Uint32 *srcp = (Uint32 *)info->s_pixels;
993 int srcskip = info->s_skip >> 2;
994 Uint32 *dstp = (Uint32 *)info->d_pixels;
995 int dstskip = info->d_skip >> 2;
996 SDL_PixelFormat *srcfmt = info->src;
997 SDL_PixelFormat *dstfmt = info->dst;
998 unsigned sA = srcfmt->alpha;
999 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1000 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
1001 Uint32 ckey = info->src->colorkey;
1002 vector unsigned char mergePermute;
1003 vector unsigned char vsrcPermute;
1004 vector unsigned char vdstPermute;
1005 vector unsigned char vsdstPermute;
1006 vector unsigned char valpha;
1007 vector unsigned char valphamask;
1008 vector unsigned char vbits;
1009 vector unsigned char v0;
1010 vector unsigned short v1;
1011 vector unsigned short v8;
1012 vector unsigned int vckey;
1013 vector unsigned int vrgbmask;
1014
1015 mergePermute = VEC_MERGE_PERMUTE();
1016 v0 = vec_splat_u8(0);
1017 v1 = vec_splat_u16(1);
1018 v8 = vec_splat_u16(8);
1019
1020 /* set the alpha to 255 on the destination surf */
1021 valphamask = VEC_ALPHA_MASK();
1022
1023 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1024 vdstPermute = calc_swizzle32(NULL, dstfmt);
1025 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1026
1027 /* set a vector full of alpha and 255-alpha */
1028 ((unsigned char *)&valpha)[0] = alpha;
1029 valpha = vec_splat(valpha, 0);
1030 vbits = (vector unsigned char)vec_splat_s8(-1);
1031
1032 ckey &= rgbmask;
1033 ((unsigned int *)(char*)&vckey)[0] = ckey;
1034 vckey = vec_splat(vckey, 0);
1035 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
1036 vrgbmask = vec_splat(vrgbmask, 0);
1037
1038 while(height--) {
1039 int width = info->d_width;
1040#define ONE_PIXEL_BLEND(condition, widthvar) \
1041 while (condition) { \
1042 Uint32 Pixel; \
1043 unsigned sR, sG, sB, dR, dG, dB; \
1044 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
1045 if(sA && Pixel != ckey) { \
1046 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
1047 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1048 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1049 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1050 } \
1051 dstp++; \
1052 srcp++; \
1053 widthvar--; \
1054 }
1055 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1056 if (width > 0) {
1057 int extrawidth = (width % 4);
1058 vector unsigned char valigner = VEC_ALIGNER(srcp);
1059 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1060 width -= extrawidth;
1061 while (width) {
1062 vector unsigned char vsel;
1063 vector unsigned char voverflow;
1064 vector unsigned char vd;
1065 vector unsigned char vd_orig;
1066
1067 /* s = *srcp */
1068 voverflow = (vector unsigned char)vec_ld(15, srcp);
1069 vs = vec_perm(vs, voverflow, valigner);
1070
1071 /* vsel is set for items that match the key */
1072 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1073 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1074
1075 /* permute to source format */
1076 vs = vec_perm(vs, valpha, vsrcPermute);
1077
1078 /* d = *dstp */
1079 vd = (vector unsigned char)vec_ld(0, dstp);
1080 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1081
1082 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1083
1084 /* set the alpha channel to full on */
1085 vd = vec_or(vd, valphamask);
1086
1087 /* mask out color key */
1088 vd = vec_sel(vd, vd_orig, vsel);
1089
1090 /* permute to dest format */
1091 vd = vec_perm(vd, vbits, vdstPermute);
1092
1093 /* *dstp = res */
1094 vec_st((vector unsigned int)vd, 0, dstp);
1095
1096 srcp += 4;
1097 dstp += 4;
1098 width -= 4;
1099 vs = voverflow;
1100 }
1101 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1102 }
1103#undef ONE_PIXEL_BLEND
1104
1105 srcp += srcskip;
1106 dstp += dstskip;
1107 }
1108}
1109
1110
1111static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1112{
1113 int width = info->d_width;
1114 int height = info->d_height;
1115 Uint32 *srcp = (Uint32 *)info->s_pixels;
1116 int srcskip = info->s_skip >> 2;
1117 Uint32 *dstp = (Uint32 *)info->d_pixels;
1118 int dstskip = info->d_skip >> 2;
1119 SDL_PixelFormat *srcfmt = info->src;
1120 SDL_PixelFormat *dstfmt = info->dst;
1121 vector unsigned char mergePermute;
1122 vector unsigned char valphaPermute;
1123 vector unsigned char vsrcPermute;
1124 vector unsigned char vdstPermute;
1125 vector unsigned char vsdstPermute;
1126 vector unsigned char valphamask;
1127 vector unsigned char vpixelmask;
1128 vector unsigned char v0;
1129 vector unsigned short v1;
1130 vector unsigned short v8;
1131
1132 v0 = vec_splat_u8(0);
1133 v1 = vec_splat_u16(1);
1134 v8 = vec_splat_u16(8);
1135 mergePermute = VEC_MERGE_PERMUTE();
1136 valphamask = VEC_ALPHA_MASK();
1137 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1138 vpixelmask = vec_nor(valphamask, v0);
1139 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1140 vdstPermute = calc_swizzle32(NULL, dstfmt);
1141 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1142
1143 while ( height-- ) {
1144 width = info->d_width;
1145#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1146 Uint32 Pixel; \
1147 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1148 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1149 if(sA) { \
1150 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1151 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1152 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1153 } \
1154 ++srcp; \
1155 ++dstp; \
1156 widthvar--; \
1157 }
1158 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1159 if (width > 0) {
1160 /* vsrcPermute */
1161 /* vdstPermute */
1162 int extrawidth = (width % 4);
1163 vector unsigned char valigner = VEC_ALIGNER(srcp);
1164 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1165 width -= extrawidth;
1166 while (width) {
1167 vector unsigned char voverflow;
1168 vector unsigned char vd;
1169 vector unsigned char valpha;
1170 vector unsigned char vdstalpha;
1171 /* s = *srcp */
1172 voverflow = (vector unsigned char)vec_ld(15, srcp);
1173 vs = vec_perm(vs, voverflow, valigner);
1174 vs = vec_perm(vs, v0, vsrcPermute);
1175
1176 valpha = vec_perm(vs, v0, valphaPermute);
1177
1178 /* d = *dstp */
1179 vd = (vector unsigned char)vec_ld(0, dstp);
1180 vd = vec_perm(vd, v0, vsdstPermute);
1181 vdstalpha = vec_and(vd, valphamask);
1182
1183 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1184
1185 /* set the alpha to the dest alpha */
1186 vd = vec_and(vd, vpixelmask);
1187 vd = vec_or(vd, vdstalpha);
1188 vd = vec_perm(vd, v0, vdstPermute);
1189
1190 /* *dstp = res */
1191 vec_st((vector unsigned int)vd, 0, dstp);
1192
1193 srcp += 4;
1194 dstp += 4;
1195 width -= 4;
1196 vs = voverflow;
1197
1198 }
1199 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1200 }
1201 srcp += srcskip;
1202 dstp += dstskip;
1203#undef ONE_PIXEL_BLEND
1204 }
1205}
1206
1207/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1208static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1209{
1210 int width = info->d_width;
1211 int height = info->d_height;
1212 Uint32 *srcp = (Uint32 *)info->s_pixels;
1213 int srcskip = info->s_skip >> 2;
1214 Uint32 *dstp = (Uint32 *)info->d_pixels;
1215 int dstskip = info->d_skip >> 2;
1216 vector unsigned char mergePermute;
1217 vector unsigned char valphaPermute;
1218 vector unsigned char valphamask;
1219 vector unsigned char vpixelmask;
1220 vector unsigned char v0;
1221 vector unsigned short v1;
1222 vector unsigned short v8;
1223 v0 = vec_splat_u8(0);
1224 v1 = vec_splat_u16(1);
1225 v8 = vec_splat_u16(8);
1226 mergePermute = VEC_MERGE_PERMUTE();
1227 valphamask = VEC_ALPHA_MASK();
1228 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1229
1230
1231 vpixelmask = vec_nor(valphamask, v0);
1232 while(height--) {
1233 width = info->d_width;
1234#define ONE_PIXEL_BLEND(condition, widthvar) \
1235 while ((condition)) { \
1236 Uint32 dalpha; \
1237 Uint32 d; \
1238 Uint32 s1; \
1239 Uint32 d1; \
1240 Uint32 s = *srcp; \
1241 Uint32 alpha = s >> 24; \
1242 if(alpha) { \
1243 if(alpha == SDL_ALPHA_OPAQUE) { \
1244 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1245 } else { \
1246 d = *dstp; \
1247 dalpha = d & 0xff000000; \
1248 s1 = s & 0xff00ff; \
1249 d1 = d & 0xff00ff; \
1250 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1251 s &= 0xff00; \
1252 d &= 0xff00; \
1253 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1254 *dstp = d1 | d | dalpha; \
1255 } \
1256 } \
1257 ++srcp; \
1258 ++dstp; \
1259 widthvar--; \
1260 }
1261 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1262 if (width > 0) {
1263 int extrawidth = (width % 4);
1264 vector unsigned char valigner = VEC_ALIGNER(srcp);
1265 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1266 width -= extrawidth;
1267 while (width) {
1268 vector unsigned char voverflow;
1269 vector unsigned char vd;
1270 vector unsigned char valpha;
1271 vector unsigned char vdstalpha;
1272 /* s = *srcp */
1273 voverflow = (vector unsigned char)vec_ld(15, srcp);
1274 vs = vec_perm(vs, voverflow, valigner);
1275
1276 valpha = vec_perm(vs, v0, valphaPermute);
1277
1278 /* d = *dstp */
1279 vd = (vector unsigned char)vec_ld(0, dstp);
1280 vdstalpha = vec_and(vd, valphamask);
1281
1282 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1283
1284 /* set the alpha to the dest alpha */
1285 vd = vec_and(vd, vpixelmask);
1286 vd = vec_or(vd, vdstalpha);
1287
1288 /* *dstp = res */
1289 vec_st((vector unsigned int)vd, 0, dstp);
1290
1291 srcp += 4;
1292 dstp += 4;
1293 width -= 4;
1294 vs = voverflow;
1295 }
1296 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1297 }
1298 srcp += srcskip;
1299 dstp += dstskip;
1300 }
1301#undef ONE_PIXEL_BLEND
1302}
1303
1304static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1305{
1306 /* XXX : 6 */
1307 unsigned alpha = info->src->alpha;
1308 int height = info->d_height;
1309 Uint32 *srcp = (Uint32 *)info->s_pixels;
1310 int srcskip = info->s_skip >> 2;
1311 Uint32 *dstp = (Uint32 *)info->d_pixels;
1312 int dstskip = info->d_skip >> 2;
1313 SDL_PixelFormat *srcfmt = info->src;
1314 SDL_PixelFormat *dstfmt = info->dst;
1315 unsigned sA = srcfmt->alpha;
1316 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1317 vector unsigned char mergePermute;
1318 vector unsigned char vsrcPermute;
1319 vector unsigned char vdstPermute;
1320 vector unsigned char vsdstPermute;
1321 vector unsigned char valpha;
1322 vector unsigned char valphamask;
1323 vector unsigned char vbits;
1324 vector unsigned short v1;
1325 vector unsigned short v8;
1326
1327 mergePermute = VEC_MERGE_PERMUTE();
1328 v1 = vec_splat_u16(1);
1329 v8 = vec_splat_u16(8);
1330
1331 /* set the alpha to 255 on the destination surf */
1332 valphamask = VEC_ALPHA_MASK();
1333
1334 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1335 vdstPermute = calc_swizzle32(NULL, dstfmt);
1336 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1337
1338 /* set a vector full of alpha and 255-alpha */
1339 ((unsigned char *)&valpha)[0] = alpha;
1340 valpha = vec_splat(valpha, 0);
1341 vbits = (vector unsigned char)vec_splat_s8(-1);
1342
1343 while(height--) {
1344 int width = info->d_width;
1345#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1346 Uint32 Pixel; \
1347 unsigned sR, sG, sB, dR, dG, dB; \
1348 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1349 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1350 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1351 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1352 ++srcp; \
1353 ++dstp; \
1354 widthvar--; \
1355 }
1356 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1357 if (width > 0) {
1358 int extrawidth = (width % 4);
1359 vector unsigned char valigner = VEC_ALIGNER(srcp);
1360 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1361 width -= extrawidth;
1362 while (width) {
1363 vector unsigned char voverflow;
1364 vector unsigned char vd;
1365
1366 /* s = *srcp */
1367 voverflow = (vector unsigned char)vec_ld(15, srcp);
1368 vs = vec_perm(vs, voverflow, valigner);
1369 vs = vec_perm(vs, valpha, vsrcPermute);
1370
1371 /* d = *dstp */
1372 vd = (vector unsigned char)vec_ld(0, dstp);
1373 vd = vec_perm(vd, vd, vsdstPermute);
1374
1375 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1376
1377 /* set the alpha channel to full on */
1378 vd = vec_or(vd, valphamask);
1379 vd = vec_perm(vd, vbits, vdstPermute);
1380
1381 /* *dstp = res */
1382 vec_st((vector unsigned int)vd, 0, dstp);
1383
1384 srcp += 4;
1385 dstp += 4;
1386 width -= 4;
1387 vs = voverflow;
1388 }
1389 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1390 }
1391#undef ONE_PIXEL_BLEND
1392
1393 srcp += srcskip;
1394 dstp += dstskip;
1395 }
1396
1397}
1398
1399
1400/* fast RGB888->(A)RGB888 blending */
1401static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1402{
1403 unsigned alpha = info->src->alpha;
1404 int height = info->d_height;
1405 Uint32 *srcp = (Uint32 *)info->s_pixels;
1406 int srcskip = info->s_skip >> 2;
1407 Uint32 *dstp = (Uint32 *)info->d_pixels;
1408 int dstskip = info->d_skip >> 2;
1409 vector unsigned char mergePermute;
1410 vector unsigned char valpha;
1411 vector unsigned char valphamask;
1412 vector unsigned short v1;
1413 vector unsigned short v8;
1414
1415 mergePermute = VEC_MERGE_PERMUTE();
1416 v1 = vec_splat_u16(1);
1417 v8 = vec_splat_u16(8);
1418
1419 /* set the alpha to 255 on the destination surf */
1420 valphamask = VEC_ALPHA_MASK();
1421
1422 /* set a vector full of alpha and 255-alpha */
1423 ((unsigned char *)&valpha)[0] = alpha;
1424 valpha = vec_splat(valpha, 0);
1425
1426 while(height--) {
1427 int width = info->d_width;
1428#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1429 Uint32 s = *srcp; \
1430 Uint32 d = *dstp; \
1431 Uint32 s1 = s & 0xff00ff; \
1432 Uint32 d1 = d & 0xff00ff; \
1433 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1434 & 0xff00ff; \
1435 s &= 0xff00; \
1436 d &= 0xff00; \
1437 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1438 *dstp = d1 | d | 0xff000000; \
1439 ++srcp; \
1440 ++dstp; \
1441 widthvar--; \
1442 }
1443 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1444 if (width > 0) {
1445 int extrawidth = (width % 4);
1446 vector unsigned char valigner = VEC_ALIGNER(srcp);
1447 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1448 width -= extrawidth;
1449 while (width) {
1450 vector unsigned char voverflow;
1451 vector unsigned char vd;
1452
1453 /* s = *srcp */
1454 voverflow = (vector unsigned char)vec_ld(15, srcp);
1455 vs = vec_perm(vs, voverflow, valigner);
1456
1457 /* d = *dstp */
1458 vd = (vector unsigned char)vec_ld(0, dstp);
1459
1460 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1461
1462 /* set the alpha channel to full on */
1463 vd = vec_or(vd, valphamask);
1464
1465 /* *dstp = res */
1466 vec_st((vector unsigned int)vd, 0, dstp);
1467
1468 srcp += 4;
1469 dstp += 4;
1470 width -= 4;
1471 vs = voverflow;
1472 }
1473 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1474 }
1475#undef ONE_PIXEL_BLEND
1476
1477 srcp += srcskip;
1478 dstp += dstskip;
1479 }
1480}
1481#if __MWERKS__
1482#pragma altivec_model off
1483#endif
1484#endif /* SDL_ALTIVEC_BLITTERS */
1485
1486/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1487static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1488{
1489 int width = info->d_width;
1490 int height = info->d_height;
1491 Uint32 *srcp = (Uint32 *)info->s_pixels;
1492 int srcskip = info->s_skip >> 2;
1493 Uint32 *dstp = (Uint32 *)info->d_pixels;
1494 int dstskip = info->d_skip >> 2;
1495
1496 while(height--) {
1497 DUFFS_LOOP4({
1498 Uint32 s = *srcp++;
1499 Uint32 d = *dstp;
1500 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1501 + (s & d & 0x00010101)) | 0xff000000;
1502 }, width);
1503 srcp += srcskip;
1504 dstp += dstskip;
1505 }
1506}
1507
1508/* fast RGB888->(A)RGB888 blending with surface alpha */
1509static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1510{
1511 unsigned alpha = info->src->alpha;
1512 if(alpha == 128) {
1513 BlitRGBtoRGBSurfaceAlpha128(info);
1514 } else {
1515 int width = info->d_width;
1516 int height = info->d_height;
1517 Uint32 *srcp = (Uint32 *)info->s_pixels;
1518 int srcskip = info->s_skip >> 2;
1519 Uint32 *dstp = (Uint32 *)info->d_pixels;
1520 int dstskip = info->d_skip >> 2;
1521 Uint32 s;
1522 Uint32 d;
1523 Uint32 s1;
1524 Uint32 d1;
1525
1526 while(height--) {
1527 DUFFS_LOOP_DOUBLE2({
1528 /* One Pixel Blend */
1529 s = *srcp;
1530 d = *dstp;
1531 s1 = s & 0xff00ff;
1532 d1 = d & 0xff00ff;
1533 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1534 & 0xff00ff;
1535 s &= 0xff00;
1536 d &= 0xff00;
1537 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1538 *dstp = d1 | d | 0xff000000;
1539 ++srcp;
1540 ++dstp;
1541 },{
1542 /* Two Pixels Blend */
1543 s = *srcp;
1544 d = *dstp;
1545 s1 = s & 0xff00ff;
1546 d1 = d & 0xff00ff;
1547 d1 += (s1 - d1) * alpha >> 8;
1548 d1 &= 0xff00ff;
1549
1550 s = ((s & 0xff00) >> 8) |
1551 ((srcp[1] & 0xff00) << 8);
1552 d = ((d & 0xff00) >> 8) |
1553 ((dstp[1] & 0xff00) << 8);
1554 d += (s - d) * alpha >> 8;
1555 d &= 0x00ff00ff;
1556
1557 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1558 ++srcp;
1559
1560 s1 = *srcp;
1561 d1 = *dstp;
1562 s1 &= 0xff00ff;
1563 d1 &= 0xff00ff;
1564 d1 += (s1 - d1) * alpha >> 8;
1565 d1 &= 0xff00ff;
1566
1567 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1568 ++srcp;
1569 ++dstp;
1570 }, width);
1571 srcp += srcskip;
1572 dstp += dstskip;
1573 }
1574 }
1575}
1576
1577/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1578static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1579{
1580 int width = info->d_width;
1581 int height = info->d_height;
1582 Uint32 *srcp = (Uint32 *)info->s_pixels;
1583 int srcskip = info->s_skip >> 2;
1584 Uint32 *dstp = (Uint32 *)info->d_pixels;
1585 int dstskip = info->d_skip >> 2;
1586
1587 while(height--) {
1588 DUFFS_LOOP4({
1589 Uint32 dalpha;
1590 Uint32 d;
1591 Uint32 s1;
1592 Uint32 d1;
1593 Uint32 s = *srcp;
1594 Uint32 alpha = s >> 24;
1595 /* FIXME: Here we special-case opaque alpha since the
 1596 compositing used (>>8 instead of /255) doesn't handle
1597 it correctly. Also special-case alpha=0 for speed?
1598 Benchmark this! */
1599 if(alpha) {
1600 if(alpha == SDL_ALPHA_OPAQUE) {
1601 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1602 } else {
1603 /*
1604 * take out the middle component (green), and process
1605 * the other two in parallel. One multiply less.
1606 */
1607 d = *dstp;
1608 dalpha = d & 0xff000000;
1609 s1 = s & 0xff00ff;
1610 d1 = d & 0xff00ff;
1611 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1612 s &= 0xff00;
1613 d &= 0xff00;
1614 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1615 *dstp = d1 | d | dalpha;
1616 }
1617 }
1618 ++srcp;
1619 ++dstp;
1620 }, width);
1621 srcp += srcskip;
1622 dstp += dstskip;
1623 }
1624}
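
/* Editor's note on the FIXME above: with >>8 compositing,
   d + (((s - d) * 255) >> 8) never quite reaches s (for example s = 255,
   d = 0 gives (255 * 255) >> 8 = 254), which is why alpha == SDL_ALPHA_OPAQUE
   is special-cased to a plain copy here and in the MMX variants. */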
1625
1626#if GCC_ASMBLIT
1627/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1628static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1629{
1630 int width = info->d_width;
1631 int height = info->d_height;
1632 Uint32 *srcp = (Uint32 *)info->s_pixels;
1633 int srcskip = info->s_skip >> 2;
1634 Uint32 *dstp = (Uint32 *)info->d_pixels;
1635 int dstskip = info->d_skip >> 2;
1636 SDL_PixelFormat* sf = info->src;
1637 Uint32 amask = sf->Amask;
1638
1639 __asm__ (
1640 /* make mm6 all zeros. */
1641 "pxor %%mm6, %%mm6\n"
1642
1643 /* Make a mask to preserve the alpha. */
1644 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1645 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1646 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1647 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1648 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1649
1650 /* form channel masks */
1651 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1652 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1653 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1654 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1655
1656 /* get alpha channel shift */
1657 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1658
1659 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1660
1661 while(height--) {
1662
1663 DUFFS_LOOP4({
1664 Uint32 alpha;
1665
1666 __asm__ (
1667 "prefetch 64(%0)\n"
1668 "prefetch 64(%1)\n"
1669 : : "r" (srcp), "r" (dstp) );
1670
1671 alpha = *srcp & amask;
1672 /* FIXME: Here we special-case opaque alpha since the
 1673 compositing used (>>8 instead of /255) doesn't handle
1674 it correctly. Also special-case alpha=0 for speed?
1675 Benchmark this! */
1676 if(alpha == 0) {
1677 /* do nothing */
1678 }
1679 else if(alpha == amask) {
1680 /* opaque alpha -- copy RGB, keep dst alpha */
1681 /* using MMX here to free up regular registers for other things */
1682 __asm__ (
1683 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1684 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1685 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
 1686 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1687 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1688 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1689
1690 : : "r" (srcp), "r" (dstp) );
1691 }
1692
1693 else {
1694 __asm__ (
1695 /* load in the source, and dst. */
1696 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1697 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1698
1699 /* Move the src alpha into mm2 */
1700
1701 /* if supporting pshufw */
1702 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1703 /*"psrlw $8, %%mm2\n" */
1704
1705 /* else: */
1706 "movd %2, %%mm2\n"
1707 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1708 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1709 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1710 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1711
1712 /* move the colors into words. */
1713 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
 1714 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1715
1716 /* src - dst */
1717 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1718
1719 /* A * (src-dst) */
1720 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1721 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1722 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1723
1724 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1725
1726 "movd %%mm0, (%1)\n" /* result in mm0 */
1727
1728 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1729
1730 }
1731 ++srcp;
1732 ++dstp;
1733 }, width);
1734 srcp += srcskip;
1735 dstp += dstskip;
1736 }
1737
1738 __asm__ (
1739 "emms\n"
1740 : );
1741}
1742/* End GCC_ASMBLIT */
1743
1744#elif MSVC_ASMBLIT
1745/* fast (as in MMX with prefetch) ARGB8888->(A)RGB8888 blending with pixel alpha */
1746static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1747{
1748 int width = info->d_width;
1749 int height = info->d_height;
1750 Uint32 *srcp = (Uint32 *)info->s_pixels;
1751 int srcskip = info->s_skip >> 2;
1752 Uint32 *dstp = (Uint32 *)info->d_pixels;
1753 int dstskip = info->d_skip >> 2;
1754 SDL_PixelFormat* sf = info->src;
1755 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1756 Uint32 amask = sf->Amask;
1757 Uint32 ashift = sf->Ashift;
1758 Uint64 multmask;
1759
1760 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1761
1762 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
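	/* After punpcklbw the 8-bit alpha that sits at bit Ashift of the pixel
	   occupies the 16-bit word at bit 2*Ashift, so clearing that word in the
	   multiplier leaves the destination alpha untouched by the blend below. */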
1763 multmask = ~(0xFFFFi64 << (ashift * 2));
1764 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1765
1766 while(height--) {
1767 DUFFS_LOOP4({
1768 Uint32 alpha;
1769
1770 _m_prefetch(srcp + 16);
1771 _m_prefetch(dstp + 16);
1772
1773 alpha = *srcp & amask;
1774 if (alpha == 0) {
1775 /* do nothing */
1776 } else if (alpha == amask) {
1777 /* copy RGB, keep dst alpha */
1778 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1779 } else {
1780 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1781 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1782
1783 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1784 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1785
1786 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1787 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1788 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1789 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1790 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1791
1792 /* blend */
1793 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1794 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1795 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1796 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1797 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1798
1799 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1800 }
1801 ++srcp;
1802 ++dstp;
1803 }, width);
1804 srcp += srcskip;
1805 dstp += dstskip;
1806 }
1807 _mm_empty();
1808}
1809/* End MSVC_ASMBLIT */
1810
1811#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1812
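/*
 * For reference: a minimal scalar sketch of what the per-pixel-alpha MMX/3DNow!
 * routines above compute, assuming the common ARGB8888 layout (Ashift == 24).
 * The helper name is hypothetical and the block is not compiled; it only
 * illustrates the >>8 compositing (alpha/256 rather than alpha/255) noted in
 * the FIXME comments, which is why alpha == 255 is special-cased.
 */
#if 0
static Uint32 BlendARGB8888Ref(Uint32 s, Uint32 d)
{
	unsigned a = s >> 24;
	Uint32 out = d & 0xff000000;		/* destination alpha is preserved */
	unsigned shift;

	if (a == 0)
		return d;			/* fully transparent source */
	if (a == 255)
		return (s & 0x00ffffff) | out;	/* opaque: copy RGB, keep dst alpha */

	for (shift = 0; shift < 24; shift += 8) {
		unsigned sc = (s >> shift) & 0xff;
		unsigned dc = (d >> shift) & 0xff;
		/* same d + ((s - d) * a >> 8) blend as the assembly paths,
		   relying on well-defined unsigned wraparound */
		dc = (dc + (((sc - dc) * a) >> 8)) & 0xff;
		out |= dc << shift;
	}
	return out;
}
#endif
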
1813/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1814
1815/* blend a single 16 bit pixel at 50% */
1816#define BLEND16_50(d, s, mask) \
1817 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1818
1819/* blend two 16 bit pixels at 50% */
1820#define BLEND2x16_50(d, s, mask) \
1821 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1822 + (s & d & (~(mask | mask << 16))))
1823
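/*
 * Why this works: `mask` is the pixel format with the lowest bit of every
 * channel cleared (0xf7de for 565, 0xfbde for 555).  With those bits clear,
 * the per-channel sums cannot carry into a neighbouring channel, so one add
 * and shift averages all channels at once, and `s & d & ~mask` puts back the
 * 1 that the two dropped low bits contribute when both of them are set.
 * Below is an illustrative per-channel equivalent for RGB565 (hypothetical
 * name, not compiled).
 */
#if 0
static Uint16 Blend565At50Ref(Uint16 s, Uint16 d)
{
	unsigned r = ((s >> 11) + (d >> 11)) >> 1;
	unsigned g = (((s >> 5) & 0x3f) + ((d >> 5) & 0x3f)) >> 1;
	unsigned b = ((s & 0x1f) + (d & 0x1f)) >> 1;
	return (Uint16)((r << 11) | (g << 5) | b);
}
#endif
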
1824static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1825{
1826 int width = info->d_width;
1827 int height = info->d_height;
1828 Uint16 *srcp = (Uint16 *)info->s_pixels;
1829 int srcskip = info->s_skip >> 1;
1830 Uint16 *dstp = (Uint16 *)info->d_pixels;
1831 int dstskip = info->d_skip >> 1;
1832
1833 while(height--) {
1834 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1835 /*
1836 * Source and destination not aligned, pipeline it.
1837 * This is mostly a win for big blits but no loss for
1838 * small ones
1839 */
1840 Uint32 prev_sw;
1841 int w = width;
1842
1843 /* handle odd destination */
1844 if((uintptr_t)dstp & 2) {
1845 Uint16 d = *dstp, s = *srcp;
1846 *dstp = BLEND16_50(d, s, mask);
1847 dstp++;
1848 srcp++;
1849 w--;
1850 }
1851 srcp++; /* srcp is now 32-bit aligned */
1852
1853 /* bootstrap pipeline with first halfword */
1854 prev_sw = ((Uint32 *)srcp)[-1];
1855
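			/* Each aligned source word holds two pixels; the pair that
			   lines up with the aligned destination word straddles two
			   consecutive source words, so every iteration splices the
			   leftover halfword of the previous word (prev_sw) with the
			   first halfword of the current one. */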
1856 while(w > 1) {
1857 Uint32 sw, dw, s;
1858 sw = *(Uint32 *)srcp;
1859 dw = *(Uint32 *)dstp;
1860#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1861 s = (prev_sw << 16) + (sw >> 16);
1862#else
1863 s = (prev_sw >> 16) + (sw << 16);
1864#endif
1865 prev_sw = sw;
1866 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1867 dstp += 2;
1868 srcp += 2;
1869 w -= 2;
1870 }
1871
1872 /* final pixel if any */
1873 if(w) {
1874 Uint16 d = *dstp, s;
1875#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1876 s = (Uint16)prev_sw;
1877#else
1878 s = (Uint16)(prev_sw >> 16);
1879#endif
1880 *dstp = BLEND16_50(d, s, mask);
1881 srcp++;
1882 dstp++;
1883 }
1884 srcp += srcskip - 1;
1885 dstp += dstskip;
1886 } else {
1887 /* source and destination are aligned */
1888 int w = width;
1889
1890 /* first odd pixel? */
1891 if((uintptr_t)srcp & 2) {
1892 Uint16 d = *dstp, s = *srcp;
1893 *dstp = BLEND16_50(d, s, mask);
1894 srcp++;
1895 dstp++;
1896 w--;
1897 }
1898 /* srcp and dstp are now 32-bit aligned */
1899
1900 while(w > 1) {
1901 Uint32 sw = *(Uint32 *)srcp;
1902 Uint32 dw = *(Uint32 *)dstp;
1903 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1904 srcp += 2;
1905 dstp += 2;
1906 w -= 2;
1907 }
1908
1909 /* last odd pixel? */
1910 if(w) {
1911 Uint16 d = *dstp, s = *srcp;
1912 *dstp = BLEND16_50(d, s, mask);
1913 srcp++;
1914 dstp++;
1915 }
1916 srcp += srcskip;
1917 dstp += dstskip;
1918 }
1919 }
1920}
1921
1922#if GCC_ASMBLIT
1923/* fast RGB565->RGB565 blending with surface alpha */
1924static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1925{
1926 unsigned alpha = info->src->alpha;
1927 if(alpha == 128) {
1928 Blit16to16SurfaceAlpha128(info, 0xf7de);
1929 } else {
1930 int width = info->d_width;
1931 int height = info->d_height;
1932 Uint16 *srcp = (Uint16 *)info->s_pixels;
1933 int srcskip = info->s_skip >> 1;
1934 Uint16 *dstp = (Uint16 *)info->d_pixels;
1935 int dstskip = info->d_skip >> 1;
1936 Uint32 s, d;
1937 Uint64 load;
1938
1939 alpha &= ~(1+2+4); /* clear the low 3 bits so the result matches the scalar (alpha >> 3) blend exactly */
1940 load = alpha;
1941 alpha >>= 3; /* downscale alpha to 5 bits */
1942
1943 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1944 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1945 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1946 /* position alpha to allow for mullo and mulhi on diff channels
1947 to reduce the number of operations */
1948 psllq_i2r(3, mm0);
1949
1950 /* Setup the 565 color channel masks */
1951 load = 0x07E007E007E007E0ULL;
1952 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1953 load = 0x001F001F001F001FULL;
1954 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1955 while(height--) {
1956 DUFFS_LOOP_QUATRO2(
1957 {
1958 s = *srcp++;
1959 d = *dstp;
1960 /*
1961 * shift out the middle component (green) to
1962 * the high 16 bits, and process all three RGB
1963 * components at the same time.
1964 */
1965 s = (s | s << 16) & 0x07e0f81f;
1966 d = (d | d << 16) & 0x07e0f81f;
1967 d += (s - d) * alpha >> 5;
1968 d &= 0x07e0f81f;
1969 *dstp++ = d | d >> 16;
1970 },{
1971 s = *srcp++;
1972 d = *dstp;
1973 /*
1974 * shift out the middle component (green) to
1975 * the high 16 bits, and process all three RGB
1976 * components at the same time.
1977 */
1978 s = (s | s << 16) & 0x07e0f81f;
1979 d = (d | d << 16) & 0x07e0f81f;
1980 d += (s - d) * alpha >> 5;
1981 d &= 0x07e0f81f;
1982 *dstp++ = d | d >> 16;
1983 s = *srcp++;
1984 d = *dstp;
1985 /*
1986 * shift out the middle component (green) to
1987 * the high 16 bits, and process all three RGB
1988 * components at the same time.
1989 */
1990 s = (s | s << 16) & 0x07e0f81f;
1991 d = (d | d << 16) & 0x07e0f81f;
1992 d += (s - d) * alpha >> 5;
1993 d &= 0x07e0f81f;
1994 *dstp++ = d | d >> 16;
1995 },{
1996 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1997 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1998
1999 /* red -- does not need a mask since the right shift clears
2000 the uninteresting bits */
2001 movq_r2r(mm2, mm5); /* src -> mm5 */
2002 movq_r2r(mm3, mm6); /* dst -> mm6 */
2003 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2004 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2005
2006 /* blend */
2007 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2008 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2009 /* alpha used is actually 11 bits
2010 11 + 5 = 16 bits, so the sign bits are lost */
2011 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2012 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2013 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2014
2015 movq_r2r(mm6, mm1); /* save new reds in dsts */
2016
2017 /* green -- process the bits in place */
2018 movq_r2r(mm2, mm5); /* src -> mm5 */
2019 movq_r2r(mm3, mm6); /* dst -> mm6 */
2020 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2021 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2022
2023 /* blend */
2024 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2025 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2026 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2027 bits are gone and the sign bits present */
2028 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2029 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2030
2031 por_r2r(mm6, mm1); /* save new greens in dsts */
2032
2033 /* blue */
2034 movq_r2r(mm2, mm5); /* src -> mm5 */
2035 movq_r2r(mm3, mm6); /* dst -> mm6 */
2036 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2037 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2038
2039 /* blend */
2040 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2041 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2042 /* 11 + 5 = 16 bits, so the sign bits are lost and
2043 the interesting bits will need to be MASKed */
2044 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2045 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2046 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2047
2048 por_r2r(mm6, mm1); /* save new blues in dsts */
2049
2050 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2051
2052 srcp += 4;
2053 dstp += 4;
2054 }, width);
2055 srcp += srcskip;
2056 dstp += dstskip;
2057 }
2058 emms();
2059 }
2060}
2061
2062/* fast RGB555->RGB555 blending with surface alpha */
2063static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2064{
2065 unsigned alpha = info->src->alpha;
2066 if(alpha == 128) {
2067 Blit16to16SurfaceAlpha128(info, 0xfbde);
2068 } else {
2069 int width = info->d_width;
2070 int height = info->d_height;
2071 Uint16 *srcp = (Uint16 *)info->s_pixels;
2072 int srcskip = info->s_skip >> 1;
2073 Uint16 *dstp = (Uint16 *)info->d_pixels;
2074 int dstskip = info->d_skip >> 1;
2075 Uint32 s, d;
2076 Uint64 load;
2077
2078 alpha &= ~(1+2+4); /* clear the low 3 bits so the result matches the scalar (alpha >> 3) blend exactly */
2079 load = alpha;
2080 alpha >>= 3; /* downscale alpha to 5 bits */
2081
2082 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2083 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2084 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2085 /* position alpha to allow for mullo and mulhi on diff channels
2086 to reduce the number of operations */
2087 psllq_i2r(3, mm0);
2088
2089 /* Setup the 555 color channel masks */
2090 load = 0x03E003E003E003E0ULL;
2091 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2092 load = 0x001F001F001F001FULL;
2093 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2094 while(height--) {
2095 DUFFS_LOOP_QUATRO2(
2096 {
2097 s = *srcp++;
2098 d = *dstp;
2099 /*
2100 * shift out the middle component (green) to
2101 * the high 16 bits, and process all three RGB
2102 * components at the same time.
2103 */
2104 s = (s | s << 16) & 0x03e07c1f;
2105 d = (d | d << 16) & 0x03e07c1f;
2106 d += (s - d) * alpha >> 5;
2107 d &= 0x03e07c1f;
2108 *dstp++ = d | d >> 16;
2109 },{
2110 s = *srcp++;
2111 d = *dstp;
2112 /*
2113 * shift out the middle component (green) to
2114 * the high 16 bits, and process all three RGB
2115 * components at the same time.
2116 */
2117 s = (s | s << 16) & 0x03e07c1f;
2118 d = (d | d << 16) & 0x03e07c1f;
2119 d += (s - d) * alpha >> 5;
2120 d &= 0x03e07c1f;
2121 *dstp++ = d | d >> 16;
2122 s = *srcp++;
2123 d = *dstp;
2124 /*
2125 * shift out the middle component (green) to
2126 * the high 16 bits, and process all three RGB
2127 * components at the same time.
2128 */
2129 s = (s | s << 16) & 0x03e07c1f;
2130 d = (d | d << 16) & 0x03e07c1f;
2131 d += (s - d) * alpha >> 5;
2132 d &= 0x03e07c1f;
2133 *dstp++ = d | d >> 16;
2134 },{
2135 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2136 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2137
2138 /* red -- process the bits in place */
2139 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2140 /* by reusing the GREEN mask we free up another mmx
2141 register to accumulate the result */
2142
2143 movq_r2r(mm2, mm5); /* src -> mm5 */
2144 movq_r2r(mm3, mm6); /* dst -> mm6 */
2145 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2146 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2147
2148 /* blend */
2149 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2150 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2151 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2152 cleared by a MASK below */
2153 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2154 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2155 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2156
2157 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2158
2159 movq_r2r(mm6, mm1); /* save new reds in dsts */
2160
2161 /* green -- process the bits in place */
2162 movq_r2r(mm2, mm5); /* src -> mm5 */
2163 movq_r2r(mm3, mm6); /* dst -> mm6 */
2164 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2165 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2166
2167 /* blend */
2168 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2169 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2170 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2171 bits are gone and the sign bits present */
2172 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2173 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2174
2175 por_r2r(mm6, mm1); /* save new greens in dsts */
2176
2177 /* blue */
2178 movq_r2r(mm2, mm5); /* src -> mm5 */
2179 movq_r2r(mm3, mm6); /* dst -> mm6 */
2180 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2181 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2182
2183 /* blend */
2184 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2185 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2186 /* 11 + 5 = 16 bits, so the sign bits are lost and
2187 the interesting bits will need to be MASKed */
2188 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2189 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2190 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2191
2192 por_r2r(mm6, mm1); /* save new blues in dsts */
2193
2194 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2195
2196 srcp += 4;
2197 dstp += 4;
2198 }, width);
2199 srcp += srcskip;
2200 dstp += dstskip;
2201 }
2202 emms();
2203 }
2204}
2205/* End GCC_ASMBLIT */
2206
2207#elif MSVC_ASMBLIT
2208/* fast RGB565->RGB565 blending with surface alpha */
2209static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2210{
2211 unsigned alpha = info->src->alpha;
2212 if(alpha == 128) {
2213 Blit16to16SurfaceAlpha128(info, 0xf7de);
2214 } else {
2215 int width = info->d_width;
2216 int height = info->d_height;
2217 Uint16 *srcp = (Uint16 *)info->s_pixels;
2218 int srcskip = info->s_skip >> 1;
2219 Uint16 *dstp = (Uint16 *)info->d_pixels;
2220 int dstskip = info->d_skip >> 1;
2221 Uint32 s, d;
2222
2223 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2224
2225 alpha &= ~(1+2+4); /* clear the low 3 bits so the result matches the scalar (alpha >> 3) blend exactly */
2226 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2227 alpha >>= 3; /* downscale alpha to 5 bits */
2228
2229 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2230 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2231 /* position alpha to allow for mullo and mulhi on diff channels
2232 to reduce the number of operations */
2233 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2234
2235 /* Setup the 565 color channel masks */
2236 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2237 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2238
2239 while(height--) {
2240 DUFFS_LOOP_QUATRO2(
2241 {
2242 s = *srcp++;
2243 d = *dstp;
2244 /*
2245 * shift out the middle component (green) to
2246 * the high 16 bits, and process all three RGB
2247 * components at the same time.
2248 */
2249 s = (s | s << 16) & 0x07e0f81f;
2250 d = (d | d << 16) & 0x07e0f81f;
2251 d += (s - d) * alpha >> 5;
2252 d &= 0x07e0f81f;
2253 *dstp++ = (Uint16)(d | d >> 16);
2254 },{
2255 s = *srcp++;
2256 d = *dstp;
2257 /*
2258 * shift out the middle component (green) to
2259 * the high 16 bits, and process all three RGB
2260 * components at the same time.
2261 */
2262 s = (s | s << 16) & 0x07e0f81f;
2263 d = (d | d << 16) & 0x07e0f81f;
2264 d += (s - d) * alpha >> 5;
2265 d &= 0x07e0f81f;
2266 *dstp++ = (Uint16)(d | d >> 16);
2267 s = *srcp++;
2268 d = *dstp;
2269 /*
2270 * shift out the middle component (green) to
2271 * the high 16 bits, and process all three RGB
2272 * components at the same time.
2273 */
2274 s = (s | s << 16) & 0x07e0f81f;
2275 d = (d | d << 16) & 0x07e0f81f;
2276 d += (s - d) * alpha >> 5;
2277 d &= 0x07e0f81f;
2278 *dstp++ = (Uint16)(d | d >> 16);
2279 },{
2280 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2281 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2282
2283 /* red */
2284 src2 = src1;
2285 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2286
2287 dst2 = dst1;
2288 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2289
2290 /* blend */
2291 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2292 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2293 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2294 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2295 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2296
2297 mm_res = dst2; /* RED -> mm_res */
2298
2299 /* green -- process the bits in place */
2300 src2 = src1;
2301 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2302
2303 dst2 = dst1;
2304 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2305
2306 /* blend */
2307 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2308 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2309 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2310 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2311
2312 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2313
2314 /* blue */
2315 src2 = src1;
2316 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2317
2318 dst2 = dst1;
2319 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2320
2321 /* blend */
2322 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2323 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2324 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2325 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2326 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2327
2328 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2329
2330 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2331
2332 srcp += 4;
2333 dstp += 4;
2334 }, width);
2335 srcp += srcskip;
2336 dstp += dstskip;
2337 }
2338 _mm_empty();
2339 }
2340}
2341
2342/* fast RGB555->RGB555 blending with surface alpha */
2343static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2344{
2345 unsigned alpha = info->src->alpha;
2346 if(alpha == 128) {
2347 Blit16to16SurfaceAlpha128(info, 0xfbde);
2348 } else {
2349 int width = info->d_width;
2350 int height = info->d_height;
2351 Uint16 *srcp = (Uint16 *)info->s_pixels;
2352 int srcskip = info->s_skip >> 1;
2353 Uint16 *dstp = (Uint16 *)info->d_pixels;
2354 int dstskip = info->d_skip >> 1;
2355 Uint32 s, d;
2356
2357 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2358
2359 alpha &= ~(1+2+4); /* clear the low 3 bits so the result matches the scalar (alpha >> 3) blend exactly */
2360 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2361 alpha >>= 3; /* downscale alpha to 5 bits */
2362
2363 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2364 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2365 /* position alpha to allow for mullo and mulhi on diff channels
2366 to reduce the number of operations */
2367 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2368
2369 /* Setup the 555 color channel masks */
2370 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2371 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2372 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2373
2374 while(height--) {
2375 DUFFS_LOOP_QUATRO2(
2376 {
2377 s = *srcp++;
2378 d = *dstp;
2379 /*
2380 * shift out the middle component (green) to
2381 * the high 16 bits, and process all three RGB
2382 * components at the same time.
2383 */
2384 s = (s | s << 16) & 0x03e07c1f;
2385 d = (d | d << 16) & 0x03e07c1f;
2386 d += (s - d) * alpha >> 5;
2387 d &= 0x03e07c1f;
2388 *dstp++ = (Uint16)(d | d >> 16);
2389 },{
2390 s = *srcp++;
2391 d = *dstp;
2392 /*
2393 * shift out the middle component (green) to
2394 * the high 16 bits, and process all three RGB
2395 * components at the same time.
2396 */
2397 s = (s | s << 16) & 0x03e07c1f;
2398 d = (d | d << 16) & 0x03e07c1f;
2399 d += (s - d) * alpha >> 5;
2400 d &= 0x03e07c1f;
2401 *dstp++ = (Uint16)(d | d >> 16);
2402 s = *srcp++;
2403 d = *dstp;
2404 /*
2405 * shift out the middle component (green) to
2406 * the high 16 bits, and process all three RGB
2407 * components at the same time.
2408 */
2409 s = (s | s << 16) & 0x03e07c1f;
2410 d = (d | d << 16) & 0x03e07c1f;
2411 d += (s - d) * alpha >> 5;
2412 d &= 0x03e07c1f;
2413 *dstp++ = (Uint16)(d | d >> 16);
2414 },{
2415 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2416 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2417
2418 /* red -- process the bits in place */
2419 src2 = src1;
2420 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2421
2422 dst2 = dst1;
2423 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2424
2425 /* blend */
2426 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2427 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2428 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2429 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2430 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2431
2432 mm_res = dst2; /* RED -> mm_res */
2433
2434 /* green -- process the bits in place */
2435 src2 = src1;
2436 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2437
2438 dst2 = dst1;
2439 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2440
2441 /* blend */
2442 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2443 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2444 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2445 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2446
2447 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2448
2449 /* blue */
2450 src2 = src1; /* src -> src2 */
2451 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2452
2453 dst2 = dst1; /* dst -> dst2 */
2454 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2455
2456 /* blend */
2457 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2458 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2459 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2460 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2461 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2462
2463 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2464
2465 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2466
2467 srcp += 4;
2468 dstp += 4;
2469 }, width);
2470 srcp += srcskip;
2471 dstp += dstskip;
2472 }
2473 _mm_empty();
2474 }
2475}
2476#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2477
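/*
 * Note on the MMX surface-alpha routines above: the 8-bit alpha (with its low
 * three bits cleared) is shifted left by 3 before the multiplies, so that
 * pmullw + psrlw(11) on the low-aligned channels and pmulhw + psllw(5) on the
 * in-place channels scale a channel delta by the same factor as the scalar
 * (delta * (alpha >> 3)) >> 5 fallback.  The sketch below spells that identity
 * out for non-negative deltas (hypothetical name, not compiled; for negative
 * deltas the assembly additionally relies on 16-bit wraparound plus masking,
 * as its comments note).
 */
#if 0
static int Mmx16AlphaScalingMatchesScalar(void)
{
	unsigned alpha, delta;

	for (alpha = 0; alpha < 256; alpha += 8) {	/* low 3 bits already cleared */
		unsigned alpha11 = alpha << 3;		/* what psllq_i2r(3, mm0) sets up */
		for (delta = 0; delta < 32; delta++) {
			unsigned via_pmullw = (delta * alpha11) >> 11;		/* pmullw, then psrlw 11 */
			unsigned via_pmulhw = ((delta << 5) * alpha11) >> 16;	/* pmulhw on a field stored in place at bit 5 (565 green) */
			unsigned scalar     = (delta * (alpha >> 3)) >> 5;	/* the C fallback used in the loops above */
			if (via_pmullw != scalar || via_pmulhw != scalar)
				return 0;
		}
	}
	return 1;
}
#endif
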
2478/* fast RGB565->RGB565 blending with surface alpha */
2479static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2480{
2481 unsigned alpha = info->src->alpha;
2482 if(alpha == 128) {
2483 Blit16to16SurfaceAlpha128(info, 0xf7de);
2484 } else {
2485 int width = info->d_width;
2486 int height = info->d_height;
2487 Uint16 *srcp = (Uint16 *)info->s_pixels;
2488 int srcskip = info->s_skip >> 1;
2489 Uint16 *dstp = (Uint16 *)info->d_pixels;
2490 int dstskip = info->d_skip >> 1;
2491 alpha >>= 3; /* downscale alpha to 5 bits */
2492
2493 while(height--) {
2494 DUFFS_LOOP4({
2495 Uint32 s = *srcp++;
2496 Uint32 d = *dstp;
2497 /*
2498 * shift out the middle component (green) to
2499 * the high 16 bits, and process all three RGB
2500 * components at the same time.
2501 */
2502 s = (s | s << 16) & 0x07e0f81f;
2503 d = (d | d << 16) & 0x07e0f81f;
2504 d += (s - d) * alpha >> 5;
2505 d &= 0x07e0f81f;
2506 *dstp++ = (Uint16)(d | d >> 16);
2507 }, width);
2508 srcp += srcskip;
2509 dstp += dstskip;
2510 }
2511 }
2512}
2513
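/*
 * The packed trick used by the 16-bit blitters above, written out for a single
 * RGB565 pixel pair (hypothetical name, not compiled).  Interleaving green into
 * the upper half-word leaves at least five spare bits above every channel, so
 * the per-channel products with the 5-bit alpha never spill into a neighbouring
 * channel and one multiply blends all three components.
 */
#if 0
static Uint16 Blend565PackedRef(Uint16 sp, Uint16 dp, unsigned alpha5)
{
	Uint32 s = sp, d = dp;

	s = (s | s << 16) & 0x07e0f81f;		/* 00000GGGGGG00000RRRRR000000BBBBB */
	d = (d | d << 16) & 0x07e0f81f;
	d += (s - d) * alpha5 >> 5;		/* one multiply covers R, G and B */
	d &= 0x07e0f81f;			/* drop the spare bits between fields */
	return (Uint16)(d | d >> 16);		/* fold green back into bits 5-10 */
}
#endif
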
2514/* fast RGB555->RGB555 blending with surface alpha */
2515static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2516{
2517 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2518 if(alpha == 128) {
2519 Blit16to16SurfaceAlpha128(info, 0xfbde);
2520 } else {
2521 int width = info->d_width;
2522 int height = info->d_height;
2523 Uint16 *srcp = (Uint16 *)info->s_pixels;
2524 int srcskip = info->s_skip >> 1;
2525 Uint16 *dstp = (Uint16 *)info->d_pixels;
2526 int dstskip = info->d_skip >> 1;
2527 alpha >>= 3; /* downscale alpha to 5 bits */
2528
2529 while(height--) {
2530 DUFFS_LOOP4({
2531 Uint32 s = *srcp++;
2532 Uint32 d = *dstp;
2533 /*
2534 * shift out the middle component (green) to
2535 * the high 16 bits, and process all three RGB
2536 * components at the same time.
2537 */
2538 s = (s | s << 16) & 0x03e07c1f;
2539 d = (d | d << 16) & 0x03e07c1f;
2540 d += (s - d) * alpha >> 5;
2541 d &= 0x03e07c1f;
2542 *dstp++ = (Uint16)(d | d >> 16);
2543 }, width);
2544 srcp += srcskip;
2545 dstp += dstskip;
2546 }
2547 }
2548}
2549
2550/* fast ARGB8888->RGB565 blending with pixel alpha */
2551static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2552{
2553 int width = info->d_width;
2554 int height = info->d_height;
2555 Uint32 *srcp = (Uint32 *)info->s_pixels;
2556 int srcskip = info->s_skip >> 2;
2557 Uint16 *dstp = (Uint16 *)info->d_pixels;
2558 int dstskip = info->d_skip >> 1;
2559
2560 while(height--) {
2561 DUFFS_LOOP4({
2562 Uint32 s = *srcp;
2563 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2564 /* FIXME: Here we special-case opaque alpha since the
2565 compositing used (>>8 instead of /255) doesn't handle
2566 it correctly. Also special-case alpha=0 for speed?
2567 Benchmark this! */
2568 if(alpha) {
2569 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2570 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2571 } else {
2572 Uint32 d = *dstp;
2573 /*
2574 * convert source and destination to the packed layout 0x07e0f81f (green in bits 21-26, red in 11-15, blue in 0-4)
2575 * and blend all components at the same time
2576 */
2577 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2578 + (s >> 3 & 0x1f);
2579 d = (d | d << 16) & 0x07e0f81f;
2580 d += (s - d) * alpha >> 5;
2581 d &= 0x07e0f81f;
2582 *dstp = (Uint16)(d | d >> 16);
2583 }
2584 }
2585 srcp++;
2586 dstp++;
2587 }, width);
2588 srcp += srcskip;
2589 dstp += dstskip;
2590 }
2591}
2592
2593/* fast ARGB8888->RGB555 blending with pixel alpha */
2594static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2595{
2596 int width = info->d_width;
2597 int height = info->d_height;
2598 Uint32 *srcp = (Uint32 *)info->s_pixels;
2599 int srcskip = info->s_skip >> 2;
2600 Uint16 *dstp = (Uint16 *)info->d_pixels;
2601 int dstskip = info->d_skip >> 1;
2602
2603 while(height--) {
2604 DUFFS_LOOP4({
2605 unsigned alpha;
2606 Uint32 s = *srcp;
2607 alpha = s >> 27; /* downscale alpha to 5 bits */
2608 /* FIXME: Here we special-case opaque alpha since the
2609 compositing used (>>8 instead of /255) doesn't handle
2610 it correctly. Also special-case alpha=0 for speed?
2611 Benchmark this! */
2612 if(alpha) {
2613 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2614 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2615 } else {
2616 Uint32 d = *dstp;
2617 /*
2618 * convert source and destination to the packed layout 0x03e07c1f (green in bits 21-25, red in 10-14, blue in 0-4)
2619 * and blend all components at the same time
2620 */
2621 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2622 + (s >> 3 & 0x1f);
2623 d = (d | d << 16) & 0x03e07c1f;
2624 d += (s - d) * alpha >> 5;
2625 d &= 0x03e07c1f;
2626 *dstp = (Uint16)(d | d >> 16);
2627 }
2628 }
2629 srcp++;
2630 dstp++;
2631 }, width);
2632 srcp += srcskip;
2633 dstp += dstskip;
2634 }
2635}
2636
2637/* General (slow) N->N blending with per-surface alpha */
2638static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2639{
2640 int width = info->d_width;
2641 int height = info->d_height;
2642 Uint8 *src = info->s_pixels;
2643 int srcskip = info->s_skip;
2644 Uint8 *dst = info->d_pixels;
2645 int dstskip = info->d_skip;
2646 SDL_PixelFormat *srcfmt = info->src;
2647 SDL_PixelFormat *dstfmt = info->dst;
2648 int srcbpp = srcfmt->BytesPerPixel;
2649 int dstbpp = dstfmt->BytesPerPixel;
2650 unsigned sA = srcfmt->alpha;
2651 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2652
2653 if(sA) {
2654 while ( height-- ) {
2655 DUFFS_LOOP4(
2656 {
2657 Uint32 Pixel;
2658 unsigned sR;
2659 unsigned sG;
2660 unsigned sB;
2661 unsigned dR;
2662 unsigned dG;
2663 unsigned dB;
2664 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2665 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2666 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2667 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2668 src += srcbpp;
2669 dst += dstbpp;
2670 },
2671 width);
2672 src += srcskip;
2673 dst += dstskip;
2674 }
2675 }
2676}
2677
2678/* General (slow) colorkeyed N->N blending with per-surface alpha */
2679static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2680{
2681 int width = info->d_width;
2682 int height = info->d_height;
2683 Uint8 *src = info->s_pixels;
2684 int srcskip = info->s_skip;
2685 Uint8 *dst = info->d_pixels;
2686 int dstskip = info->d_skip;
2687 SDL_PixelFormat *srcfmt = info->src;
2688 SDL_PixelFormat *dstfmt = info->dst;
2689 Uint32 ckey = srcfmt->colorkey;
2690 int srcbpp = srcfmt->BytesPerPixel;
2691 int dstbpp = dstfmt->BytesPerPixel;
2692 unsigned sA = srcfmt->alpha;
2693 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2694
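	/* Fast path: both surfaces are RGB565, so reuse the packed 0x07e0f81f
	   blend and simply skip pixels that match the colorkey. */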
211e4bff 2695 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2696 Uint16 *src16 = (Uint16 *)src;
2697 Uint16 *dst16 = (Uint16 *)dst;
2698 sA >>= 3; /* downscale alpha to 5 bits */
2699 while ( height-- ) {
2700 DUFFS_LOOP4(
2701 {
2702 Uint32 s;
2703 Uint32 d;
2704 s = *src16;
2705 if(sA && s != ckey) {
2706 d = *dst16;
2707 s = (s | s << 16) & 0x07e0f81f;
2708 d = (d | d << 16) & 0x07e0f81f;
2709 d += (s - d) * sA >> 5;
2710 d &= 0x07e0f81f;
2711 *dst16 = (Uint16)(d | d >> 16);
2712 }
2713 src16++;
2714 dst16++;
2715 },
2716 width);
2717 src16 += srcskip / 2;
2718 dst16 += dstskip / 2;
2719 }
2720 return;
2721 }
2722
e14743d1 2723 while ( height-- ) {
2724 DUFFS_LOOP4(
2725 {
2726 Uint32 Pixel;
2727 unsigned sR;
2728 unsigned sG;
2729 unsigned sB;
2730 unsigned dR;
2731 unsigned dG;
2732 unsigned dB;
2733 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2734 if(sA && Pixel != ckey) {
2735 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2736 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2737 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2738 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2739 }
2740 src += srcbpp;
2741 dst += dstbpp;
2742 },
2743 width);
2744 src += srcskip;
2745 dst += dstskip;
2746 }
2747}
2748
2749/* General (slow) N->N blending with pixel alpha */
2750static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2751{
2752 int width = info->d_width;
2753 int height = info->d_height;
2754 Uint8 *src = info->s_pixels;
2755 int srcskip = info->s_skip;
2756 Uint8 *dst = info->d_pixels;
2757 int dstskip = info->d_skip;
2758 SDL_PixelFormat *srcfmt = info->src;
2759 SDL_PixelFormat *dstfmt = info->dst;
2760
2761 int srcbpp;
2762 int dstbpp;
2763
2764 /* Set up some basic variables */
2765 srcbpp = srcfmt->BytesPerPixel;
2766 dstbpp = dstfmt->BytesPerPixel;
2767
2768 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2769 quite right. for <8bpp source alpha, it gets them very wrong
2770 (check all macros!)
2771 It is unclear whether there is a good general solution that doesn't
2772 need a branch (or a divide). */
2773 while ( height-- ) {
2774 DUFFS_LOOP4(
2775 {
2776 Uint32 Pixel;
2777 unsigned sR;
2778 unsigned sG;
2779 unsigned sB;
2780 unsigned dR;
2781 unsigned dG;
2782 unsigned dB;
2783 unsigned sA;
2784 unsigned dA;
2785 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2786 if(sA) {
2787 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2788 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2789 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2790 }
2791 src += srcbpp;
2792 dst += dstbpp;
2793 },
2794 width);
2795 src += srcskip;
2796 dst += dstskip;
2797 }
2798}
2799
2800
2801SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2802{
2803 SDL_PixelFormat *sf = surface->format;
2804 SDL_PixelFormat *df = surface->map->dst->format;
2805
2806 if(sf->Amask == 0) {
2807 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2808 if(df->BytesPerPixel == 1)
2809 return BlitNto1SurfaceAlphaKey;
2810 else
2811#if SDL_ALTIVEC_BLITTERS
2812 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2813 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2814 return Blit32to32SurfaceAlphaKeyAltivec;
2815 else
2816#endif
2817 return BlitNtoNSurfaceAlphaKey;
2818 } else {
2819 /* Per-surface alpha blits */
2820 switch(df->BytesPerPixel) {
2821 case 1:
2822 return BlitNto1SurfaceAlpha;
2823
2824 case 2:
2825 if(surface->map->identity) {
2826 if(df->Gmask == 0x7e0)
2827 {
2828#if MMX_ASMBLIT
2829 if(SDL_HasMMX())
2830 return Blit565to565SurfaceAlphaMMX;
2831 else
2832#endif
2833 return Blit565to565SurfaceAlpha;
2834 }
2835 else if(df->Gmask == 0x3e0)
2836 {
2837#if MMX_ASMBLIT
2838 if(SDL_HasMMX())
2839 return Blit555to555SurfaceAlphaMMX;
2840 else
2841#endif
2842 return Blit555to555SurfaceAlpha;
2843 }
2844 }
2845 return BlitNtoNSurfaceAlpha;
2846
2847 case 4:
2848 if(sf->Rmask == df->Rmask
2849 && sf->Gmask == df->Gmask
2850 && sf->Bmask == df->Bmask
2851 && sf->BytesPerPixel == 4)
2852 {
2853#if MMX_ASMBLIT
2854 if(sf->Rshift % 8 == 0
2855 && sf->Gshift % 8 == 0
2856 && sf->Bshift % 8 == 0
2857 && SDL_HasMMX())
2858 return BlitRGBtoRGBSurfaceAlphaMMX;
bdfa6989 2859#endif
2860#ifdef __ARM_NEON__
2861 if(sf->Rshift % 8 == 0
2862 && sf->Gshift % 8 == 0
2863 && sf->Bshift % 8 == 0)
c85a5291 2864 {
bdfa6989 2865 return BlitARGBtoXRGBalphaS_neon;
c85a5291 2866 }
e14743d1 2867#endif
2868 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2869 {
2870#if SDL_ALTIVEC_BLITTERS
2871 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2872 && SDL_HasAltiVec())
2873 return BlitRGBtoRGBSurfaceAlphaAltivec;
2874#endif
2875 return BlitRGBtoRGBSurfaceAlpha;
2876 }
2877 }
c85a5291 2878#ifdef __ARM_NEON__
2879 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2880 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0)
2881 {
2882 return BlitABGRtoXRGBalphaS_neon;
2883 }
2884#endif
e14743d1 2885#if SDL_ALTIVEC_BLITTERS
2886 if((sf->BytesPerPixel == 4) &&
2887 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2888 return Blit32to32SurfaceAlphaAltivec;
2889 else
2890#endif
2891 return BlitNtoNSurfaceAlpha;
2892
2893 case 3:
2894 default:
2895 return BlitNtoNSurfaceAlpha;
2896 }
2897 }
2898 } else {
2899 /* Per-pixel alpha blits */
2900 switch(df->BytesPerPixel) {
2901 case 1:
2902 return BlitNto1PixelAlpha;
2903
2904 case 2:
2905#if SDL_ALTIVEC_BLITTERS
2906 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2907 df->Gmask == 0x7e0 &&
2908 df->Bmask == 0x1f && SDL_HasAltiVec())
2909 return Blit32to565PixelAlphaAltivec;
2910 else
2c4e54dd 2911#endif
2912#ifdef __ARM_NEON__
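			/* 32-bit source with 8-bit alpha onto RGB565: pick the NEON
			   blitter whose channel order matches the source (ARGB vs ABGR). */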
2913 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2914 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0) {
2915 if((sf->Bmask >> 3) == df->Bmask || (sf->Rmask >> 3) == df->Rmask)
2916 return BlitARGBtoRGB565alpha_neon;
2917 else
2918 return BlitABGRtoRGB565alpha_neon;
2919 }
2920 else
e14743d1 2921#endif
2922 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2923 && sf->Gmask == 0xff00
2924 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2925 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2926 if(df->Gmask == 0x7e0)
2927 return BlitARGBto565PixelAlpha;
2928 else if(df->Gmask == 0x3e0)
2929 return BlitARGBto555PixelAlpha;
2930 }
2931 return BlitNtoNPixelAlpha;
2932
2933 case 4:
2934 if(sf->Rmask == df->Rmask
2935 && sf->Gmask == df->Gmask
2936 && sf->Bmask == df->Bmask
2937 && sf->BytesPerPixel == 4)
2938 {
2939#if MMX_ASMBLIT
2940 if(sf->Rshift % 8 == 0
2941 && sf->Gshift % 8 == 0
2942 && sf->Bshift % 8 == 0
2943 && sf->Ashift % 8 == 0
2944 && sf->Aloss == 0)
2945 {
2946 if(SDL_Has3DNow())
2947 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2948 if(SDL_HasMMX())
2949 return BlitRGBtoRGBPixelAlphaMMX;
2950 }
c85a5291 2951#endif
2952#ifdef __ARM_NEON__
2953 if(sf->Rshift % 8 == 0
2954 && sf->Gshift % 8 == 0
2955 && sf->Bshift % 8 == 0
2956 && sf->Ashift % 8 == 0)
2957 {
2958 return BlitARGBtoXRGBalpha_neon;
2959 }
e14743d1 2960#endif
2961 if(sf->Amask == 0xff000000)
2962 {
2963#if SDL_ALTIVEC_BLITTERS
2964 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2965 && SDL_HasAltiVec())
2966 return BlitRGBtoRGBPixelAlphaAltivec;
2967#endif
2968 return BlitRGBtoRGBPixelAlpha;
2969 }
2970 }
a1f34081 2971#ifdef __ARM_NEON__
c85a5291 2972 if (sf->Gmask == df->Gmask && sf->Rmask == df->Bmask && sf->Bmask == df->Rmask
2973 && sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0
2974 && sf->Amask == 0xff000000)
a1f34081 2975 {
2976 return BlitABGRtoXRGBalpha_neon;
2977 }
2978#endif
e14743d1 2979#if SDL_ALTIVEC_BLITTERS
2980 if (sf->Amask && sf->BytesPerPixel == 4 &&
2981 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2982 return Blit32to32PixelAlphaAltivec;
2983 else
2984#endif
2985 return BlitNtoNPixelAlpha;
2986
2987 case 3:
2988 default:
2989 return BlitNtoNPixelAlpha;
2990 }
2991 }
2992}
2993
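/*
 * For orientation only: a sketch (hypothetical, not compiled) of the kind of
 * SDL 1.2 application code that ends up in the blitters chosen above.  With a
 * 565 display surface and a 32-bit source carrying an alpha channel, the
 * per-pixel ARGB->565 path is selected; error handling is omitted.
 */
#if 0
#include "SDL.h"

static void example(void)
{
	SDL_Surface *screen, *sprite;

	SDL_Init(SDL_INIT_VIDEO);
	screen = SDL_SetVideoMode(640, 480, 16, SDL_SWSURFACE);
	sprite = SDL_CreateRGBSurface(SDL_SWSURFACE | SDL_SRCALPHA, 64, 64, 32,
	                              0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000);
	/* ... draw into sprite ... */
	SDL_BlitSurface(sprite, NULL, screen, NULL);	/* runs the blitter selected by SDL_CalculateAlphaBlit */
	SDL_Flip(screen);
	SDL_FreeSurface(sprite);
	SDL_Quit();
}
#endif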