blit: improve BlitNtoNSurfaceAlphaKey for 16bpp
src/video/SDL_blit_A.c
1/*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2009 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
27/*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31*/
32
33#if SDL_ASSEMBLY_ROUTINES
34# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35# define MMX_ASMBLIT 1
36# define GCC_ASMBLIT 1
37# elif defined(_MSC_VER) && defined(_M_IX86)
38# if (_MSC_VER <= 1200)
39# include <malloc.h>
40# if defined(_mm_free)
41# define HAVE_MMINTRIN_H 1
42# endif
43# else /* Visual Studio > VC6 always has mmintrin.h */
44# define HAVE_MMINTRIN_H 1
45# endif
46# if HAVE_MMINTRIN_H
47# define MMX_ASMBLIT 1
48# define MSVC_ASMBLIT 1
49# endif
50# endif
51#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
64/* N->1 blending with per-surface alpha */
65static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
66{
67 int width = info->d_width;
68 int height = info->d_height;
69 Uint8 *src = info->s_pixels;
70 int srcskip = info->s_skip;
71 Uint8 *dst = info->d_pixels;
72 int dstskip = info->d_skip;
73 Uint8 *palmap = info->table;
74 SDL_PixelFormat *srcfmt = info->src;
75 SDL_PixelFormat *dstfmt = info->dst;
76 int srcbpp = srcfmt->BytesPerPixel;
77
78 const unsigned A = srcfmt->alpha;
79
80 while ( height-- ) {
81 DUFFS_LOOP4(
82 {
83 Uint32 Pixel;
84 unsigned sR;
85 unsigned sG;
86 unsigned sB;
87 unsigned dR;
88 unsigned dG;
89 unsigned dB;
90 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
91 dR = dstfmt->palette->colors[*dst].r;
92 dG = dstfmt->palette->colors[*dst].g;
93 dB = dstfmt->palette->colors[*dst].b;
94 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
95 dR &= 0xff;
96 dG &= 0xff;
97 dB &= 0xff;
98 /* Pack RGB into 8bit pixel */
99 if ( palmap == NULL ) {
100 *dst =((dR>>5)<<(3+2))|
101 ((dG>>5)<<(2))|
102 ((dB>>6)<<(0));
103 } else {
104 *dst = palmap[((dR>>5)<<(3+2))|
105 ((dG>>5)<<(2)) |
106 ((dB>>6)<<(0))];
107 }
108 dst++;
109 src += srcbpp;
110 },
111 width);
112 src += srcskip;
113 dst += dstskip;
114 }
115}
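
/*
 * The "(dR>>5)<<(3+2)" expression above packs the blended result into a
 * 3-3-2 RGB byte: red in bits 7-5, green in bits 4-2, blue in bits 1-0,
 * optionally remapped through the palette map.  A minimal standalone
 * sketch of that packing (illustrative only, not referenced by the
 * blitters; the helper name is made up):
 */
#if 0
static Uint8 PackRGB332(unsigned r, unsigned g, unsigned b)
{
	/* r, g and b are 8-bit channel values (0-255) */
	return (Uint8)(((r >> 5) << 5) | ((g >> 5) << 2) | (b >> 6));
}
#endif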
116
117/* N->1 blending with pixel alpha */
118static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
119{
120 int width = info->d_width;
121 int height = info->d_height;
122 Uint8 *src = info->s_pixels;
123 int srcskip = info->s_skip;
124 Uint8 *dst = info->d_pixels;
125 int dstskip = info->d_skip;
126 Uint8 *palmap = info->table;
127 SDL_PixelFormat *srcfmt = info->src;
128 SDL_PixelFormat *dstfmt = info->dst;
129 int srcbpp = srcfmt->BytesPerPixel;
130
131 /* FIXME: fix alpha bit field expansion here too? */
132 while ( height-- ) {
133 DUFFS_LOOP4(
134 {
135 Uint32 Pixel;
136 unsigned sR;
137 unsigned sG;
138 unsigned sB;
139 unsigned sA;
140 unsigned dR;
141 unsigned dG;
142 unsigned dB;
143 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
144 dR = dstfmt->palette->colors[*dst].r;
145 dG = dstfmt->palette->colors[*dst].g;
146 dB = dstfmt->palette->colors[*dst].b;
147 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
148 dR &= 0xff;
149 dG &= 0xff;
150 dB &= 0xff;
151 /* Pack RGB into 8bit pixel */
152 if ( palmap == NULL ) {
153 *dst =((dR>>5)<<(3+2))|
154 ((dG>>5)<<(2))|
155 ((dB>>6)<<(0));
156 } else {
157 *dst = palmap[((dR>>5)<<(3+2))|
158 ((dG>>5)<<(2)) |
159 ((dB>>6)<<(0)) ];
160 }
161 dst++;
162 src += srcbpp;
163 },
164 width);
165 src += srcskip;
166 dst += dstskip;
167 }
168}
169
170/* colorkeyed N->1 blending with per-surface alpha */
171static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
172{
173 int width = info->d_width;
174 int height = info->d_height;
175 Uint8 *src = info->s_pixels;
176 int srcskip = info->s_skip;
177 Uint8 *dst = info->d_pixels;
178 int dstskip = info->d_skip;
179 Uint8 *palmap = info->table;
180 SDL_PixelFormat *srcfmt = info->src;
181 SDL_PixelFormat *dstfmt = info->dst;
182 int srcbpp = srcfmt->BytesPerPixel;
183 Uint32 ckey = srcfmt->colorkey;
184
185 const int A = srcfmt->alpha;
186
187 while ( height-- ) {
188 DUFFS_LOOP(
189 {
190 Uint32 Pixel;
191 unsigned sR;
192 unsigned sG;
193 unsigned sB;
194 unsigned dR;
195 unsigned dG;
196 unsigned dB;
197 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
198 if ( Pixel != ckey ) {
199 dR = dstfmt->palette->colors[*dst].r;
200 dG = dstfmt->palette->colors[*dst].g;
201 dB = dstfmt->palette->colors[*dst].b;
202 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
203 dR &= 0xff;
204 dG &= 0xff;
205 dB &= 0xff;
206 /* Pack RGB into 8bit pixel */
207 if ( palmap == NULL ) {
208 *dst =((dR>>5)<<(3+2))|
209 ((dG>>5)<<(2)) |
210 ((dB>>6)<<(0));
211 } else {
212 *dst = palmap[((dR>>5)<<(3+2))|
213 ((dG>>5)<<(2)) |
214 ((dB>>6)<<(0)) ];
215 }
216 }
217 dst++;
218 src += srcbpp;
219 },
220 width);
221 src += srcskip;
222 dst += dstskip;
223 }
224}
225
226#if GCC_ASMBLIT
227/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
228static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
229{
230 int width = info->d_width;
231 int height = info->d_height;
232 Uint32 *srcp = (Uint32 *)info->s_pixels;
233 int srcskip = info->s_skip >> 2;
234 Uint32 *dstp = (Uint32 *)info->d_pixels;
235 int dstskip = info->d_skip >> 2;
236 Uint32 dalpha = info->dst->Amask;
237 Uint64 load;
238
239 load = 0x00fefefe00fefefeULL;/* alpha128 mask */
240 movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
241 load = 0x0001010100010101ULL;/* !alpha128 mask */
242 movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
243 movd_m2r(dalpha, mm7); /* dst alpha mask */
244 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
245 while(height--) {
246 DUFFS_LOOP_DOUBLE2(
247 {
248 Uint32 s = *srcp++;
249 Uint32 d = *dstp;
250 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
251 + (s & d & 0x00010101)) | dalpha;
252 },{
253 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
254 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
255
256 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
257 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
258
259 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
260 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
261 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
262 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
263 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
264 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
265 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
266
267 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
268 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
269 dstp += 2;
270 srcp += 2;
271 }, width);
272 srcp += srcskip;
273 dstp += dstskip;
274 }
275 emms();
276}
277
278/* fast RGB888->(A)RGB888 blending with surface alpha */
279static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
280{
281 SDL_PixelFormat* df = info->dst;
282 unsigned alpha = info->src->alpha;
283
284 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
285 /* only call a128 version when R,G,B occupy lower bits */
286 BlitRGBtoRGBSurfaceAlpha128MMX(info);
287 } else {
288 int width = info->d_width;
289 int height = info->d_height;
290 Uint32 *srcp = (Uint32 *)info->s_pixels;
291 int srcskip = info->s_skip >> 2;
292 Uint32 *dstp = (Uint32 *)info->d_pixels;
293 int dstskip = info->d_skip >> 2;
294
295 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
296 /* form the alpha mult */
297 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
298 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
299 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
300 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
301 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
302 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
303 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
304 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
305 movd_m2r(df->Amask, mm7); /* dst alpha mask */
306 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
307
308 while(height--) {
309 DUFFS_LOOP_DOUBLE2({
310 /* One Pixel Blend */
311 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
312 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
313 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
314 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
315
316 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
317 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
318 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
319 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
320
321 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
322 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
323 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
324 ++srcp;
325 ++dstp;
326 },{
327 /* Two Pixels Blend */
328 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
329 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
330 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
331 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
332
333 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
334 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
335 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
336 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
337
338 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
339 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
340 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
341 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
342
343 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
344 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
345 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
346 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
347
348 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
349 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
350
351 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
352
353 srcp += 2;
354 dstp += 2;
355 }, width);
356 srcp += srcskip;
357 dstp += dstskip;
358 }
359 emms();
360 }
361}
362
363/* fast ARGB888->(A)RGB888 blending with pixel alpha */
364static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
365{
366 int width = info->d_width;
367 int height = info->d_height;
368 Uint32 *srcp = (Uint32 *)info->s_pixels;
369 int srcskip = info->s_skip >> 2;
370 Uint32 *dstp = (Uint32 *)info->d_pixels;
371 int dstskip = info->d_skip >> 2;
372 SDL_PixelFormat* sf = info->src;
373 Uint32 amask = sf->Amask;
374
375 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
376 /* form multiplication mask */
377 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
378 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
379 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
380 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
381 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
382 /* form channel masks */
383 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
384 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
385 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
386 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
387 /* get alpha channel shift */
388 __asm__ __volatile__ (
389 "movd %0, %%mm5"
390 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
391
392 while(height--) {
393 DUFFS_LOOP4({
394 Uint32 alpha = *srcp & amask;
395 /* FIXME: Here we special-case opaque alpha since the
396 compositing used (>>8 instead of /255) doesn't handle
397 it correctly. Also special-case alpha=0 for speed?
398 Benchmark this! */
399 if(alpha == 0) {
400 /* do nothing */
401 } else if(alpha == amask) {
402 /* opaque alpha -- copy RGB, keep dst alpha */
403 /* using MMX here to free up regular registers for other things */
404 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
405 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
406 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
407 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
408 por_r2r(mm1, mm2); /* src | dst -> mm2 */
409 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
410 } else {
411 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
412 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
413
414 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
415 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
416
417 __asm__ __volatile__ (
418 "movd %0, %%mm4"
419 : : "r" (alpha) ); /* 0000A000 -> mm4 */
420 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
421 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
422 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
423 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
424
425 /* blend */
426 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
427 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
428 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
429 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
430
431 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
432 movd_r2m(mm2, *dstp);/* mm2 -> dst */
433 }
434 ++srcp;
435 ++dstp;
436 }, width);
437 srcp += srcskip;
438 dstp += dstskip;
439 }
440 emms();
441}
442/* End GCC_ASMBLIT */
443
444#elif MSVC_ASMBLIT
445/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
446static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
447{
448 int width = info->d_width;
449 int height = info->d_height;
450 Uint32 *srcp = (Uint32 *)info->s_pixels;
451 int srcskip = info->s_skip >> 2;
452 Uint32 *dstp = (Uint32 *)info->d_pixels;
453 int dstskip = info->d_skip >> 2;
454 Uint32 dalpha = info->dst->Amask;
455
456 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
457
458 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
459 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
460 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
461
462 while (height--) {
463 int n = width;
464 if ( n & 1 ) {
465 Uint32 s = *srcp++;
466 Uint32 d = *dstp;
467 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
468 + (s & d & 0x00010101)) | dalpha;
469 n--;
470 }
471
472 for (n >>= 1; n > 0; --n) {
473 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
474 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
475
476 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
477 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
478
479 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
480 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
481 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
482 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
483
484 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
485 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
486 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
487 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
488
489 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
490 dstp += 2;
491 srcp += 2;
492 }
493
494 srcp += srcskip;
495 dstp += dstskip;
496 }
497 _mm_empty();
498}
499
500/* fast RGB888->(A)RGB888 blending with surface alpha */
501static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
502{
503 SDL_PixelFormat* df = info->dst;
504 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
505 unsigned alpha = info->src->alpha;
506
507 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
508 /* only call a128 version when R,G,B occupy lower bits */
509 BlitRGBtoRGBSurfaceAlpha128MMX(info);
510 } else {
511 int width = info->d_width;
512 int height = info->d_height;
513 Uint32 *srcp = (Uint32 *)info->s_pixels;
514 int srcskip = info->s_skip >> 2;
515 Uint32 *dstp = (Uint32 *)info->d_pixels;
516 int dstskip = info->d_skip >> 2;
517 Uint32 dalpha = df->Amask;
518 Uint32 amult;
519
520 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
521
522 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
523 /* form the alpha mult */
524 amult = alpha | (alpha << 8);
525 amult = amult | (amult << 16);
526 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
527 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
528 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
529 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
530 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
531
532 while (height--) {
533 int n = width;
534 if (n & 1) {
535 /* One Pixel Blend */
536 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
537 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
538
539 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
540 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
541
542 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
543 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
544 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
545 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
546
547 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
548 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
549 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
550
551 ++srcp;
552 ++dstp;
553
554 n--;
555 }
556
557 for (n >>= 1; n > 0; --n) {
558 /* Two Pixels Blend */
559 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
560 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
561 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
562 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
563
564 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
565 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
566 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
567 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
568
569 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
570 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
571 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
572 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
573
574 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
575 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
576 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
577 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
578
579 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
580 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
581
582 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
583
584 srcp += 2;
585 dstp += 2;
586 }
587 srcp += srcskip;
588 dstp += dstskip;
589 }
590 _mm_empty();
591 }
592}
593
594/* fast ARGB888->(A)RGB888 blending with pixel alpha */
595static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
596{
597 int width = info->d_width;
598 int height = info->d_height;
599 Uint32 *srcp = (Uint32 *)info->s_pixels;
600 int srcskip = info->s_skip >> 2;
601 Uint32 *dstp = (Uint32 *)info->d_pixels;
602 int dstskip = info->d_skip >> 2;
603 SDL_PixelFormat* sf = info->src;
604 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
605 Uint32 amask = sf->Amask;
606 Uint32 ashift = sf->Ashift;
607 Uint64 multmask;
608
609 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
610
611 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
612 multmask = ~(0xFFFFi64 << (ashift * 2));
613 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
614
615 while(height--) {
616 DUFFS_LOOP4({
617 Uint32 alpha = *srcp & amask;
618 if (alpha == 0) {
619 /* do nothing */
620 } else if (alpha == amask) {
621 /* opaque alpha -- copy RGB, keep dst alpha */
622 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
623 } else {
624 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
625 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
626
627 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
628 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
629
630 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
631 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
632 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
633 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
634 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
635
636 /* blend */
637 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
638 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
639 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
640 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
641 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
642
643 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
644 }
645 ++srcp;
646 ++dstp;
647 }, width);
648 srcp += srcskip;
649 dstp += dstskip;
650 }
651 _mm_empty();
652}
653/* End MSVC_ASMBLIT */
654
655#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
656
657#if SDL_ALTIVEC_BLITTERS
658#if __MWERKS__
659#pragma altivec_model on
660#endif
661#if HAVE_ALTIVEC_H
662#include <altivec.h>
663#endif
664#include <assert.h>
665
666#if (defined(__MACOSX__) && (__GNUC__ < 4))
667 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
668 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
669 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
670 (vector unsigned short) ( a,b,c,d,e,f,g,h )
671#else
672 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
673 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
674 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
675 (vector unsigned short) { a,b,c,d,e,f,g,h }
676#endif
677
678#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
679#define VECPRINT(msg, v) do { \
680 vector unsigned int tmpvec = (vector unsigned int)(v); \
681 unsigned int *vp = (unsigned int *)&tmpvec; \
682 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
683} while (0)
684
685/* the permutation vector that takes the high bytes out of all the appropriate shorts
686 (vector unsigned char)(
687 0x00, 0x10, 0x02, 0x12,
688 0x04, 0x14, 0x06, 0x16,
689 0x08, 0x18, 0x0A, 0x1A,
690 0x0C, 0x1C, 0x0E, 0x1E );
691*/
692#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
693#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
694#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
695#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
696 ? vec_lvsl(0, src) \
697 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
698
699
700#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
701 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
702 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
703 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
704 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
705 /* valpha2 is 255-alpha */ \
706 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
707 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
708 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
709 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
710 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
711 /* add source and dest */ \
712 vtemp1 = vec_add(vtemp1, vtemp3); \
713 vtemp2 = vec_add(vtemp2, vtemp4); \
714 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
715 vtemp1 = vec_add(vtemp1, v1_16); \
716 vtemp3 = vec_sr(vtemp1, v8_16); \
717 vtemp1 = vec_add(vtemp1, vtemp3); \
718 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
719 vtemp2 = vec_add(vtemp2, v1_16); \
720 vtemp4 = vec_sr(vtemp2, v8_16); \
721 vtemp2 = vec_add(vtemp2, vtemp4); \
722 /* (>>8) and get ARGBARGBARGBARGB */ \
723 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
724} while (0)
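
/*
 * Per byte lane, VEC_MULTIPLY_ALPHA computes approximately
 *     vd = (vs*valpha + vd*(255-valpha)) / 255
 * replacing the division by 255 with the usual add-and-shift trick.  A
 * scalar sketch of the same arithmetic for one 8-bit channel (illustrative
 * only, not used by the vector code; the helper name is made up):
 */
#if 0
static Uint8 vec_multiply_alpha_scalar(unsigned s, unsigned d, unsigned a)
{
	unsigned t = s * a + d * (255 - a);	/* fits in 16 bits */
	t += 1;
	t += t >> 8;				/* approximately t / 255 */
	return (Uint8)(t >> 8);
}
#endif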
725
726/* Calculate the permute vector used for 32->32 swizzling */
727static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
728 const SDL_PixelFormat *dstfmt)
729{
730 /*
731 * We have to assume that the bits that aren't used by other
732 * colors are alpha, and that they form one complete byte, since some formats
733 * leave alpha with a zero mask, but we should still swizzle the bits.
734 */
735 /* ARGB */
736 const static struct SDL_PixelFormat default_pixel_format = {
737 NULL, 0, 0,
738 0, 0, 0, 0,
739 16, 8, 0, 24,
740 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
741 0, 0};
742 if (!srcfmt) {
743 srcfmt = &default_pixel_format;
744 }
745 if (!dstfmt) {
746 dstfmt = &default_pixel_format;
747 }
748 const vector unsigned char plus = VECUINT8_LITERAL
749 ( 0x00, 0x00, 0x00, 0x00,
750 0x04, 0x04, 0x04, 0x04,
751 0x08, 0x08, 0x08, 0x08,
752 0x0C, 0x0C, 0x0C, 0x0C );
753 vector unsigned char vswiz;
754 vector unsigned int srcvec;
755#define RESHIFT(X) (3 - ((X) >> 3))
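/* RESHIFT turns a channel's bit shift (0, 8, 16 or 24) into that channel's
   byte index within a big-endian 32-bit pixel (shift 24 -> byte 0, shift 0
   -> byte 3); those indices, placed at the destination channel positions
   below, become the vec_perm control bytes. */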
756 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
757 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
758 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
759 Uint32 amask;
760 /* Use zero for alpha if either surface doesn't have alpha */
761 if (dstfmt->Amask) {
762 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
763 } else {
764 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
765 }
766#undef RESHIFT
767 ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
768 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
769 return(vswiz);
770}
771
772static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
773{
774 int height = info->d_height;
775 Uint8 *src = (Uint8 *)info->s_pixels;
776 int srcskip = info->s_skip;
777 Uint8 *dst = (Uint8 *)info->d_pixels;
778 int dstskip = info->d_skip;
779 SDL_PixelFormat *srcfmt = info->src;
780
781 vector unsigned char v0 = vec_splat_u8(0);
782 vector unsigned short v8_16 = vec_splat_u16(8);
783 vector unsigned short v1_16 = vec_splat_u16(1);
784 vector unsigned short v2_16 = vec_splat_u16(2);
785 vector unsigned short v3_16 = vec_splat_u16(3);
786 vector unsigned int v8_32 = vec_splat_u32(8);
787 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
788 vector unsigned short v3f = VECUINT16_LITERAL(
789 0x003f, 0x003f, 0x003f, 0x003f,
790 0x003f, 0x003f, 0x003f, 0x003f);
791 vector unsigned short vfc = VECUINT16_LITERAL(
792 0x00fc, 0x00fc, 0x00fc, 0x00fc,
793 0x00fc, 0x00fc, 0x00fc, 0x00fc);
794
795 /*
796 0x10 - 0x1f is the alpha
797 0x00 - 0x0e evens are the red
798 0x01 - 0x0f odds are zero
799 */
800 vector unsigned char vredalpha1 = VECUINT8_LITERAL(
801 0x10, 0x00, 0x01, 0x01,
802 0x10, 0x02, 0x01, 0x01,
803 0x10, 0x04, 0x01, 0x01,
804 0x10, 0x06, 0x01, 0x01
805 );
806 vector unsigned char vredalpha2 = (vector unsigned char)(
807 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
808 );
809 /*
810 0x00 - 0x0f is ARxx ARxx ARxx ARxx
811 0x11 - 0x1f odds are blue
812 */
813 vector unsigned char vblue1 = VECUINT8_LITERAL(
814 0x00, 0x01, 0x02, 0x11,
815 0x04, 0x05, 0x06, 0x13,
816 0x08, 0x09, 0x0a, 0x15,
817 0x0c, 0x0d, 0x0e, 0x17
818 );
819 vector unsigned char vblue2 = (vector unsigned char)(
820 vec_add((vector unsigned int)vblue1, v8_32)
821 );
822 /*
823 0x00 - 0x0f is ARxB ARxB ARxB ARxB
824 0x10 - 0x1e evens are green
825 */
826 vector unsigned char vgreen1 = VECUINT8_LITERAL(
827 0x00, 0x01, 0x10, 0x03,
828 0x04, 0x05, 0x12, 0x07,
829 0x08, 0x09, 0x14, 0x0b,
830 0x0c, 0x0d, 0x16, 0x0f
831 );
832 vector unsigned char vgreen2 = (vector unsigned char)(
833 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
834 );
835 vector unsigned char vgmerge = VECUINT8_LITERAL(
836 0x00, 0x02, 0x00, 0x06,
837 0x00, 0x0a, 0x00, 0x0e,
838 0x00, 0x12, 0x00, 0x16,
839 0x00, 0x1a, 0x00, 0x1e);
840 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
841 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
842 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
843
844 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
845 vf800 = vec_sl(vf800, vec_splat_u16(8));
846
847 while(height--) {
848 int extrawidth;
849 vector unsigned char valigner;
850 vector unsigned char vsrc;
851 vector unsigned char voverflow;
852 int width = info->d_width;
853
854#define ONE_PIXEL_BLEND(condition, widthvar) \
855 while (condition) { \
856 Uint32 Pixel; \
857 unsigned sR, sG, sB, dR, dG, dB, sA; \
858 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
859 if(sA) { \
860 unsigned short dstpixel = *((unsigned short *)dst); \
861 dR = (dstpixel >> 8) & 0xf8; \
862 dG = (dstpixel >> 3) & 0xfc; \
863 dB = (dstpixel << 3) & 0xf8; \
864 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
865 *((unsigned short *)dst) = ( \
866 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
867 ); \
868 } \
869 src += 4; \
870 dst += 2; \
871 widthvar--; \
872 }
873 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
874 extrawidth = (width % 8);
875 valigner = VEC_ALIGNER(src);
876 vsrc = (vector unsigned char)vec_ld(0, src);
877 width -= extrawidth;
878 while (width) {
879 vector unsigned char valpha;
880 vector unsigned char vsrc1, vsrc2;
881 vector unsigned char vdst1, vdst2;
882 vector unsigned short vR, vG, vB;
883 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
884
885 /* Load 8 pixels from src as ARGB */
886 voverflow = (vector unsigned char)vec_ld(15, src);
887 vsrc = vec_perm(vsrc, voverflow, valigner);
888 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
889 src += 16;
890 vsrc = (vector unsigned char)vec_ld(15, src);
891 voverflow = vec_perm(voverflow, vsrc, valigner);
892 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
893 src += 16;
894
895 /* Load 8 pixels from dst as XRGB */
896 voverflow = vec_ld(0, dst);
897 vR = vec_and((vector unsigned short)voverflow, vf800);
898 vB = vec_sl((vector unsigned short)voverflow, v3_16);
899 vG = vec_sl(vB, v2_16);
900 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
901 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
902 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
903 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
904 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
905 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
906
907 /* Alpha blend 8 pixels as ARGB */
908 valpha = vec_perm(vsrc1, v0, valphaPermute);
909 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
910 valpha = vec_perm(vsrc2, v0, valphaPermute);
911 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
912
913 /* Convert 8 pixels to 565 */
914 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
915 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
916 vgpixel = vec_and(vgpixel, vfc);
917 vgpixel = vec_sl(vgpixel, v3_16);
918 vrpixel = vec_sl(vpixel, v1_16);
919 vrpixel = vec_and(vrpixel, vf800);
920 vbpixel = vec_and(vpixel, v3f);
921 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
922 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
923
924 /* Store 8 pixels */
925 vec_st(vdst1, 0, dst);
926
927 width -= 8;
928 dst += 16;
929 }
930 ONE_PIXEL_BLEND((extrawidth), extrawidth);
931#undef ONE_PIXEL_BLEND
932 src += srcskip;
933 dst += dstskip;
934 }
935}
936
937static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
938{
939 unsigned alpha = info->src->alpha;
940 int height = info->d_height;
941 Uint32 *srcp = (Uint32 *)info->s_pixels;
942 int srcskip = info->s_skip >> 2;
943 Uint32 *dstp = (Uint32 *)info->d_pixels;
944 int dstskip = info->d_skip >> 2;
945 SDL_PixelFormat *srcfmt = info->src;
946 SDL_PixelFormat *dstfmt = info->dst;
947 unsigned sA = srcfmt->alpha;
948 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
949 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
950 Uint32 ckey = info->src->colorkey;
951 vector unsigned char mergePermute;
952 vector unsigned char vsrcPermute;
953 vector unsigned char vdstPermute;
954 vector unsigned char vsdstPermute;
955 vector unsigned char valpha;
956 vector unsigned char valphamask;
957 vector unsigned char vbits;
958 vector unsigned char v0;
959 vector unsigned short v1;
960 vector unsigned short v8;
961 vector unsigned int vckey;
962 vector unsigned int vrgbmask;
963
964 mergePermute = VEC_MERGE_PERMUTE();
965 v0 = vec_splat_u8(0);
966 v1 = vec_splat_u16(1);
967 v8 = vec_splat_u16(8);
968
969 /* set the alpha to 255 on the destination surf */
970 valphamask = VEC_ALPHA_MASK();
971
972 vsrcPermute = calc_swizzle32(srcfmt, NULL);
973 vdstPermute = calc_swizzle32(NULL, dstfmt);
974 vsdstPermute = calc_swizzle32(dstfmt, NULL);
975
976 /* set a vector full of alpha and 255-alpha */
977 ((unsigned char *)&valpha)[0] = alpha;
978 valpha = vec_splat(valpha, 0);
979 vbits = (vector unsigned char)vec_splat_s8(-1);
980
981 ckey &= rgbmask;
982 ((unsigned int *)(char*)&vckey)[0] = ckey;
983 vckey = vec_splat(vckey, 0);
984 ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
985 vrgbmask = vec_splat(vrgbmask, 0);
986
987 while(height--) {
988 int width = info->d_width;
989#define ONE_PIXEL_BLEND(condition, widthvar) \
990 while (condition) { \
991 Uint32 Pixel; \
992 unsigned sR, sG, sB, dR, dG, dB; \
993 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
994 if(sA && Pixel != ckey) { \
995 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
996 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
997 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
998 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
999 } \
1000 dstp++; \
1001 srcp++; \
1002 widthvar--; \
1003 }
1004 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1005 if (width > 0) {
1006 int extrawidth = (width % 4);
1007 vector unsigned char valigner = VEC_ALIGNER(srcp);
1008 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1009 width -= extrawidth;
1010 while (width) {
1011 vector unsigned char vsel;
1012 vector unsigned char voverflow;
1013 vector unsigned char vd;
1014 vector unsigned char vd_orig;
1015
1016 /* s = *srcp */
1017 voverflow = (vector unsigned char)vec_ld(15, srcp);
1018 vs = vec_perm(vs, voverflow, valigner);
1019
1020 /* vsel is set for items that match the key */
1021 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1022 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1023
1024 /* permute to source format */
1025 vs = vec_perm(vs, valpha, vsrcPermute);
1026
1027 /* d = *dstp */
1028 vd = (vector unsigned char)vec_ld(0, dstp);
1029 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1030
1031 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1032
1033 /* set the alpha channel to full on */
1034 vd = vec_or(vd, valphamask);
1035
1036 /* mask out color key */
1037 vd = vec_sel(vd, vd_orig, vsel);
1038
1039 /* permute to dest format */
1040 vd = vec_perm(vd, vbits, vdstPermute);
1041
1042 /* *dstp = res */
1043 vec_st((vector unsigned int)vd, 0, dstp);
1044
1045 srcp += 4;
1046 dstp += 4;
1047 width -= 4;
1048 vs = voverflow;
1049 }
1050 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1051 }
1052#undef ONE_PIXEL_BLEND
1053
1054 srcp += srcskip;
1055 dstp += dstskip;
1056 }
1057}
1058
1059
1060static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1061{
1062 int width = info->d_width;
1063 int height = info->d_height;
1064 Uint32 *srcp = (Uint32 *)info->s_pixels;
1065 int srcskip = info->s_skip >> 2;
1066 Uint32 *dstp = (Uint32 *)info->d_pixels;
1067 int dstskip = info->d_skip >> 2;
1068 SDL_PixelFormat *srcfmt = info->src;
1069 SDL_PixelFormat *dstfmt = info->dst;
1070 vector unsigned char mergePermute;
1071 vector unsigned char valphaPermute;
1072 vector unsigned char vsrcPermute;
1073 vector unsigned char vdstPermute;
1074 vector unsigned char vsdstPermute;
1075 vector unsigned char valphamask;
1076 vector unsigned char vpixelmask;
1077 vector unsigned char v0;
1078 vector unsigned short v1;
1079 vector unsigned short v8;
1080
1081 v0 = vec_splat_u8(0);
1082 v1 = vec_splat_u16(1);
1083 v8 = vec_splat_u16(8);
1084 mergePermute = VEC_MERGE_PERMUTE();
1085 valphamask = VEC_ALPHA_MASK();
1086 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1087 vpixelmask = vec_nor(valphamask, v0);
1088 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1089 vdstPermute = calc_swizzle32(NULL, dstfmt);
1090 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1091
1092 while ( height-- ) {
1093 width = info->d_width;
1094#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1095 Uint32 Pixel; \
1096 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1097 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1098 if(sA) { \
1099 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1100 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1101 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1102 } \
1103 ++srcp; \
1104 ++dstp; \
1105 widthvar--; \
1106 }
1107 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1108 if (width > 0) {
1109 /* vsrcPermute */
1110 /* vdstPermute */
1111 int extrawidth = (width % 4);
1112 vector unsigned char valigner = VEC_ALIGNER(srcp);
1113 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1114 width -= extrawidth;
1115 while (width) {
1116 vector unsigned char voverflow;
1117 vector unsigned char vd;
1118 vector unsigned char valpha;
1119 vector unsigned char vdstalpha;
1120 /* s = *srcp */
1121 voverflow = (vector unsigned char)vec_ld(15, srcp);
1122 vs = vec_perm(vs, voverflow, valigner);
1123 vs = vec_perm(vs, v0, vsrcPermute);
1124
1125 valpha = vec_perm(vs, v0, valphaPermute);
1126
1127 /* d = *dstp */
1128 vd = (vector unsigned char)vec_ld(0, dstp);
1129 vd = vec_perm(vd, v0, vsdstPermute);
1130 vdstalpha = vec_and(vd, valphamask);
1131
1132 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1133
1134 /* set the alpha to the dest alpha */
1135 vd = vec_and(vd, vpixelmask);
1136 vd = vec_or(vd, vdstalpha);
1137 vd = vec_perm(vd, v0, vdstPermute);
1138
1139 /* *dstp = res */
1140 vec_st((vector unsigned int)vd, 0, dstp);
1141
1142 srcp += 4;
1143 dstp += 4;
1144 width -= 4;
1145 vs = voverflow;
1146
1147 }
1148 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1149 }
1150 srcp += srcskip;
1151 dstp += dstskip;
1152#undef ONE_PIXEL_BLEND
1153 }
1154}
1155
1156/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1157static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1158{
1159 int width = info->d_width;
1160 int height = info->d_height;
1161 Uint32 *srcp = (Uint32 *)info->s_pixels;
1162 int srcskip = info->s_skip >> 2;
1163 Uint32 *dstp = (Uint32 *)info->d_pixels;
1164 int dstskip = info->d_skip >> 2;
1165 vector unsigned char mergePermute;
1166 vector unsigned char valphaPermute;
1167 vector unsigned char valphamask;
1168 vector unsigned char vpixelmask;
1169 vector unsigned char v0;
1170 vector unsigned short v1;
1171 vector unsigned short v8;
1172 v0 = vec_splat_u8(0);
1173 v1 = vec_splat_u16(1);
1174 v8 = vec_splat_u16(8);
1175 mergePermute = VEC_MERGE_PERMUTE();
1176 valphamask = VEC_ALPHA_MASK();
1177 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1178
1179
1180 vpixelmask = vec_nor(valphamask, v0);
1181 while(height--) {
1182 width = info->d_width;
1183#define ONE_PIXEL_BLEND(condition, widthvar) \
1184 while ((condition)) { \
1185 Uint32 dalpha; \
1186 Uint32 d; \
1187 Uint32 s1; \
1188 Uint32 d1; \
1189 Uint32 s = *srcp; \
1190 Uint32 alpha = s >> 24; \
1191 if(alpha) { \
1192 if(alpha == SDL_ALPHA_OPAQUE) { \
1193 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1194 } else { \
1195 d = *dstp; \
1196 dalpha = d & 0xff000000; \
1197 s1 = s & 0xff00ff; \
1198 d1 = d & 0xff00ff; \
1199 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1200 s &= 0xff00; \
1201 d &= 0xff00; \
1202 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1203 *dstp = d1 | d | dalpha; \
1204 } \
1205 } \
1206 ++srcp; \
1207 ++dstp; \
1208 widthvar--; \
1209 }
1210 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1211 if (width > 0) {
1212 int extrawidth = (width % 4);
1213 vector unsigned char valigner = VEC_ALIGNER(srcp);
1214 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1215 width -= extrawidth;
1216 while (width) {
1217 vector unsigned char voverflow;
1218 vector unsigned char vd;
1219 vector unsigned char valpha;
1220 vector unsigned char vdstalpha;
1221 /* s = *srcp */
1222 voverflow = (vector unsigned char)vec_ld(15, srcp);
1223 vs = vec_perm(vs, voverflow, valigner);
1224
1225 valpha = vec_perm(vs, v0, valphaPermute);
1226
1227 /* d = *dstp */
1228 vd = (vector unsigned char)vec_ld(0, dstp);
1229 vdstalpha = vec_and(vd, valphamask);
1230
1231 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1232
1233 /* set the alpha to the dest alpha */
1234 vd = vec_and(vd, vpixelmask);
1235 vd = vec_or(vd, vdstalpha);
1236
1237 /* *dstp = res */
1238 vec_st((vector unsigned int)vd, 0, dstp);
1239
1240 srcp += 4;
1241 dstp += 4;
1242 width -= 4;
1243 vs = voverflow;
1244 }
1245 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1246 }
1247 srcp += srcskip;
1248 dstp += dstskip;
1249 }
1250#undef ONE_PIXEL_BLEND
1251}
1252
1253static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1254{
1255 /* XXX : 6 */
1256 unsigned alpha = info->src->alpha;
1257 int height = info->d_height;
1258 Uint32 *srcp = (Uint32 *)info->s_pixels;
1259 int srcskip = info->s_skip >> 2;
1260 Uint32 *dstp = (Uint32 *)info->d_pixels;
1261 int dstskip = info->d_skip >> 2;
1262 SDL_PixelFormat *srcfmt = info->src;
1263 SDL_PixelFormat *dstfmt = info->dst;
1264 unsigned sA = srcfmt->alpha;
1265 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1266 vector unsigned char mergePermute;
1267 vector unsigned char vsrcPermute;
1268 vector unsigned char vdstPermute;
1269 vector unsigned char vsdstPermute;
1270 vector unsigned char valpha;
1271 vector unsigned char valphamask;
1272 vector unsigned char vbits;
1273 vector unsigned short v1;
1274 vector unsigned short v8;
1275
1276 mergePermute = VEC_MERGE_PERMUTE();
1277 v1 = vec_splat_u16(1);
1278 v8 = vec_splat_u16(8);
1279
1280 /* set the alpha to 255 on the destination surf */
1281 valphamask = VEC_ALPHA_MASK();
1282
1283 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1284 vdstPermute = calc_swizzle32(NULL, dstfmt);
1285 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1286
1287 /* set a vector full of alpha and 255-alpha */
1288 ((unsigned char *)&valpha)[0] = alpha;
1289 valpha = vec_splat(valpha, 0);
1290 vbits = (vector unsigned char)vec_splat_s8(-1);
1291
1292 while(height--) {
1293 int width = info->d_width;
1294#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1295 Uint32 Pixel; \
1296 unsigned sR, sG, sB, dR, dG, dB; \
1297 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1298 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1299 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1300 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1301 ++srcp; \
1302 ++dstp; \
1303 widthvar--; \
1304 }
1305 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1306 if (width > 0) {
1307 int extrawidth = (width % 4);
1308 vector unsigned char valigner = VEC_ALIGNER(srcp);
1309 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1310 width -= extrawidth;
1311 while (width) {
1312 vector unsigned char voverflow;
1313 vector unsigned char vd;
1314
1315 /* s = *srcp */
1316 voverflow = (vector unsigned char)vec_ld(15, srcp);
1317 vs = vec_perm(vs, voverflow, valigner);
1318 vs = vec_perm(vs, valpha, vsrcPermute);
1319
1320 /* d = *dstp */
1321 vd = (vector unsigned char)vec_ld(0, dstp);
1322 vd = vec_perm(vd, vd, vsdstPermute);
1323
1324 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1325
1326 /* set the alpha channel to full on */
1327 vd = vec_or(vd, valphamask);
1328 vd = vec_perm(vd, vbits, vdstPermute);
1329
1330 /* *dstp = res */
1331 vec_st((vector unsigned int)vd, 0, dstp);
1332
1333 srcp += 4;
1334 dstp += 4;
1335 width -= 4;
1336 vs = voverflow;
1337 }
1338 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1339 }
1340#undef ONE_PIXEL_BLEND
1341
1342 srcp += srcskip;
1343 dstp += dstskip;
1344 }
1345
1346}
1347
1348
1349/* fast RGB888->(A)RGB888 blending */
1350static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1351{
1352 unsigned alpha = info->src->alpha;
1353 int height = info->d_height;
1354 Uint32 *srcp = (Uint32 *)info->s_pixels;
1355 int srcskip = info->s_skip >> 2;
1356 Uint32 *dstp = (Uint32 *)info->d_pixels;
1357 int dstskip = info->d_skip >> 2;
1358 vector unsigned char mergePermute;
1359 vector unsigned char valpha;
1360 vector unsigned char valphamask;
1361 vector unsigned short v1;
1362 vector unsigned short v8;
1363
1364 mergePermute = VEC_MERGE_PERMUTE();
1365 v1 = vec_splat_u16(1);
1366 v8 = vec_splat_u16(8);
1367
1368 /* set the alpha to 255 on the destination surf */
1369 valphamask = VEC_ALPHA_MASK();
1370
1371 /* set a vector full of alpha and 255-alpha */
1372 ((unsigned char *)&valpha)[0] = alpha;
1373 valpha = vec_splat(valpha, 0);
1374
1375 while(height--) {
1376 int width = info->d_width;
1377#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1378 Uint32 s = *srcp; \
1379 Uint32 d = *dstp; \
1380 Uint32 s1 = s & 0xff00ff; \
1381 Uint32 d1 = d & 0xff00ff; \
1382 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1383 & 0xff00ff; \
1384 s &= 0xff00; \
1385 d &= 0xff00; \
1386 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1387 *dstp = d1 | d | 0xff000000; \
1388 ++srcp; \
1389 ++dstp; \
1390 widthvar--; \
1391 }
1392 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1393 if (width > 0) {
1394 int extrawidth = (width % 4);
1395 vector unsigned char valigner = VEC_ALIGNER(srcp);
1396 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1397 width -= extrawidth;
1398 while (width) {
1399 vector unsigned char voverflow;
1400 vector unsigned char vd;
1401
1402 /* s = *srcp */
1403 voverflow = (vector unsigned char)vec_ld(15, srcp);
1404 vs = vec_perm(vs, voverflow, valigner);
1405
1406 /* d = *dstp */
1407 vd = (vector unsigned char)vec_ld(0, dstp);
1408
1409 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1410
1411 /* set the alpha channel to full on */
1412 vd = vec_or(vd, valphamask);
1413
1414 /* *dstp = res */
1415 vec_st((vector unsigned int)vd, 0, dstp);
1416
1417 srcp += 4;
1418 dstp += 4;
1419 width -= 4;
1420 vs = voverflow;
1421 }
1422 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1423 }
1424#undef ONE_PIXEL_BLEND
1425
1426 srcp += srcskip;
1427 dstp += dstskip;
1428 }
1429}
1430#if __MWERKS__
1431#pragma altivec_model off
1432#endif
1433#endif /* SDL_ALTIVEC_BLITTERS */
1434
1435/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1436static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1437{
1438 int width = info->d_width;
1439 int height = info->d_height;
1440 Uint32 *srcp = (Uint32 *)info->s_pixels;
1441 int srcskip = info->s_skip >> 2;
1442 Uint32 *dstp = (Uint32 *)info->d_pixels;
1443 int dstskip = info->d_skip >> 2;
1444
1445 while(height--) {
1446 DUFFS_LOOP4({
1447 Uint32 s = *srcp++;
1448 Uint32 d = *dstp;
1449 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1450 + (s & d & 0x00010101)) | 0xff000000;
1451 }, width);
1452 srcp += srcskip;
1453 dstp += dstskip;
1454 }
1455}
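
/*
 * The alpha==128 fast path above averages all three channels of two
 * ARGB8888 pixels with one add and one shift: clearing each channel's low
 * bit (the 0x00fefefe mask) keeps a channel's sum from spilling into its
 * neighbour, and "s & d & 0x00010101" restores the rounding bit, so every
 * channel ends up as floor((s + d) / 2).  A channel-by-channel reference
 * of what the packed expression computes (illustrative only; the helper
 * name is made up):
 */
#if 0
static Uint32 blend128_reference(Uint32 s, Uint32 d)
{
	Uint32 r = (((s >> 16) & 0xff) + ((d >> 16) & 0xff)) >> 1;
	Uint32 g = (((s >>  8) & 0xff) + ((d >>  8) & 0xff)) >> 1;
	Uint32 b = (( s        & 0xff) + ( d        & 0xff)) >> 1;
	return (r << 16) | (g << 8) | b | 0xff000000;
}
#endif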
1456
1457/* fast RGB888->(A)RGB888 blending with surface alpha */
1458static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1459{
1460 unsigned alpha = info->src->alpha;
1461 if(alpha == 128) {
1462 BlitRGBtoRGBSurfaceAlpha128(info);
1463 } else {
1464 int width = info->d_width;
1465 int height = info->d_height;
1466 Uint32 *srcp = (Uint32 *)info->s_pixels;
1467 int srcskip = info->s_skip >> 2;
1468 Uint32 *dstp = (Uint32 *)info->d_pixels;
1469 int dstskip = info->d_skip >> 2;
1470 Uint32 s;
1471 Uint32 d;
1472 Uint32 s1;
1473 Uint32 d1;
1474
1475 while(height--) {
1476 DUFFS_LOOP_DOUBLE2({
1477 /* One Pixel Blend */
1478 s = *srcp;
1479 d = *dstp;
1480 s1 = s & 0xff00ff;
1481 d1 = d & 0xff00ff;
1482 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1483 & 0xff00ff;
1484 s &= 0xff00;
1485 d &= 0xff00;
1486 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1487 *dstp = d1 | d | 0xff000000;
1488 ++srcp;
1489 ++dstp;
1490 },{
1491 /* Two Pixels Blend */
1492 s = *srcp;
1493 d = *dstp;
1494 s1 = s & 0xff00ff;
1495 d1 = d & 0xff00ff;
1496 d1 += (s1 - d1) * alpha >> 8;
1497 d1 &= 0xff00ff;
1498
1499 s = ((s & 0xff00) >> 8) |
1500 ((srcp[1] & 0xff00) << 8);
1501 d = ((d & 0xff00) >> 8) |
1502 ((dstp[1] & 0xff00) << 8);
1503 d += (s - d) * alpha >> 8;
1504 d &= 0x00ff00ff;
1505
1506 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1507 ++srcp;
1508
1509 s1 = *srcp;
1510 d1 = *dstp;
1511 s1 &= 0xff00ff;
1512 d1 &= 0xff00ff;
1513 d1 += (s1 - d1) * alpha >> 8;
1514 d1 &= 0xff00ff;
1515
1516 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1517 ++srcp;
1518 ++dstp;
1519 }, width);
1520 srcp += srcskip;
1521 dstp += dstskip;
1522 }
1523 }
1524}
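
/*
 * Apart from the alpha==128 special cases, the plain C 32bpp blitters here
 * evaluate the per-channel formula
 *     dst = dst + ((src - dst) * alpha >> 8)
 * and the masking with 0xff00ff above lets a single multiply cover the red
 * and blue channels at once (green gets its own multiply).  A one-channel
 * reference, handy when checking the packed arithmetic (illustrative only;
 * the helper name is made up; a negative src - dst relies on an arithmetic
 * right shift):
 */
#if 0
static Uint8 blend_channel_shift8(Uint8 s, Uint8 d, unsigned alpha)
{
	return (Uint8)(d + ((((int)s - (int)d) * (int)alpha) >> 8));
}
#endif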
1525
1526/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1527static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1528{
1529 int width = info->d_width;
1530 int height = info->d_height;
1531 Uint32 *srcp = (Uint32 *)info->s_pixels;
1532 int srcskip = info->s_skip >> 2;
1533 Uint32 *dstp = (Uint32 *)info->d_pixels;
1534 int dstskip = info->d_skip >> 2;
1535
1536 while(height--) {
1537 DUFFS_LOOP4({
1538 Uint32 dalpha;
1539 Uint32 d;
1540 Uint32 s1;
1541 Uint32 d1;
1542 Uint32 s = *srcp;
1543 Uint32 alpha = s >> 24;
1544 /* FIXME: Here we special-case opaque alpha since the
1545 compositing used (>>8 instead of /255) doesn't handle
1546 it correctly. Also special-case alpha=0 for speed?
1547 Benchmark this! */
1548 if(alpha) {
1549 if(alpha == SDL_ALPHA_OPAQUE) {
1550 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1551 } else {
1552 /*
1553 * take out the middle component (green), and process
1554 * the other two in parallel. One multiply less.
1555 */
1556 d = *dstp;
1557 dalpha = d & 0xff000000;
1558 s1 = s & 0xff00ff;
1559 d1 = d & 0xff00ff;
1560 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1561 s &= 0xff00;
1562 d &= 0xff00;
1563 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1564 *dstp = d1 | d | dalpha;
1565 }
1566 }
1567 ++srcp;
1568 ++dstp;
1569 }, width);
1570 srcp += srcskip;
1571 dstp += dstskip;
1572 }
1573}
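
/*
 * Why the SDL_ALPHA_OPAQUE special case above matters: with >>8 instead of
 * a true division by 255, a fully opaque pixel falls one short of the
 * source value, e.g. s = 255, d = 0, alpha = 255 gives
 * 0 + ((255 * 255) >> 8) = 254.  An exact (but slower) per-channel blend
 * for comparison (illustrative sketch only; the helper name is made up):
 */
#if 0
static Uint8 blend_channel_exact(Uint8 s, Uint8 d, unsigned alpha)
{
	unsigned t = s * alpha + d * (255 - alpha) + 127;
	return (Uint8)(t / 255);
}
#endif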
1574
1575#if GCC_ASMBLIT
1576/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1577static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1578{
1579 int width = info->d_width;
1580 int height = info->d_height;
1581 Uint32 *srcp = (Uint32 *)info->s_pixels;
1582 int srcskip = info->s_skip >> 2;
1583 Uint32 *dstp = (Uint32 *)info->d_pixels;
1584 int dstskip = info->d_skip >> 2;
1585 SDL_PixelFormat* sf = info->src;
1586 Uint32 amask = sf->Amask;
1587
1588 __asm__ (
1589 /* make mm6 all zeros. */
1590 "pxor %%mm6, %%mm6\n"
1591
1592 /* Make a mask to preserve the alpha. */
1593 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1594 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1595 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1596 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1597 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1598
1599 /* form channel masks */
1600 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1601 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1602 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1603 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1604
1605 /* get alpha channel shift */
1606 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1607
1608 : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1609
1610 while(height--) {
1611
1612 DUFFS_LOOP4({
1613 Uint32 alpha;
1614
1615 __asm__ (
1616 "prefetch 64(%0)\n"
1617 "prefetch 64(%1)\n"
1618 : : "r" (srcp), "r" (dstp) );
1619
1620 alpha = *srcp & amask;
1621 /* FIXME: Here we special-case opaque alpha since the
1622 compositing used (>>8 instead of /255) doesn't handle
1623 it correctly. Also special-case alpha=0 for speed?
1624 Benchmark this! */
1625 if(alpha == 0) {
1626 /* do nothing */
1627 }
1628 else if(alpha == amask) {
1629 /* opaque alpha -- copy RGB, keep dst alpha */
1630 /* using MMX here to free up regular registers for other things */
1631 __asm__ (
1632 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1633 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1634 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1635 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1636 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1637 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1638
1639 : : "r" (srcp), "r" (dstp) );
1640 }
1641
1642 else {
1643 __asm__ (
1644 /* load in the source, and dst. */
1645 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1646 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1647
1648 /* Move the src alpha into mm2 */
1649
1650 /* if supporting pshufw */
1651 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1652 /*"psrlw $8, %%mm2\n" */
1653
1654 /* else: */
1655 "movd %2, %%mm2\n"
1656 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1657 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1658 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1659 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1660
1661 /* move the colors into words. */
1662 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1663 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1664
1665 /* src - dst */
1666 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1667
1668 /* A * (src-dst) */
1669 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1670 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1671 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1672
1673 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1674
1675 "movd %%mm0, (%1)\n" /* result in mm0 */
1676
1677 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1678
1679 }
1680 ++srcp;
1681 ++dstp;
1682 }, width);
1683 srcp += srcskip;
1684 dstp += dstskip;
1685 }
1686
1687 __asm__ (
1688 "emms\n"
1689 : );
1690}
1691/* End GCC_ASMBLIT*/
1692
1693#elif MSVC_ASMBLIT
1694/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1695static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1696{
1697 int width = info->d_width;
1698 int height = info->d_height;
1699 Uint32 *srcp = (Uint32 *)info->s_pixels;
1700 int srcskip = info->s_skip >> 2;
1701 Uint32 *dstp = (Uint32 *)info->d_pixels;
1702 int dstskip = info->d_skip >> 2;
1703 SDL_PixelFormat* sf = info->src;
1704 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1705 Uint32 amask = sf->Amask;
1706 Uint32 ashift = sf->Ashift;
1707 Uint64 multmask;
1708
1709 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1710
1711 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1712 multmask = ~(0xFFFFi64 << (ashift * 2));
1713 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1714
1715 while(height--) {
1716 DUFFS_LOOP4({
1717 Uint32 alpha;
1718
1719 _m_prefetch(srcp + 16);
1720 _m_prefetch(dstp + 16);
1721
1722 alpha = *srcp & amask;
1723 if (alpha == 0) {
1724 /* do nothing */
1725 } else if (alpha == amask) {
1726 /* copy RGB, keep dst alpha */
1727 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1728 } else {
1729 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1730 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1731
1732 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1733 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1734
1735 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1736 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1737 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1738 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1739 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1740
1741 /* blend */
1742 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1743 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1744 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1745 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1746 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1747
1748 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1749 }
1750 ++srcp;
1751 ++dstp;
1752 }, width);
1753 srcp += srcskip;
1754 dstp += dstskip;
1755 }
1756 _mm_empty();
1757}
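/*
 * Editorial note (not in the original source) on the dmask setup above:
 * after _mm_unpacklo_pi8 every 8-bit channel occupies a 16-bit lane, so a
 * channel sitting at bit Ashift in the packed pixel sits at bit Ashift * 2
 * once unpacked.  multmask therefore clears exactly the alpha lane of the
 * replicated alpha, which makes the alpha factor 0 for that lane and keeps
 * the destination alpha untouched through the multiply and the final add.
 * For ARGB8888 (Ashift == 24): multmask = ~(0xFFFF << 48) = 0x0000FFFFFFFFFFFF.
 */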
1758/* End MSVC_ASMBLIT */
1759
1760#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1761
1762/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1763
1764/* blend a single 16 bit pixel at 50% */
1765#define BLEND16_50(d, s, mask) \
1766 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
1767
1768/* blend two 16 bit pixels at 50% */
1769#define BLEND2x16_50(d, s, mask) \
1770 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
1771 + (s & d & (~(mask | mask << 16))))
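/*
 * Editorial sketch (not part of the original code): the masks passed to the
 * macros above are 0xf7de for RGB565 and 0xfbde for RGB555, i.e. 0xffff
 * with the lowest bit of every colour channel cleared.  Clearing those bits
 * before the shift keeps one channel's low bit from spilling into its
 * neighbour, and the (s & d & ~mask) term adds the carry back whenever both
 * low bits are set, so every channel comes out as floor((s + d) / 2).
 * A hypothetical per-channel reference for RGB565, useful only for checking
 * BLEND16_50 against, might look like this:
 */
#if 0	/* illustrative only, never compiled */
static Uint16 BLEND565_50_reference(Uint16 d, Uint16 s)
{
	Uint16 r = (Uint16)((((s >> 11) & 0x1f) + ((d >> 11) & 0x1f)) >> 1);
	Uint16 g = (Uint16)((((s >>  5) & 0x3f) + ((d >>  5) & 0x3f)) >> 1);
	Uint16 b = (Uint16)(((s & 0x1f) + (d & 0x1f)) >> 1);

	/* e.g. BLEND16_50(0x0000, 0xffff, 0xf7de) == 0x7bef == (15,31,15) */
	return (Uint16)((r << 11) | (g << 5) | b);
}
#endif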
1772
1773static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1774{
1775 int width = info->d_width;
1776 int height = info->d_height;
1777 Uint16 *srcp = (Uint16 *)info->s_pixels;
1778 int srcskip = info->s_skip >> 1;
1779 Uint16 *dstp = (Uint16 *)info->d_pixels;
1780 int dstskip = info->d_skip >> 1;
1781
1782 while(height--) {
1783 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1784 /*
1785 * Source and destination not aligned, pipeline it.
1786 * This is mostly a win for big blits but no loss for
1787 * small ones
1788 */
1789 Uint32 prev_sw;
1790 int w = width;
1791
1792 /* handle odd destination */
1793 if((uintptr_t)dstp & 2) {
1794 Uint16 d = *dstp, s = *srcp;
1795 *dstp = BLEND16_50(d, s, mask);
1796 dstp++;
1797 srcp++;
1798 w--;
1799 }
1800 srcp++; /* srcp is now 32-bit aligned */
1801
1802 /* bootstrap pipeline with first halfword */
1803 prev_sw = ((Uint32 *)srcp)[-1];
1804
1805 while(w > 1) {
1806 Uint32 sw, dw, s;
1807 sw = *(Uint32 *)srcp;
1808 dw = *(Uint32 *)dstp;
1809#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1810 s = (prev_sw << 16) + (sw >> 16);
1811#else
1812 s = (prev_sw >> 16) + (sw << 16);
1813#endif
1814 prev_sw = sw;
1815 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1816 dstp += 2;
1817 srcp += 2;
1818 w -= 2;
1819 }
1820
1821 /* final pixel if any */
1822 if(w) {
1823 Uint16 d = *dstp, s;
1824#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1825 s = (Uint16)prev_sw;
1826#else
1827 s = (Uint16)(prev_sw >> 16);
1828#endif
1829 *dstp = BLEND16_50(d, s, mask);
1830 srcp++;
1831 dstp++;
1832 }
1833 srcp += srcskip - 1;
1834 dstp += dstskip;
1835 } else {
1836 /* source and destination are aligned */
1837 int w = width;
1838
1839 /* first odd pixel? */
1840 if((uintptr_t)srcp & 2) {
1841 Uint16 d = *dstp, s = *srcp;
1842 *dstp = BLEND16_50(d, s, mask);
1843 srcp++;
1844 dstp++;
1845 w--;
1846 }
1847 /* srcp and dstp are now 32-bit aligned */
1848
1849 while(w > 1) {
1850 Uint32 sw = *(Uint32 *)srcp;
1851 Uint32 dw = *(Uint32 *)dstp;
1852 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1853 srcp += 2;
1854 dstp += 2;
1855 w -= 2;
1856 }
1857
1858 /* last odd pixel? */
1859 if(w) {
1860 Uint16 d = *dstp, s = *srcp;
1861 *dstp = BLEND16_50(d, s, mask);
1862 srcp++;
1863 dstp++;
1864 }
1865 srcp += srcskip;
1866 dstp += dstskip;
1867 }
1868 }
1869}
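/*
 * Editorial note (not in the original source) on the misaligned branch
 * above: when srcp and dstp differ in 2-byte phase, only dstp is brought to
 * 32-bit alignment.  prev_sw always holds the last aligned source word that
 * was read, so each iteration stitches the pending source halfword onto the
 * newly read word and still does one aligned 32-bit load from srcp and one
 * aligned 32-bit store to dstp per pixel pair; the closing
 * "srcp += srcskip - 1" undoes the extra srcp++ used to realign the source.
 */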
1870
1871#if GCC_ASMBLIT
1872/* fast RGB565->RGB565 blending with surface alpha */
1873static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1874{
1875	unsigned alpha = info->src->alpha;
1876 if(alpha == 128) {
1877 Blit16to16SurfaceAlpha128(info, 0xf7de);
1878 } else {
1879 int width = info->d_width;
1880 int height = info->d_height;
1881 Uint16 *srcp = (Uint16 *)info->s_pixels;
1882 int srcskip = info->s_skip >> 1;
1883 Uint16 *dstp = (Uint16 *)info->d_pixels;
1884 int dstskip = info->d_skip >> 1;
1885 Uint32 s, d;
1886 Uint64 load;
1887
1888		alpha &= ~(1+2+4);	/* cut alpha to a multiple of 8 so the MMX blend matches the scalar >>3 blend exactly */
1889 load = alpha;
1890 alpha >>= 3; /* downscale alpha to 5 bits */
1891
1892 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1893 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1894 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1895 /* position alpha to allow for mullo and mulhi on diff channels
1896 to reduce the number of operations */
1897 psllq_i2r(3, mm0);
1898
1899 /* Setup the 565 color channel masks */
1900 load = 0x07E007E007E007E0ULL;
1901 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1902 load = 0x001F001F001F001FULL;
1903 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1904 while(height--) {
1905 DUFFS_LOOP_QUATRO2(
1906 {
1907 s = *srcp++;
1908 d = *dstp;
1909 /*
1910 * shift out the middle component (green) to
1911 * the high 16 bits, and process all three RGB
1912 * components at the same time.
1913 */
1914 s = (s | s << 16) & 0x07e0f81f;
1915 d = (d | d << 16) & 0x07e0f81f;
1916 d += (s - d) * alpha >> 5;
1917 d &= 0x07e0f81f;
1918 *dstp++ = d | d >> 16;
1919 },{
1920 s = *srcp++;
1921 d = *dstp;
1922 /*
1923 * shift out the middle component (green) to
1924 * the high 16 bits, and process all three RGB
1925 * components at the same time.
1926 */
1927 s = (s | s << 16) & 0x07e0f81f;
1928 d = (d | d << 16) & 0x07e0f81f;
1929 d += (s - d) * alpha >> 5;
1930 d &= 0x07e0f81f;
1931 *dstp++ = d | d >> 16;
1932 s = *srcp++;
1933 d = *dstp;
1934 /*
1935 * shift out the middle component (green) to
1936 * the high 16 bits, and process all three RGB
1937 * components at the same time.
1938 */
1939 s = (s | s << 16) & 0x07e0f81f;
1940 d = (d | d << 16) & 0x07e0f81f;
1941 d += (s - d) * alpha >> 5;
1942 d &= 0x07e0f81f;
1943 *dstp++ = d | d >> 16;
1944 },{
1945 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1946 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1947
1948 /* red -- does not need a mask since the right shift clears
1949 the uninteresting bits */
1950 movq_r2r(mm2, mm5); /* src -> mm5 */
1951 movq_r2r(mm3, mm6); /* dst -> mm6 */
1952 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1953 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1954
1955 /* blend */
1956 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1957 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1958 /* alpha used is actually 11 bits
1959 11 + 5 = 16 bits, so the sign bits are lost */
1960 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1961 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1962 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1963
1964 movq_r2r(mm6, mm1); /* save new reds in dsts */
1965
1966 /* green -- process the bits in place */
1967 movq_r2r(mm2, mm5); /* src -> mm5 */
1968 movq_r2r(mm3, mm6); /* dst -> mm6 */
1969 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1970 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1971
1972 /* blend */
1973 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1974 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1975 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
1976					   bits are gone and the sign bits are present */
1977 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1978 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1979
1980 por_r2r(mm6, mm1); /* save new greens in dsts */
1981
1982 /* blue */
1983 movq_r2r(mm2, mm5); /* src -> mm5 */
1984 movq_r2r(mm3, mm6); /* dst -> mm6 */
1985 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1986 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1987
1988 /* blend */
1989 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1990 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1991 /* 11 + 5 = 16 bits, so the sign bits are lost and
1992 the interesting bits will need to be MASKed */
1993 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1994 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1995 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1996
1997 por_r2r(mm6, mm1); /* save new blues in dsts */
1998
1999 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2000
2001 srcp += 4;
2002 dstp += 4;
2003 }, width);
2004 srcp += srcskip;
2005 dstp += dstskip;
2006 }
2007 emms();
2008 }
2009}
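/*
 * Editorial note (not in the original source) on the psllq_i2r(3, mm0)
 * setup above: every 16-bit lane of mm0 ends up holding alpha << 3, an
 * 11-bit value.  Channels handled at bits 0-4 (red after its psrlw 11, and
 * blue) use pmullw followed by ">> 11", i.e. (diff * (alpha << 3)) >> 11 ==
 * diff * alpha >> 8.  Green, left in place at bits 5-10, uses pmulhw
 * followed by "<< 5", i.e. (((diff << 5) * (alpha << 3)) >> 16) << 5, which
 * is the same diff * alpha >> 8 put back in position.  One alpha register
 * therefore serves both the low and the high multiply, modulo the sign
 * caveats already noted in the comments; the 555 variant below applies the
 * same pmulhw trick to red in place.
 */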
2010
2011/* fast RGB555->RGB555 blending with surface alpha */
2012static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2013{
2014	unsigned alpha = info->src->alpha;
2015 if(alpha == 128) {
2016 Blit16to16SurfaceAlpha128(info, 0xfbde);
2017 } else {
2018 int width = info->d_width;
2019 int height = info->d_height;
2020 Uint16 *srcp = (Uint16 *)info->s_pixels;
2021 int srcskip = info->s_skip >> 1;
2022 Uint16 *dstp = (Uint16 *)info->d_pixels;
2023 int dstskip = info->d_skip >> 1;
2024 Uint32 s, d;
2025 Uint64 load;
2026
2027		alpha &= ~(1+2+4);	/* cut alpha to a multiple of 8 so the MMX blend matches the scalar >>3 blend exactly */
2028 load = alpha;
2029 alpha >>= 3; /* downscale alpha to 5 bits */
2030
2031 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2032 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2033 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2034 /* position alpha to allow for mullo and mulhi on diff channels
2035 to reduce the number of operations */
2036 psllq_i2r(3, mm0);
2037
2038 /* Setup the 555 color channel masks */
2039 load = 0x03E003E003E003E0ULL;
2040 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2041 load = 0x001F001F001F001FULL;
2042 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2043 while(height--) {
2044 DUFFS_LOOP_QUATRO2(
2045 {
2046 s = *srcp++;
2047 d = *dstp;
2048 /*
2049 * shift out the middle component (green) to
2050 * the high 16 bits, and process all three RGB
2051 * components at the same time.
2052 */
2053 s = (s | s << 16) & 0x03e07c1f;
2054 d = (d | d << 16) & 0x03e07c1f;
2055 d += (s - d) * alpha >> 5;
2056 d &= 0x03e07c1f;
2057 *dstp++ = d | d >> 16;
2058 },{
2059 s = *srcp++;
2060 d = *dstp;
2061 /*
2062 * shift out the middle component (green) to
2063 * the high 16 bits, and process all three RGB
2064 * components at the same time.
2065 */
2066 s = (s | s << 16) & 0x03e07c1f;
2067 d = (d | d << 16) & 0x03e07c1f;
2068 d += (s - d) * alpha >> 5;
2069 d &= 0x03e07c1f;
2070 *dstp++ = d | d >> 16;
2071 s = *srcp++;
2072 d = *dstp;
2073 /*
2074 * shift out the middle component (green) to
2075 * the high 16 bits, and process all three RGB
2076 * components at the same time.
2077 */
2078 s = (s | s << 16) & 0x03e07c1f;
2079 d = (d | d << 16) & 0x03e07c1f;
2080 d += (s - d) * alpha >> 5;
2081 d &= 0x03e07c1f;
2082 *dstp++ = d | d >> 16;
2083 },{
2084 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2085 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2086
2087 /* red -- process the bits in place */
2088 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2089 /* by reusing the GREEN mask we free up another mmx
2090 register to accumulate the result */
2091
2092 movq_r2r(mm2, mm5); /* src -> mm5 */
2093 movq_r2r(mm3, mm6); /* dst -> mm6 */
2094 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2095 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2096
2097 /* blend */
2098 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2099 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2100 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2101 cleared by a MASK below */
2102 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2103 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2104 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2105
2106 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2107
2108 movq_r2r(mm6, mm1); /* save new reds in dsts */
2109
2110 /* green -- process the bits in place */
2111 movq_r2r(mm2, mm5); /* src -> mm5 */
2112 movq_r2r(mm3, mm6); /* dst -> mm6 */
2113 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2114 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2115
2116 /* blend */
2117 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2118 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2119 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2120					   bits are gone and the sign bits are present */
2121 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2122 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2123
2124 por_r2r(mm6, mm1); /* save new greens in dsts */
2125
2126 /* blue */
2127 movq_r2r(mm2, mm5); /* src -> mm5 */
2128 movq_r2r(mm3, mm6); /* dst -> mm6 */
2129 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2130 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2131
2132 /* blend */
2133 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2134 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2135 /* 11 + 5 = 16 bits, so the sign bits are lost and
2136 the interesting bits will need to be MASKed */
2137 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2138 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2139 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2140
2141 por_r2r(mm6, mm1); /* save new blues in dsts */
2142
2143 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2144
2145 srcp += 4;
2146 dstp += 4;
2147 }, width);
2148 srcp += srcskip;
2149 dstp += dstskip;
2150 }
2151 emms();
2152 }
2153}
2154/* End GCC_ASMBLIT */
2155
2156#elif MSVC_ASMBLIT
2157/* fast RGB565->RGB565 blending with surface alpha */
2158static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2159{
2160 unsigned alpha = info->src->alpha;
2161 if(alpha == 128) {
2162 Blit16to16SurfaceAlpha128(info, 0xf7de);
2163 } else {
2164 int width = info->d_width;
2165 int height = info->d_height;
2166 Uint16 *srcp = (Uint16 *)info->s_pixels;
2167 int srcskip = info->s_skip >> 1;
2168 Uint16 *dstp = (Uint16 *)info->d_pixels;
2169 int dstskip = info->d_skip >> 1;
2170 Uint32 s, d;
2171
2172 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2173
2174		alpha &= ~(1+2+4);	/* cut alpha to a multiple of 8 so the MMX blend matches the scalar >>3 blend exactly */
2175 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2176 alpha >>= 3; /* downscale alpha to 5 bits */
2177
2178 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2179 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2180 /* position alpha to allow for mullo and mulhi on diff channels
2181 to reduce the number of operations */
2182 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2183
2184 /* Setup the 565 color channel masks */
2185 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2186 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2187
2188 while(height--) {
2189 DUFFS_LOOP_QUATRO2(
2190 {
2191 s = *srcp++;
2192 d = *dstp;
2193 /*
2194 * shift out the middle component (green) to
2195 * the high 16 bits, and process all three RGB
2196 * components at the same time.
2197 */
2198 s = (s | s << 16) & 0x07e0f81f;
2199 d = (d | d << 16) & 0x07e0f81f;
2200 d += (s - d) * alpha >> 5;
2201 d &= 0x07e0f81f;
2202 *dstp++ = (Uint16)(d | d >> 16);
2203 },{
2204 s = *srcp++;
2205 d = *dstp;
2206 /*
2207 * shift out the middle component (green) to
2208 * the high 16 bits, and process all three RGB
2209 * components at the same time.
2210 */
2211 s = (s | s << 16) & 0x07e0f81f;
2212 d = (d | d << 16) & 0x07e0f81f;
2213 d += (s - d) * alpha >> 5;
2214 d &= 0x07e0f81f;
2215 *dstp++ = (Uint16)(d | d >> 16);
2216 s = *srcp++;
2217 d = *dstp;
2218 /*
2219 * shift out the middle component (green) to
2220 * the high 16 bits, and process all three RGB
2221 * components at the same time.
2222 */
2223 s = (s | s << 16) & 0x07e0f81f;
2224 d = (d | d << 16) & 0x07e0f81f;
2225 d += (s - d) * alpha >> 5;
2226 d &= 0x07e0f81f;
2227 *dstp++ = (Uint16)(d | d >> 16);
2228 },{
2229 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2230 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2231
2232 /* red */
2233 src2 = src1;
2234 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2235
2236 dst2 = dst1;
2237 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2238
2239 /* blend */
2240 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2241 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2242 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2243 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2244 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2245
2246 mm_res = dst2; /* RED -> mm_res */
2247
2248 /* green -- process the bits in place */
2249 src2 = src1;
2250 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2251
2252 dst2 = dst1;
2253 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2254
2255 /* blend */
2256 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2257 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2258 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2259 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2260
2261 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2262
2263 /* blue */
2264 src2 = src1;
2265 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2266
2267 dst2 = dst1;
2268 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2269
2270 /* blend */
2271 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2272 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2273 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2274 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2275 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2276
2277 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2278
2279 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2280
2281 srcp += 4;
2282 dstp += 4;
2283 }, width);
2284 srcp += srcskip;
2285 dstp += dstskip;
2286 }
2287 _mm_empty();
2288 }
2289}
2290
2291/* fast RGB555->RGB555 blending with surface alpha */
2292static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2293{
2294 unsigned alpha = info->src->alpha;
2295 if(alpha == 128) {
2296 Blit16to16SurfaceAlpha128(info, 0xfbde);
2297 } else {
2298 int width = info->d_width;
2299 int height = info->d_height;
2300 Uint16 *srcp = (Uint16 *)info->s_pixels;
2301 int srcskip = info->s_skip >> 1;
2302 Uint16 *dstp = (Uint16 *)info->d_pixels;
2303 int dstskip = info->d_skip >> 1;
2304 Uint32 s, d;
2305
2306 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2307
2308		alpha &= ~(1+2+4);	/* cut alpha to a multiple of 8 so the MMX blend matches the scalar >>3 blend exactly */
2309 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2310 alpha >>= 3; /* downscale alpha to 5 bits */
2311
2312 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2313 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2314 /* position alpha to allow for mullo and mulhi on diff channels
2315 to reduce the number of operations */
2316 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2317
2318 /* Setup the 555 color channel masks */
2319 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2320 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2321 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2322
2323 while(height--) {
2324 DUFFS_LOOP_QUATRO2(
2325 {
2326 s = *srcp++;
2327 d = *dstp;
2328 /*
2329 * shift out the middle component (green) to
2330 * the high 16 bits, and process all three RGB
2331 * components at the same time.
2332 */
2333 s = (s | s << 16) & 0x03e07c1f;
2334 d = (d | d << 16) & 0x03e07c1f;
2335 d += (s - d) * alpha >> 5;
2336 d &= 0x03e07c1f;
2337 *dstp++ = (Uint16)(d | d >> 16);
2338 },{
2339 s = *srcp++;
2340 d = *dstp;
2341 /*
2342 * shift out the middle component (green) to
2343 * the high 16 bits, and process all three RGB
2344 * components at the same time.
2345 */
2346 s = (s | s << 16) & 0x03e07c1f;
2347 d = (d | d << 16) & 0x03e07c1f;
2348 d += (s - d) * alpha >> 5;
2349 d &= 0x03e07c1f;
2350 *dstp++ = (Uint16)(d | d >> 16);
2351 s = *srcp++;
2352 d = *dstp;
2353 /*
2354 * shift out the middle component (green) to
2355 * the high 16 bits, and process all three RGB
2356 * components at the same time.
2357 */
2358 s = (s | s << 16) & 0x03e07c1f;
2359 d = (d | d << 16) & 0x03e07c1f;
2360 d += (s - d) * alpha >> 5;
2361 d &= 0x03e07c1f;
2362 *dstp++ = (Uint16)(d | d >> 16);
2363 },{
2364 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2365 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2366
2367 /* red -- process the bits in place */
2368 src2 = src1;
2369 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2370
2371 dst2 = dst1;
2372 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2373
2374 /* blend */
2375 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2376 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2377 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2378 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2379 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2380
2381 mm_res = dst2; /* RED -> mm_res */
2382
2383 /* green -- process the bits in place */
2384 src2 = src1;
2385 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2386
2387 dst2 = dst1;
2388 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2389
2390 /* blend */
2391 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2392 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2393 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2394 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2395
2396 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2397
2398 /* blue */
2399 src2 = src1; /* src -> src2 */
2400 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2401
2402 dst2 = dst1; /* dst -> dst2 */
2403 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2404
2405 /* blend */
2406 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2407 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2408 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2409 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2410 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2411
2412 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2413
2414 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2415
2416 srcp += 4;
2417 dstp += 4;
2418 }, width);
2419 srcp += srcskip;
2420 dstp += dstskip;
2421 }
2422 _mm_empty();
2423 }
2424}
2425#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2426
2427/* fast RGB565->RGB565 blending with surface alpha */
2428static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2429{
2430 unsigned alpha = info->src->alpha;
2431 if(alpha == 128) {
2432 Blit16to16SurfaceAlpha128(info, 0xf7de);
2433 } else {
2434 int width = info->d_width;
2435 int height = info->d_height;
2436 Uint16 *srcp = (Uint16 *)info->s_pixels;
2437 int srcskip = info->s_skip >> 1;
2438 Uint16 *dstp = (Uint16 *)info->d_pixels;
2439 int dstskip = info->d_skip >> 1;
2440 alpha >>= 3; /* downscale alpha to 5 bits */
2441
2442 while(height--) {
2443 DUFFS_LOOP4({
2444 Uint32 s = *srcp++;
2445 Uint32 d = *dstp;
2446 /*
2447 * shift out the middle component (green) to
2448 * the high 16 bits, and process all three RGB
2449 * components at the same time.
2450 */
2451 s = (s | s << 16) & 0x07e0f81f;
2452 d = (d | d << 16) & 0x07e0f81f;
2453 d += (s - d) * alpha >> 5;
2454 d &= 0x07e0f81f;
2455 *dstp++ = (Uint16)(d | d >> 16);
2456 }, width);
2457 srcp += srcskip;
2458 dstp += dstskip;
2459 }
2460 }
2461}
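/*
 * Editorial sketch (not part of the original code): the
 * "(x | x << 16) & 0x07e0f81f" step used throughout the 565 blenders
 * duplicates the pixel into both halves of a 32-bit word and keeps green in
 * the high half (bits 21-26) with red and blue in the low half, so every
 * channel has spare zero bits above it and a single multiply/shift blends
 * all three at once, as the comments above describe.  A hypothetical
 * single-pixel helper equivalent to one loop iteration of
 * Blit565to565SurfaceAlpha:
 */
#if 0	/* illustrative only, never compiled */
static Uint16 Blend565_helper(Uint16 dst, Uint16 src, unsigned alpha5)
{
	Uint32 s = src;
	Uint32 d = dst;

	s = (s | s << 16) & 0x07e0f81f;	/* 00000GGG GGG00000 RRRRR000 000BBBBB */
	d = (d | d << 16) & 0x07e0f81f;
	d += (s - d) * alpha5 >> 5;	/* alpha5 == surface alpha >> 3 */
	d &= 0x07e0f81f;
	return (Uint16)(d | d >> 16);
}
#endif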
2462
2463/* fast RGB555->RGB555 blending with surface alpha */
2464static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2465{
2466	unsigned alpha = info->src->alpha;
2467 if(alpha == 128) {
2468 Blit16to16SurfaceAlpha128(info, 0xfbde);
2469 } else {
2470 int width = info->d_width;
2471 int height = info->d_height;
2472 Uint16 *srcp = (Uint16 *)info->s_pixels;
2473 int srcskip = info->s_skip >> 1;
2474 Uint16 *dstp = (Uint16 *)info->d_pixels;
2475 int dstskip = info->d_skip >> 1;
2476 alpha >>= 3; /* downscale alpha to 5 bits */
2477
2478 while(height--) {
2479 DUFFS_LOOP4({
2480 Uint32 s = *srcp++;
2481 Uint32 d = *dstp;
2482 /*
2483 * shift out the middle component (green) to
2484 * the high 16 bits, and process all three RGB
2485 * components at the same time.
2486 */
2487 s = (s | s << 16) & 0x03e07c1f;
2488 d = (d | d << 16) & 0x03e07c1f;
2489 d += (s - d) * alpha >> 5;
2490 d &= 0x03e07c1f;
2491 *dstp++ = (Uint16)(d | d >> 16);
2492 }, width);
2493 srcp += srcskip;
2494 dstp += dstskip;
2495 }
2496 }
2497}
2498
2499/* fast ARGB8888->RGB565 blending with pixel alpha */
2500static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2501{
2502 int width = info->d_width;
2503 int height = info->d_height;
2504 Uint32 *srcp = (Uint32 *)info->s_pixels;
2505 int srcskip = info->s_skip >> 2;
2506 Uint16 *dstp = (Uint16 *)info->d_pixels;
2507 int dstskip = info->d_skip >> 1;
2508
2509 while(height--) {
2510 DUFFS_LOOP4({
2511 Uint32 s = *srcp;
2512 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2513 /* FIXME: Here we special-case opaque alpha since the
2514		   compositing used (>>8 instead of /255) doesn't handle
2515 it correctly. Also special-case alpha=0 for speed?
2516 Benchmark this! */
2517 if(alpha) {
2518 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2519 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
2520 } else {
2521 Uint32 d = *dstp;
2522 /*
2523 * convert source and destination to G0RAB65565
2524 * and blend all components at the same time
2525 */
2526 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2527 + (s >> 3 & 0x1f);
2528 d = (d | d << 16) & 0x07e0f81f;
2529 d += (s - d) * alpha >> 5;
2530 d &= 0x07e0f81f;
2531 *dstp = (Uint16)(d | d >> 16);
2532 }
2533 }
2534 srcp++;
2535 dstp++;
2536 }, width);
2537 srcp += srcskip;
2538 dstp += dstskip;
2539 }
2540}
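/*
 * Editorial note (not in the original source), a worked example for the
 * FIXME above: with 5-bit alpha the blend is d + ((s - d) * alpha >> 5)
 * rather than a /31 divide, so for alpha == 31, s == 31, d == 0 the result
 * is 961 >> 5 == 30, one step short of the source value.  That is why a
 * fully opaque pixel (alpha == SDL_ALPHA_OPAQUE >> 3) is converted directly
 * instead of being blended.
 */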
2541
2542/* fast ARGB8888->RGB555 blending with pixel alpha */
2543static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2544{
2545 int width = info->d_width;
2546 int height = info->d_height;
2547 Uint32 *srcp = (Uint32 *)info->s_pixels;
2548 int srcskip = info->s_skip >> 2;
2549 Uint16 *dstp = (Uint16 *)info->d_pixels;
2550 int dstskip = info->d_skip >> 1;
2551
2552 while(height--) {
2553 DUFFS_LOOP4({
2554 unsigned alpha;
2555 Uint32 s = *srcp;
2556 alpha = s >> 27; /* downscale alpha to 5 bits */
2557 /* FIXME: Here we special-case opaque alpha since the
2558		   compositing used (>>8 instead of /255) doesn't handle
2559 it correctly. Also special-case alpha=0 for speed?
2560 Benchmark this! */
2561 if(alpha) {
2562 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2563 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
2564 } else {
2565 Uint32 d = *dstp;
2566 /*
2567 * convert source and destination to G0RAB65565
2568 * and blend all components at the same time
2569 */
2570 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2571 + (s >> 3 & 0x1f);
2572 d = (d | d << 16) & 0x03e07c1f;
2573 d += (s - d) * alpha >> 5;
2574 d &= 0x03e07c1f;
2575 *dstp = (Uint16)(d | d >> 16);
2576 }
2577 }
2578 srcp++;
2579 dstp++;
2580 }, width);
2581 srcp += srcskip;
2582 dstp += dstskip;
2583 }
2584}
2585
2586/* General (slow) N->N blending with per-surface alpha */
2587static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2588{
2589 int width = info->d_width;
2590 int height = info->d_height;
2591 Uint8 *src = info->s_pixels;
2592 int srcskip = info->s_skip;
2593 Uint8 *dst = info->d_pixels;
2594 int dstskip = info->d_skip;
2595 SDL_PixelFormat *srcfmt = info->src;
2596 SDL_PixelFormat *dstfmt = info->dst;
2597 int srcbpp = srcfmt->BytesPerPixel;
2598 int dstbpp = dstfmt->BytesPerPixel;
2599 unsigned sA = srcfmt->alpha;
2600 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2601
2602 if(sA) {
2603 while ( height-- ) {
2604 DUFFS_LOOP4(
2605 {
2606 Uint32 Pixel;
2607 unsigned sR;
2608 unsigned sG;
2609 unsigned sB;
2610 unsigned dR;
2611 unsigned dG;
2612 unsigned dB;
2613 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2614 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2615 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2616 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2617 src += srcbpp;
2618 dst += dstbpp;
2619 },
2620 width);
2621 src += srcskip;
2622 dst += dstskip;
2623 }
2624 }
2625}
2626
2627/* General (slow) colorkeyed N->N blending with per-surface alpha */
2628static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2629{
2630 int width = info->d_width;
2631 int height = info->d_height;
2632 Uint8 *src = info->s_pixels;
2633 int srcskip = info->s_skip;
2634 Uint8 *dst = info->d_pixels;
2635 int dstskip = info->d_skip;
2636 SDL_PixelFormat *srcfmt = info->src;
2637 SDL_PixelFormat *dstfmt = info->dst;
2638 Uint32 ckey = srcfmt->colorkey;
2639 int srcbpp = srcfmt->BytesPerPixel;
2640 int dstbpp = dstfmt->BytesPerPixel;
2641 unsigned sA = srcfmt->alpha;
2642 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2643
211e4bff 2644 if (srcbpp == 2 && srcfmt->Gmask == 0x7e0 && dstbpp == 2 && dstfmt->Gmask == 0x7e0) {
2645 Uint16 *src16 = (Uint16 *)src;
2646 Uint16 *dst16 = (Uint16 *)dst;
2647 sA >>= 3; /* downscale alpha to 5 bits */
2648 while ( height-- ) {
2649 DUFFS_LOOP4(
2650 {
2651 Uint32 s;
2652 Uint32 d;
2653 s = *src16;
2654 if(sA && s != ckey) {
2655 d = *dst16;
2656 s = (s | s << 16) & 0x07e0f81f;
2657 d = (d | d << 16) & 0x07e0f81f;
2658 d += (s - d) * sA >> 5;
2659 d &= 0x07e0f81f;
2660 *dst16 = (Uint16)(d | d >> 16);
2661 }
2662 src16++;
2663 dst16++;
2664 },
2665 width);
2666 src16 += srcskip / 2;
2667 dst16 += dstskip / 2;
2668 }
2669 return;
2670 }
2671
e14743d1 2672 while ( height-- ) {
2673 DUFFS_LOOP4(
2674 {
2675 Uint32 Pixel;
2676 unsigned sR;
2677 unsigned sG;
2678 unsigned sB;
2679 unsigned dR;
2680 unsigned dG;
2681 unsigned dB;
2682 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2683 if(sA && Pixel != ckey) {
2684 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2685 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2686 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2687 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2688 }
2689 src += srcbpp;
2690 dst += dstbpp;
2691 },
2692 width);
2693 src += srcskip;
2694 dst += dstskip;
2695 }
2696}
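/*
 * Editorial usage sketch (not part of the original code): the 16bpp fast
 * path at the top of BlitNtoNSurfaceAlphaKey is taken when source and
 * destination are both RGB565 (Gmask == 0x7e0) and the source carries both
 * a colorkey and a per-surface alpha, e.g. (surface names hypothetical):
 *
 *	SDL_SetColorKey(sprite, SDL_SRCCOLORKEY, key);
 *	SDL_SetAlpha(sprite, SDL_SRCALPHA, 160);
 *	SDL_BlitSurface(sprite, NULL, screen565, &dstrect);
 *
 * Other format combinations fall through to the generic per-pixel loop in
 * the second half of the function.
 */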
2697
2698/* General (slow) N->N blending with pixel alpha */
2699static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2700{
2701 int width = info->d_width;
2702 int height = info->d_height;
2703 Uint8 *src = info->s_pixels;
2704 int srcskip = info->s_skip;
2705 Uint8 *dst = info->d_pixels;
2706 int dstskip = info->d_skip;
2707 SDL_PixelFormat *srcfmt = info->src;
2708 SDL_PixelFormat *dstfmt = info->dst;
2709
2710 int srcbpp;
2711 int dstbpp;
2712
2713 /* Set up some basic variables */
2714 srcbpp = srcfmt->BytesPerPixel;
2715 dstbpp = dstfmt->BytesPerPixel;
2716
2717 /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2718 quite right. for <8bpp source alpha, it gets them very wrong
2719 (check all macros!)
2720 It is unclear whether there is a good general solution that doesn't
2721 need a branch (or a divide). */
2722 while ( height-- ) {
2723 DUFFS_LOOP4(
2724 {
2725 Uint32 Pixel;
2726 unsigned sR;
2727 unsigned sG;
2728 unsigned sB;
2729 unsigned dR;
2730 unsigned dG;
2731 unsigned dB;
2732 unsigned sA;
2733 unsigned dA;
2734 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2735 if(sA) {
2736 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2737 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2738 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2739 }
2740 src += srcbpp;
2741 dst += dstbpp;
2742 },
2743 width);
2744 src += srcskip;
2745 dst += dstskip;
2746 }
2747}
2748
2749
2750SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2751{
2752 SDL_PixelFormat *sf = surface->format;
2753 SDL_PixelFormat *df = surface->map->dst->format;
2754
2755 if(sf->Amask == 0) {
2756 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2757 if(df->BytesPerPixel == 1)
2758 return BlitNto1SurfaceAlphaKey;
2759 else
2760#if SDL_ALTIVEC_BLITTERS
2761 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2762 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2763 return Blit32to32SurfaceAlphaKeyAltivec;
2764 else
2765#endif
2766 return BlitNtoNSurfaceAlphaKey;
2767 } else {
2768 /* Per-surface alpha blits */
2769 switch(df->BytesPerPixel) {
2770 case 1:
2771 return BlitNto1SurfaceAlpha;
2772
2773 case 2:
2774 if(surface->map->identity) {
2775 if(df->Gmask == 0x7e0)
2776 {
2777#if MMX_ASMBLIT
2778 if(SDL_HasMMX())
2779 return Blit565to565SurfaceAlphaMMX;
2780 else
2781#endif
2782 return Blit565to565SurfaceAlpha;
2783 }
2784 else if(df->Gmask == 0x3e0)
2785 {
2786#if MMX_ASMBLIT
2787 if(SDL_HasMMX())
2788 return Blit555to555SurfaceAlphaMMX;
2789 else
2790#endif
2791 return Blit555to555SurfaceAlpha;
2792 }
2793 }
2794 return BlitNtoNSurfaceAlpha;
2795
2796 case 4:
2797 if(sf->Rmask == df->Rmask
2798 && sf->Gmask == df->Gmask
2799 && sf->Bmask == df->Bmask
2800 && sf->BytesPerPixel == 4)
2801 {
2802#if MMX_ASMBLIT
2803 if(sf->Rshift % 8 == 0
2804 && sf->Gshift % 8 == 0
2805 && sf->Bshift % 8 == 0
2806 && SDL_HasMMX())
2807 return BlitRGBtoRGBSurfaceAlphaMMX;
2808#endif
2809 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2810 {
2811#if SDL_ALTIVEC_BLITTERS
2812 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2813 && SDL_HasAltiVec())
2814 return BlitRGBtoRGBSurfaceAlphaAltivec;
2815#endif
2816 return BlitRGBtoRGBSurfaceAlpha;
2817 }
2818 }
2819#if SDL_ALTIVEC_BLITTERS
2820 if((sf->BytesPerPixel == 4) &&
2821 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2822 return Blit32to32SurfaceAlphaAltivec;
2823 else
2824#endif
2825 return BlitNtoNSurfaceAlpha;
2826
2827 case 3:
2828 default:
2829 return BlitNtoNSurfaceAlpha;
2830 }
2831 }
2832 } else {
2833 /* Per-pixel alpha blits */
2834 switch(df->BytesPerPixel) {
2835 case 1:
2836 return BlitNto1PixelAlpha;
2837
2838 case 2:
2839#if SDL_ALTIVEC_BLITTERS
2840 if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2841 df->Gmask == 0x7e0 &&
2842 df->Bmask == 0x1f && SDL_HasAltiVec())
2843 return Blit32to565PixelAlphaAltivec;
2844 else
2845#endif
2846 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2847 && sf->Gmask == 0xff00
2848 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2849 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2850 if(df->Gmask == 0x7e0)
2851 return BlitARGBto565PixelAlpha;
2852 else if(df->Gmask == 0x3e0)
2853 return BlitARGBto555PixelAlpha;
2854 }
2855 return BlitNtoNPixelAlpha;
2856
2857 case 4:
2858 if(sf->Rmask == df->Rmask
2859 && sf->Gmask == df->Gmask
2860 && sf->Bmask == df->Bmask
2861 && sf->BytesPerPixel == 4)
2862 {
2863#if MMX_ASMBLIT
2864 if(sf->Rshift % 8 == 0
2865 && sf->Gshift % 8 == 0
2866 && sf->Bshift % 8 == 0
2867 && sf->Ashift % 8 == 0
2868 && sf->Aloss == 0)
2869 {
2870 if(SDL_Has3DNow())
2871 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2872 if(SDL_HasMMX())
2873 return BlitRGBtoRGBPixelAlphaMMX;
2874 }
2875#endif
2876 if(sf->Amask == 0xff000000)
2877 {
2878#if SDL_ALTIVEC_BLITTERS
2879 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2880 && SDL_HasAltiVec())
2881 return BlitRGBtoRGBPixelAlphaAltivec;
2882#endif
2883 return BlitRGBtoRGBPixelAlpha;
2884 }
2885 }
2886#if SDL_ALTIVEC_BLITTERS
2887 if (sf->Amask && sf->BytesPerPixel == 4 &&
2888 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2889 return Blit32to32PixelAlphaAltivec;
2890 else
2891#endif
2892 return BlitNtoNPixelAlpha;
2893
2894 case 3:
2895 default:
2896 return BlitNtoNPixelAlpha;
2897 }
2898 }
2899}
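/*
 * Editorial note (not part of the original source), reading the dispatch
 * above: a 32bpp source with Amask 0xff000000, Gmask 0xff00 and blue in the
 * low byte blitted onto RGB565 resolves to BlitARGBto565PixelAlpha; the
 * same source onto a 32bpp destination with matching RGB masks resolves to
 * BlitRGBtoRGBPixelAlpha (or its MMX/3DNow! variants when SDL_HasMMX() or
 * SDL_Has3DNow() report support); and a colorkeyed, per-surface-alpha blit
 * onto any non-8bpp destination goes through BlitNtoNSurfaceAlphaKey, which
 * now contains the RGB565 fast path.
 */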
2900