src/video/SDL_blit_A.c

   1 /*
   2     SDL - Simple DirectMedia Layer
   3     Copyright (C) 1997-2009 Sam Lantinga
   4
   5     This library is free software; you can redistribute it and/or
   6     modify it under the terms of the GNU Lesser General Public
   7     License as published by the Free Software Foundation; either
   8     version 2.1 of the License, or (at your option) any later version.
   9
  10     This library is distributed in the hope that it will be useful,
  11     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13     Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with this library; if not, write to the Free Software
  17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19     Sam Lantinga
  20     slouken@libsdl.org
  21 */
  22 #include "SDL_config.h"
  23
  24 #include "SDL_video.h"
  25 #include "SDL_blit.h"
  26
  27 /*
  28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
   29    Checking if _mm_free is #defined in malloc.h is the only way to
  30    determine if the Processor Pack is installed, as far as I can tell.
  31 */
  32
   33 #if SDL_ASSEMBLY_ROUTINES
      /* Decide which MMX implementation (if any) is compiled below.
         MMX_ASMBLIT is defined whenever one of the two variants is chosen;
         GCC_ASMBLIT or MSVC_ASMBLIT says which variant it is. */
   34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   35 #    define MMX_ASMBLIT 1
   36 #    define GCC_ASMBLIT 1
   37 #  elif defined(_MSC_VER) && defined(_M_IX86)
   38 #    if (_MSC_VER <= 1200)
      /* VC6 or older: mmintrin.h is only assumed present when malloc.h
         defines _mm_free. */
   39 #      include <malloc.h>
   40 #      if defined(_mm_free)
   41 #          define HAVE_MMINTRIN_H 1
   42 #      endif
   43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
   44 #      define HAVE_MMINTRIN_H 1
   45 #    endif
   46 #    if HAVE_MMINTRIN_H
   47 #      define MMX_ASMBLIT 1
   48 #      define MSVC_ASMBLIT 1
   49 #    endif
   50 #  endif
   51 #endif /* SDL_ASSEMBLY_ROUTINES */
  52
  53 /* Function to check the CPU flags */
  54 #include "SDL_cpuinfo.h"
  55 #if GCC_ASMBLIT
  56 #include "mmx.h"
  57 #elif MSVC_ASMBLIT
  58 #include <mmintrin.h>
  59 #include <mm3dnow.h>
  60 #endif
  61
  62 /* Functions to perform alpha blended blitting */
  63
   64 /* N->1 blending with per-surface alpha.
       *
       * Blends every source pixel over an 8-bit palettized destination using the
       * single alpha value stored in the source format (srcfmt->alpha).
       * For each pixel the source RGB is unpacked, the destination RGB is looked
       * up in the destination palette, the two are combined with ALPHA_BLEND and
       * the result is packed into a 3-3-2 (R-G-B) byte.  If info->table is not
       * NULL that byte is used as an index into the table to obtain the final
       * destination value; otherwise the byte is stored directly.
       *
       * info: blit description (pixel pointers, dimensions, skips, formats).
       */
   65 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
   66 {
   67         int width = info->d_width;
   68         int height = info->d_height;
   69         Uint8 *src = info->s_pixels;
   70         int srcskip = info->s_skip;
   71         Uint8 *dst = info->d_pixels;
   72         int dstskip = info->d_skip;
   73         Uint8 *palmap = info->table;
   74         SDL_PixelFormat *srcfmt = info->src;
   75         SDL_PixelFormat *dstfmt = info->dst;
   76         int srcbpp = srcfmt->BytesPerPixel;
   77
   78         const unsigned A = srcfmt->alpha;
   79
   80         while ( height-- ) {
                  /* The braced block is passed as an argument to DUFFS_LOOP4
                     (presumably a macro from SDL_blit.h), so each local is
                     declared on its own: a comma outside parentheses would be
                     taken as an argument separator. */
   81             DUFFS_LOOP4(
   82             {
   83                 Uint32 Pixel;
   84                 unsigned sR;
   85                 unsigned sG;
   86                 unsigned sB;
   87                 unsigned dR;
   88                 unsigned dG;
   89                 unsigned dB;
   90                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
                      /* current destination colour comes from the palette entry
                         the destination byte refers to */
   91                 dR = dstfmt->palette->colors[*dst].r;
   92                 dG = dstfmt->palette->colors[*dst].g;
   93                 dB = dstfmt->palette->colors[*dst].b;
   94                 ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
                      /* keep only the low 8 bits of each blended channel */
   95                 dR &= 0xff;
   96                 dG &= 0xff;
   97                 dB &= 0xff;
   98                 /* Pack RGB into 8bit pixel: top 3 bits of R, top 3 bits of
                         G, top 2 bits of B */
   99                 if ( palmap == NULL ) {
  100                     *dst =((dR>>5)<<(3+2))|
  101                           ((dG>>5)<<(2))|
  102                           ((dB>>6)<<(0));
  103                 } else {
  104                     *dst = palmap[((dR>>5)<<(3+2))|
  105                                   ((dG>>5)<<(2))  |
  106                                   ((dB>>6)<<(0))];
  107                 }
  108                 dst++;
  109                 src += srcbpp;
  110             },
  111             width);
                  /* step over the per-row skip at the end of each row */
  112             src += srcskip;
  113             dst += dstskip;
  114         }
  115 }
 116
  117 /* N->1 blending with pixel alpha.
       *
       * Same as the per-surface variant, except that the alpha value is read
       * from every source pixel (DISEMBLE_RGBA yields sA) instead of from the
       * source format.  The destination is an 8-bit palettized surface: its
       * current colour is looked up in the destination palette, blended with
       * ALPHA_BLEND, packed into a 3-3-2 (R-G-B) byte and, when info->table is
       * not NULL, translated through that table before being stored.
       *
       * info: blit description (pixel pointers, dimensions, skips, formats).
       */
  118 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
  119 {
  120         int width = info->d_width;
  121         int height = info->d_height;
  122         Uint8 *src = info->s_pixels;
  123         int srcskip = info->s_skip;
  124         Uint8 *dst = info->d_pixels;
  125         int dstskip = info->d_skip;
  126         Uint8 *palmap = info->table;
  127         SDL_PixelFormat *srcfmt = info->src;
  128         SDL_PixelFormat *dstfmt = info->dst;
  129         int srcbpp = srcfmt->BytesPerPixel;
  130
  131         /* FIXME: fix alpha bit field expansion here too? */
  132         while ( height-- ) {
                  /* The braced block is a DUFFS_LOOP4 argument (presumably a
                     macro), hence one declaration per statement: a bare comma
                     would split the argument. */
  133             DUFFS_LOOP4(
  134             {
  135                 Uint32 Pixel;
  136                 unsigned sR;
  137                 unsigned sG;
  138                 unsigned sB;
  139                 unsigned sA;
  140                 unsigned dR;
  141                 unsigned dG;
  142                 unsigned dB;
  143                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
                      /* destination colour = palette entry of the current byte */
  144                 dR = dstfmt->palette->colors[*dst].r;
  145                 dG = dstfmt->palette->colors[*dst].g;
  146                 dB = dstfmt->palette->colors[*dst].b;
  147                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
                      /* keep only the low 8 bits of each blended channel */
  148                 dR &= 0xff;
  149                 dG &= 0xff;
  150                 dB &= 0xff;
  151                 /* Pack RGB into 8bit pixel: top 3 bits of R, top 3 bits of
                         G, top 2 bits of B */
  152                 if ( palmap == NULL ) {
  153                     *dst =((dR>>5)<<(3+2))|
  154                           ((dG>>5)<<(2))|
  155                           ((dB>>6)<<(0));
  156                 } else {
  157                     *dst = palmap[((dR>>5)<<(3+2))|
  158                                   ((dG>>5)<<(2))  |
  159                                   ((dB>>6)<<(0))  ];
  160                 }
  161                 dst++;
  162                 src += srcbpp;
  163             },
  164             width);
                  /* step over the per-row skip at the end of each row */
  165             src += srcskip;
  166             dst += dstskip;
  167         }
  168 }
 169
 170 /* colorkeyed N->1 blending with per-surface alpha */
 171 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
 172 {
 173         int width = info->d_width;
 174         int height = info->d_height;
 175         Uint8 *src = info->s_pixels;
 176         int srcskip = info->s_skip;
 177         Uint8 *dst = info->d_pixels;
 178         int dstskip = info->d_skip;
 179         Uint8 *palmap = info->table;
 180         SDL_PixelFormat *srcfmt = info->src;
 181         SDL_PixelFormat *dstfmt = info->dst;
 182         int srcbpp = srcfmt->BytesPerPixel;
 183         Uint32 ckey = srcfmt->colorkey;
 184
 185         const int A = srcfmt->alpha;
 186
 187         while ( height-- ) {
 188             DUFFS_LOOP(
 189             {
 190                 Uint32 Pixel;
 191                 unsigned sR;
 192                 unsigned sG;
 193                 unsigned sB;
 194                 unsigned dR;
 195                 unsigned dG;
 196                 unsigned dB;
 197                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
 198                 if ( Pixel != ckey ) {
 199                     dR = dstfmt->palette->colors[*dst].r;
 200                     dG = dstfmt->palette->colors[*dst].g;
 201                     dB = dstfmt->palette->colors[*dst].b;
 202                     ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
 203                     dR &= 0xff;
 204                     dG &= 0xff;
 205                     dB &= 0xff;
 206                     /* Pack RGB into 8bit pixel */
 207                     if ( palmap == NULL ) {
 208                         *dst =((dR>>5)<<(3+2))|
 209                               ((dG>>5)<<(2)) |
 210                               ((dB>>6)<<(0));
 211                     } else {
 212                         *dst = palmap[((dR>>5)<<(3+2))|
 213                                       ((dG>>5)<<(2))  |
 214                                       ((dB>>6)<<(0))  ];
 215                     }
 216                 }
 217                 dst++;
 218                 src += srcbpp;
 219             },
 220             width);
 221             src += srcskip;
 222             dst += dstskip;
 223         }
 224 }
 225
 226 #if GCC_ASMBLIT
  227 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
       *
       * Every colour channel of the result is the average of the source and
       * destination channel.  ((s & 0xfefefe) + (d & 0xfefefe)) >> 1 adds the
       * channels with their lowest bit cleared, so the halved sum never spills
       * into the neighbouring channel, and (s & d & 0x010101) adds back the 1
       * that is lost when both low bits were set.  The destination format's
       * Amask is OR-ed into every written pixel.
       *
       * The first DUFFS_LOOP_DOUBLE2 block processes one pixel with plain C,
       * the second processes two pixels at once in MMX registers.
       *
       * info: blit description; pixels are accessed as 32-bit words.
       */
  228 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
  229 {
  230         int width = info->d_width;
  231         int height = info->d_height;
  232         Uint32 *srcp = (Uint32 *)info->s_pixels;
              /* skips are converted to Uint32 units because they are added to
                 Uint32 pointers below */
  233         int srcskip = info->s_skip >> 2;
  234         Uint32 *dstp = (Uint32 *)info->d_pixels;
  235         int dstskip = info->d_skip >> 2;
  236         Uint32 dalpha = info->dst->Amask;
  237         Uint64 load;
  238
              /* constants go through the local 'load' because movq_m2r takes
                 a memory operand */
  239         load = 0x00fefefe00fefefeULL;/* alpha128 mask */
  240         movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
  241         load = 0x0001010100010101ULL;/* !alpha128 mask */
  242         movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
  243         movd_m2r(dalpha, mm7); /* dst alpha mask */
  244         punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
  245         while(height--) {
  246                 DUFFS_LOOP_DOUBLE2(
  247                 {
                              /* single pixel, scalar version of the same formula */
  248                         Uint32 s = *srcp++;
  249                         Uint32 d = *dstp;
  250                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  251                                    + (s & d & 0x00010101)) | dalpha;
  252                 },{
  253                         movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
  254                         movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
  255
  256                         movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
  257                         movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
  258
  259                         pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
  260                         pand_r2r(mm4, mm5); /* src & mask -> mm5 */
  261                         paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
  262                         pand_r2r(mm1, mm2); /* src & dst -> mm2 */
  263                         psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
  264                         pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
  265                         paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
  266
  267                         por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
  268                         movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
  269                         dstp += 2;
  270                         srcp += 2;
  271                 }, width);
  272                 srcp += srcskip;
  273                 dstp += dstskip;
  274         }
              /* leave MMX state once all rows are done */
  275         emms();
  276 }
 277
 278 /* fast RGB888->(A)RGB888 blending with surface alpha */
 279 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 280 {
 281         SDL_PixelFormat* df = info->dst;
 282         unsigned alpha = info->src->alpha;
 283
 284         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
 285                         /* only call a128 version when R,G,B occupy lower bits */
 286                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
 287         } else {
 288                 int width = info->d_width;
 289                 int height = info->d_height;
 290                 Uint32 *srcp = (Uint32 *)info->s_pixels;
 291                 int srcskip = info->s_skip >> 2;
 292                 Uint32 *dstp = (Uint32 *)info->d_pixels;
 293                 int dstskip = info->d_skip >> 2;
 294
 295                 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
 296                 /* form the alpha mult */
 297                 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
 298                 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
 299                 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
 300                 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
 301                 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
 302                 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
 303                 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
 304                         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
 305                 movd_m2r(df->Amask, mm7); /* dst alpha mask */
 306                 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
 307
 308                 while(height--) {
 309                         DUFFS_LOOP_DOUBLE2({
 310                                 /* One Pixel Blend */
 311                                 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
 312                                 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
 313                                 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
 314                                 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
 315
 316                                 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
 317                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
 318                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
 319                                 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
 320
 321                                 packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
 322                                 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
 323                                 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
 324                                 ++srcp;
 325                                 ++dstp;
 326                         },{
 327                                 /* Two Pixels Blend */
 328                                 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
 329                                 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
 330                                 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
 331                                 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
 332
 333                                 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
 334                                 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
 335                                 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
 336                                 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
 337
 338                                 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
 339                                 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
 340                                 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
 341                                 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
 342
 343                                 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
 344                                 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
 345                                 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
 346                                 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
 347
 348                                 packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
 349                                 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
 350
 351                                 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
 352
 353                                 srcp += 2;
 354                                 dstp += 2;
 355                         }, width);
 356                         srcp += srcskip;
 357                         dstp += dstskip;
 358                 }
 359                 emms();
 360         }
 361 }
 362
  363 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
       *
       * Blends 32-bit source pixels over 32-bit destination pixels using the
       * alpha bits of each source pixel (selected with the source format's
       * Amask and Ashift).  Three cases per pixel:
       *   - alpha bits all zero: the destination pixel is left untouched;
       *   - alpha bits all set:  the source colour channels are copied and the
       *     remaining destination bits are kept;
       *   - otherwise: dst = dst + (((src - dst) * alpha) >> 8) per colour
       *     channel, with the multiplier masked so the alpha lane is not
       *     modified by the add.
       *
       * info: blit description; pixels are accessed as 32-bit words.
       */
  364 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
  365 {
  366         int width = info->d_width;
  367         int height = info->d_height;
  368         Uint32 *srcp = (Uint32 *)info->s_pixels;
              /* skips are converted to Uint32 units because they are added to
                 Uint32 pointers below */
  369         int srcskip = info->s_skip >> 2;
  370         Uint32 *dstp = (Uint32 *)info->d_pixels;
  371         int dstskip = info->d_skip >> 2;
  372         SDL_PixelFormat* sf = info->src;
  373         Uint32 amask = sf->Amask;
  374
              /* Loop-invariant registers set up here:
                   mm6 = 0, mm7 = multiplier mask (alpha lane cleared),
                   mm0 = colour-channel mask, mm3 = ~colour-channel mask,
                   mm5 = alpha shift count. */
  375         pxor_r2r(mm6, mm6); /* 0 -> mm6 */
  376         /* form multiplication mask */
  377         movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
  378         punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
  379         pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
  380         movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
  381         pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
  382         /* form channel masks */
  383         movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
  384         packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
  385         packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
  386         pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
  387         /* get alpha channel shift */
  388         __asm__ __volatile__ (
  389                 "movd %0, %%mm5"
  390                 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
  391
  392         while(height--) {
  393             DUFFS_LOOP4({
  394                 Uint32 alpha = *srcp & amask;
  395                 /* FIXME: Here we special-case opaque alpha since the
  396                         compositing used (>>8 instead of /255) doesn't handle
  397                         it correctly. Also special-case alpha=0 for speed?
  398                         Benchmark this! */
  399                 if(alpha == 0) {
  400                         /* do nothing */
  401                 } else if(alpha == amask) {
  402                         /* opaque alpha -- copy RGB, keep dst alpha */
  403                         /* using MMX here to free up regular registers for other things */
  404                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
  405                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
  406                         pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
  407                         pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
  408                         por_r2r(mm1, mm2); /* src | dst -> mm2 */
  409                         movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
  410                 } else {
  411                         movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
  412                         punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
  413
  414                         movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
  415                         punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
  416
                              /* broadcast this pixel's alpha into all four
                                 16-bit lanes, then clear the alpha lane */
  417                         __asm__ __volatile__ (
  418                                 "movd %0, %%mm4"
  419                                 : : "r" (alpha) ); /* 0000A000 -> mm4 */
  420                         psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
  421                         punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
  422                         punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
  423                         pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
  424
  425                         /* blend */
  426                         psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
  427                         pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
  428                         psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
  429                         paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
  430
  431                         packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
  432                         movd_r2m(mm2, *dstp);/* mm2 -> dst */
  433                 }
  434                 ++srcp;
  435                 ++dstp;
  436             }, width);
  437             srcp += srcskip;
  438             dstp += dstskip;
  439         }
              /* leave MMX state once all rows are done */
  440         emms();
  441 }
 442 /* End GCC_ASMBLIT */
 443
 444 #elif MSVC_ASMBLIT
  445 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
       *
       * Intrinsics version.  Every colour channel of the result is the average
       * of the source and destination channel:
       *     ((s & 0xfefefe) + (d & 0xfefefe)) >> 1   halves the sum without
       *                                              spilling into the next channel
       *     + (s & d & 0x010101)                     restores the 1 lost when
       *                                              both low bits were set
       * The destination format's Amask is OR-ed into every written pixel.
       * An odd row width is handled by blending one pixel in plain C first;
       * the rest of the row is processed two pixels at a time.
       *
       * info: blit description; pixels are accessed as 32-bit words.
       */
  446 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
  447 {
  448         int width = info->d_width;
  449         int height = info->d_height;
  450         Uint32 *srcp = (Uint32 *)info->s_pixels;
              /* skips are converted to Uint32 units because they are added to
                 Uint32 pointers below */
  451         int srcskip = info->s_skip >> 2;
  452         Uint32 *dstp = (Uint32 *)info->d_pixels;
  453         int dstskip = info->d_skip >> 2;
  454         Uint32 dalpha = info->dst->Amask;
  455
  456         __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
  457
  458         hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
  459         lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
  460         dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
  461
  462         while (height--) {
  463                 int n = width;
                      /* odd width: peel off one pixel so the rest is a whole
                         number of pairs */
  464                 if ( n & 1 ) {
  465                         Uint32 s = *srcp++;
  466                         Uint32 d = *dstp;
  467                         *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  468                                    + (s & d & 0x00010101)) | dalpha;
  469                         n--;
  470                 }
  471
  472                 for (n >>= 1; n > 0; --n) {
  473                         dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
  474                         dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
  475
  476                         src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
  477                         src2 = src1; /* 2 x src -> src2(ARGBARGB) */
  478
  479                         dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
  480                         src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
  481                         src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
  482                         src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
  483
  484                         dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
  485                         dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
  486                         dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
  487                         dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
  488
  489                         *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
  490                         dstp += 2;
  491                         srcp += 2;
  492                 }
  493
  494                 srcp += srcskip;
  495                 dstp += dstskip;
  496         }
              /* leave MMX state once all rows are done */
  497         _mm_empty();
  498 }
 499
 500 /* fast RGB888->(A)RGB888 blending with surface alpha */
 501 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 502 {
 503         SDL_PixelFormat* df = info->dst;
 504         Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
 505         unsigned alpha = info->src->alpha;
 506
 507         if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
 508                         /* only call a128 version when R,G,B occupy lower bits */
 509                 BlitRGBtoRGBSurfaceAlpha128MMX(info);
 510         } else {
 511                 int width = info->d_width;
 512                 int height = info->d_height;
 513                 Uint32 *srcp = (Uint32 *)info->s_pixels;
 514                 int srcskip = info->s_skip >> 2;
 515                 Uint32 *dstp = (Uint32 *)info->d_pixels;
 516                 int dstskip = info->d_skip >> 2;
 517                 Uint32 dalpha = df->Amask;
 518                 Uint32 amult;
 519
 520                 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
 521
 522                 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
 523                 /* form the alpha mult */
 524                 amult = alpha | (alpha << 8);
 525                 amult = amult | (amult << 16);
 526                 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
 527                 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
 528                 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
 529                         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
 530                 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
 531
 532                 while (height--) {
 533                         int n = width;
 534                         if (n & 1) {
 535                                 /* One Pixel Blend */
 536                                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
 537                                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
 538
 539                                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
 540                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
 541
 542                                 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
 543                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
 544                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
 545                                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
 546
 547                                 dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
 548                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
 549                                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
 550
 551                                 ++srcp;
 552                                 ++dstp;
 553
 554                                 n--;
 555                         }
 556
 557                         for (n >>= 1; n > 0; --n) {
 558                                 /* Two Pixels Blend */
 559                                 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
 560                                 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
 561                                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
 562                                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
 563
 564                                 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
 565                                 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
 566                                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
 567                                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
 568
 569                                 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
 570                                 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
 571                                 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
 572                                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
 573
 574                                 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
 575                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
 576                                 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
 577                                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
 578
 579                                 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
 580                                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
 581
 582                                 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
 583
 584                                 srcp += 2;
 585                                 dstp += 2;
 586                         }
 587                         srcp += srcskip;
 588                         dstp += dstskip;
 589                 }
 590                 _mm_empty();
 591         }
 592 }
 593
  594 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
       *
       * Intrinsics version.  Blends 32-bit source pixels over 32-bit
       * destination pixels using the alpha bits of each source pixel (selected
       * with the source format's Amask and Ashift).  Three cases per pixel:
       *   - alpha bits all zero: the destination pixel is left untouched;
       *   - alpha bits all set:  the source colour channels are copied and the
       *     remaining destination bits are kept;
       *   - otherwise: dst = dst + (((src - dst) * alpha) >> 8) per colour
       *     channel, with the multiplier masked so the alpha lane is not
       *     modified by the add.
       *
       * info: blit description; pixels are accessed as 32-bit words.
       */
  595 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
  596 {
  597         int width = info->d_width;
  598         int height = info->d_height;
  599         Uint32 *srcp = (Uint32 *)info->s_pixels;
              /* skips are converted to Uint32 units because they are added to
                 Uint32 pointers below */
  600         int srcskip = info->s_skip >> 2;
  601         Uint32 *dstp = (Uint32 *)info->d_pixels;
  602         int dstskip = info->d_skip >> 2;
  603         SDL_PixelFormat* sf = info->src;
  604         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  605         Uint32 amask = sf->Amask;
  606         Uint32 ashift = sf->Ashift;
  607         Uint64 multmask;
  608
  609         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  610
  611         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
              /* Once a pixel's bytes are widened to 16-bit lanes, the byte that
                 started at bit 'ashift' sits at bit 'ashift * 2'; clearing that
                 lane of the multiplier keeps the blend away from the alpha
                 channel.  (0xFFFFi64 is the MSVC spelling of a 64-bit literal.) */
  612         multmask = ~(0xFFFFi64 << (ashift * 2));
  613         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
  614
  615         while(height--) {
  616                 DUFFS_LOOP4({
  617                 Uint32 alpha = *srcp & amask;
  618                 if (alpha == 0) {
  619                         /* do nothing */
  620                 } else if (alpha == amask) {
  621                         /* opaque alpha -- copy RGB, keep dst alpha */
  622                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  623                 } else {
  624                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  625                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  626
  627                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  628                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  629
                              /* broadcast this pixel's alpha into all four
                                 16-bit lanes, then clear the alpha lane */
  630                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  631                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  632                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  633                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  634                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  635
  636                         /* blend */
  637                         src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
  638                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
  639                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  640                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
  641                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  642
  643                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  644                 }
  645                 ++srcp;
  646                 ++dstp;
  647             }, width);
  648             srcp += srcskip;
  649             dstp += dstskip;
  650         }
              /* leave MMX state once all rows are done */
  651         _mm_empty();
  652 }
 653 /* End MSVC_ASMBLIT */
 654
 655 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
 656
 657 #if SDL_ALTIVEC_BLITTERS
 658 #if __MWERKS__
 659 #pragma altivec_model on
 660 #endif
 661 #if HAVE_ALTIVEC_H
 662 #include <altivec.h>
 663 #endif
 664 #include <assert.h>
 665
  666 #if (defined(__MACOSX__) && (__GNUC__ < 4))
      /* Vector literal helpers.  Mac OS X builds with GCC older than 4 get the
         parenthesized initializer form; every other compiler gets the
         brace-enclosed form.  VECUINT8_LITERAL takes 16 byte values,
         VECUINT16_LITERAL takes 8 short values. */
  667     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
  668         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
  669     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
  670         (vector unsigned short) ( a,b,c,d,e,f,g,h )
  671 #else
  672     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
  673         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
  674     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
  675         (vector unsigned short) { a,b,c,d,e,f,g,h }
  676 #endif
 677
 678 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
 679 #define VECPRINT(msg, v) do { \
 680     vector unsigned int tmpvec = (vector unsigned int)(v); \
 681     unsigned int *vp = (unsigned int *)&tmpvec; \
 682     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
 683 } while (0)
 684
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built at run time: vec_lvsl(0, NULL) supplies 0x00..0x0F and adding
   the halfword splat 0x000F bumps every odd byte by 0x0F.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* Each 32-bit element holds 24, formed as 12 + 12 from two splats. */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones words shifted left by 24 bits: 0xFF000000 in every 32-bit element. */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/*
 * Permute vector used to realign source data with vec_perm(prev, next, ...).
 * For a pointer that is not 16-byte aligned this is vec_lvsl(0, src); for an
 * aligned pointer it is vec_lvsl(8, src) + 8, i.e. indices 0x10..0x1F, which
 * select the second vec_perm operand in full.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
 698
 699
/*
 * Blend the byte lanes of vs over vd with the per-byte weights in valpha and
 * store the result back into vd.
 *
 * For every byte the macro forms t = vs*valpha + vd*(255 - valpha) as a
 * 16-bit value, replaces it by (t + 1) + ((t + 1) >> 8) and keeps the high
 * byte, which is a shift-based stand-in for dividing by 255.  Even bytes are
 * handled through vec_mule and odd bytes through vec_mulo; mergePermute
 * interleaves the high bytes of the two sets of products back into byte
 * order.
 *
 *   vs, vd        source and destination pixels (vector unsigned char);
 *                 vd is overwritten
 *   valpha        weight for each byte lane
 *   mergePermute  permute vector as produced by VEC_MERGE_PERMUTE()
 *   v1_16, v8_16  halfword splats; the inline comments treat them as the
 *                 constants 1 and 8
 *
 * Every lane, including the first byte of each pixel, goes through the same
 * arithmetic.  The macro declares locals named vtemp1..vtemp4 and valpha2,
 * so arguments must not use those names.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
 725
 726 /* Calculate the permute vector used for 32->32 swizzling */
 727 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
 728                                   const SDL_PixelFormat *dstfmt)
 729 {
 730     /*
 731      * We have to assume that the bits that aren't used by other
 732      *  colors is alpha, and it's one complete byte, since some formats
 733      *  leave alpha with a zero mask, but we should still swizzle the bits.
 734      */
 735     /* ARGB */
 736     const static struct SDL_PixelFormat default_pixel_format = {
 737         NULL, 0, 0,
 738         0, 0, 0, 0,
 739         16, 8, 0, 24,
 740         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
 741         0, 0};
 742     if (!srcfmt) {
 743         srcfmt = &default_pixel_format;
 744     }
 745     if (!dstfmt) {
 746         dstfmt = &default_pixel_format;
 747     }
 748     const vector unsigned char plus = VECUINT8_LITERAL
 749                                             ( 0x00, 0x00, 0x00, 0x00,
 750                                               0x04, 0x04, 0x04, 0x04,
 751                                               0x08, 0x08, 0x08, 0x08,
 752                                               0x0C, 0x0C, 0x0C, 0x0C );
 753     vector unsigned char vswiz;
 754     vector unsigned int srcvec;
 755 #define RESHIFT(X) (3 - ((X) >> 3))
 756     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
 757     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
 758     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
 759     Uint32 amask;
 760     /* Use zero for alpha if either surface doesn't have alpha */
 761     if (dstfmt->Amask) {
 762         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
 763     } else {
 764         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
 765     }
 766 #undef RESHIFT
 767     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
 768     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
 769     return(vswiz);
 770 }
 771
 772 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
 773 {
 774     int height = info->d_height;
 775     Uint8 *src = (Uint8 *)info->s_pixels;
 776     int srcskip = info->s_skip;
 777     Uint8 *dst = (Uint8 *)info->d_pixels;
 778     int dstskip = info->d_skip;
 779     SDL_PixelFormat *srcfmt = info->src;
 780
 781     vector unsigned char v0 = vec_splat_u8(0);
 782     vector unsigned short v8_16 = vec_splat_u16(8);
 783     vector unsigned short v1_16 = vec_splat_u16(1);
 784     vector unsigned short v2_16 = vec_splat_u16(2);
 785     vector unsigned short v3_16 = vec_splat_u16(3);
 786     vector unsigned int v8_32 = vec_splat_u32(8);
 787     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
 788     vector unsigned short v3f = VECUINT16_LITERAL(
 789         0x003f, 0x003f, 0x003f, 0x003f,
 790         0x003f, 0x003f, 0x003f, 0x003f);
 791     vector unsigned short vfc = VECUINT16_LITERAL(
 792         0x00fc, 0x00fc, 0x00fc, 0x00fc,
 793         0x00fc, 0x00fc, 0x00fc, 0x00fc);
 794
 795     /*
 796         0x10 - 0x1f is the alpha
 797         0x00 - 0x0e evens are the red
 798         0x01 - 0x0f odds are zero
 799     */
 800     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
 801         0x10, 0x00, 0x01, 0x01,
 802         0x10, 0x02, 0x01, 0x01,
 803         0x10, 0x04, 0x01, 0x01,
 804         0x10, 0x06, 0x01, 0x01
 805     );
 806     vector unsigned char vredalpha2 = (vector unsigned char)(
 807         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
 808     );
 809     /*
 810         0x00 - 0x0f is ARxx ARxx ARxx ARxx
 811         0x11 - 0x0f odds are blue
 812     */
 813     vector unsigned char vblue1 = VECUINT8_LITERAL(
 814         0x00, 0x01, 0x02, 0x11,
 815         0x04, 0x05, 0x06, 0x13,
 816         0x08, 0x09, 0x0a, 0x15,
 817         0x0c, 0x0d, 0x0e, 0x17
 818     );
 819     vector unsigned char vblue2 = (vector unsigned char)(
 820         vec_add((vector unsigned int)vblue1, v8_32)
 821     );
 822     /*
 823         0x00 - 0x0f is ARxB ARxB ARxB ARxB
 824         0x10 - 0x0e evens are green
 825     */
 826     vector unsigned char vgreen1 = VECUINT8_LITERAL(
 827         0x00, 0x01, 0x10, 0x03,
 828         0x04, 0x05, 0x12, 0x07,
 829         0x08, 0x09, 0x14, 0x0b,
 830         0x0c, 0x0d, 0x16, 0x0f
 831     );
 832     vector unsigned char vgreen2 = (vector unsigned char)(
 833         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
 834     );
 835     vector unsigned char vgmerge = VECUINT8_LITERAL(
 836         0x00, 0x02, 0x00, 0x06,
 837         0x00, 0x0a, 0x00, 0x0e,
 838         0x00, 0x12, 0x00, 0x16,
 839         0x00, 0x1a, 0x00, 0x1e);
 840     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
 841     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
 842     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
 843
 844     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
 845     vf800 = vec_sl(vf800, vec_splat_u16(8));
 846
 847     while(height--) {
 848         int extrawidth;
 849         vector unsigned char valigner;
 850         vector unsigned char vsrc;
 851         vector unsigned char voverflow;
 852         int width = info->d_width;
 853
 854 #define ONE_PIXEL_BLEND(condition, widthvar) \
 855         while (condition) { \
 856             Uint32 Pixel; \
 857             unsigned sR, sG, sB, dR, dG, dB, sA; \
 858             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
 859             if(sA) { \
 860                 unsigned short dstpixel = *((unsigned short *)dst); \
 861                 dR = (dstpixel >> 8) & 0xf8; \
 862                 dG = (dstpixel >> 3) & 0xfc; \
 863                 dB = (dstpixel << 3) & 0xf8; \
 864                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
 865                 *((unsigned short *)dst) = ( \
 866                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
 867                 ); \
 868             } \
 869             src += 4; \
 870             dst += 2; \
 871             widthvar--; \
 872         }
 873         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
 874         extrawidth = (width % 8);
 875         valigner = VEC_ALIGNER(src);
 876         vsrc = (vector unsigned char)vec_ld(0, src);
 877         width -= extrawidth;
 878         while (width) {
 879             vector unsigned char valpha;
 880             vector unsigned char vsrc1, vsrc2;
 881             vector unsigned char vdst1, vdst2;
 882             vector unsigned short vR, vG, vB;
 883             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
 884
 885             /* Load 8 pixels from src as ARGB */
 886             voverflow = (vector unsigned char)vec_ld(15, src);
 887             vsrc = vec_perm(vsrc, voverflow, valigner);
 888             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
 889             src += 16;
 890             vsrc = (vector unsigned char)vec_ld(15, src);
 891             voverflow = vec_perm(voverflow, vsrc, valigner);
 892             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
 893             src += 16;
 894
 895             /* Load 8 pixels from dst as XRGB */
 896             voverflow = vec_ld(0, dst);
 897             vR = vec_and((vector unsigned short)voverflow, vf800);
 898             vB = vec_sl((vector unsigned short)voverflow, v3_16);
 899             vG = vec_sl(vB, v2_16);
 900             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
 901             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
 902             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
 903             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
 904             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
 905             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
 906
 907             /* Alpha blend 8 pixels as ARGB */
 908             valpha = vec_perm(vsrc1, v0, valphaPermute);
 909             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
 910             valpha = vec_perm(vsrc2, v0, valphaPermute);
 911             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
 912
 913             /* Convert 8 pixels to 565 */
 914             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
 915             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
 916             vgpixel = vec_and(vgpixel, vfc);
 917             vgpixel = vec_sl(vgpixel, v3_16);
 918             vrpixel = vec_sl(vpixel, v1_16);
 919             vrpixel = vec_and(vrpixel, vf800);
 920             vbpixel = vec_and(vpixel, v3f);
 921             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
 922             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
 923
 924             /* Store 8 pixels */
 925             vec_st(vdst1, 0, dst);
 926
 927             width -= 8;
 928             dst += 16;
 929         }
 930         ONE_PIXEL_BLEND((extrawidth), extrawidth);
 931 #undef ONE_PIXEL_BLEND
 932         src += srcskip;
 933         dst += dstskip;
 934     }
 935 }
 936
 937 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
 938 {
 939     unsigned alpha = info->src->alpha;
 940     int height = info->d_height;
 941     Uint32 *srcp = (Uint32 *)info->s_pixels;
 942     int srcskip = info->s_skip >> 2;
 943     Uint32 *dstp = (Uint32 *)info->d_pixels;
 944     int dstskip = info->d_skip >> 2;
 945     SDL_PixelFormat *srcfmt = info->src;
 946     SDL_PixelFormat *dstfmt = info->dst;
 947     unsigned sA = srcfmt->alpha;
 948     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
 949     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
 950     Uint32 ckey = info->src->colorkey;
 951     vector unsigned char mergePermute;
 952     vector unsigned char vsrcPermute;
 953     vector unsigned char vdstPermute;
 954     vector unsigned char vsdstPermute;
 955     vector unsigned char valpha;
 956     vector unsigned char valphamask;
 957     vector unsigned char vbits;
 958     vector unsigned char v0;
 959     vector unsigned short v1;
 960     vector unsigned short v8;
 961     vector unsigned int vckey;
 962     vector unsigned int vrgbmask;
 963
 964     mergePermute = VEC_MERGE_PERMUTE();
 965     v0 = vec_splat_u8(0);
 966     v1 = vec_splat_u16(1);
 967     v8 = vec_splat_u16(8);
 968
 969     /* set the alpha to 255 on the destination surf */
 970     valphamask = VEC_ALPHA_MASK();
 971
 972     vsrcPermute = calc_swizzle32(srcfmt, NULL);
 973     vdstPermute = calc_swizzle32(NULL, dstfmt);
 974     vsdstPermute = calc_swizzle32(dstfmt, NULL);
 975
 976     /* set a vector full of alpha and 255-alpha */
 977     ((unsigned char *)&valpha)[0] = alpha;
 978     valpha = vec_splat(valpha, 0);
 979     vbits = (vector unsigned char)vec_splat_s8(-1);
 980
 981     ckey &= rgbmask;
 982     ((unsigned int *)(char*)&vckey)[0] = ckey;
 983     vckey = vec_splat(vckey, 0);
 984     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
 985     vrgbmask = vec_splat(vrgbmask, 0);
 986
 987     while(height--) {
 988         int width = info->d_width;
 989 #define ONE_PIXEL_BLEND(condition, widthvar) \
 990         while (condition) { \
 991             Uint32 Pixel; \
 992             unsigned sR, sG, sB, dR, dG, dB; \
 993             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
 994             if(sA && Pixel != ckey) { \
 995                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
 996                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
 997                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
 998                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
 999             } \
1000             dstp++; \
1001             srcp++; \
1002             widthvar--; \
1003         }
1004         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1005         if (width > 0) {
1006             int extrawidth = (width % 4);
1007             vector unsigned char valigner = VEC_ALIGNER(srcp);
1008             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1009             width -= extrawidth;
1010             while (width) {
1011                 vector unsigned char vsel;
1012                 vector unsigned char voverflow;
1013                 vector unsigned char vd;
1014                 vector unsigned char vd_orig;
1015
1016                 /* s = *srcp */
1017                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1018                 vs = vec_perm(vs, voverflow, valigner);
1019
1020                 /* vsel is set for items that match the key */
1021                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1022                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1023
1024                 /* permute to source format */
1025                 vs = vec_perm(vs, valpha, vsrcPermute);
1026
1027                 /* d = *dstp */
1028                 vd = (vector unsigned char)vec_ld(0, dstp);
1029                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1030
1031                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1032
1033                 /* set the alpha channel to full on */
1034                 vd = vec_or(vd, valphamask);
1035
1036                 /* mask out color key */
1037                 vd = vec_sel(vd, vd_orig, vsel);
1038
1039                 /* permute to dest format */
1040                 vd = vec_perm(vd, vbits, vdstPermute);
1041
1042                 /* *dstp = res */
1043                 vec_st((vector unsigned int)vd, 0, dstp);
1044
1045                 srcp += 4;
1046                 dstp += 4;
1047                 width -= 4;
1048                 vs = voverflow;
1049             }
1050             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1051         }
1052 #undef ONE_PIXEL_BLEND
1053
1054         srcp += srcskip;
1055         dstp += dstskip;
1056     }
1057 }
1058
1059
/*
 * Blend a 32-bit source that carries per-pixel alpha onto a 32-bit
 * destination of arbitrary channel order, keeping the destination's own
 * alpha value.
 *
 * info  blit description; the fields read are s_pixels/s_skip,
 *       d_pixels/d_skip, d_width, d_height, src and dst.  The skips are
 *       shifted right by two because the pixel pointers are Uint32 pointers.
 *
 * Each row is processed as: single pixels until dstp is 16-byte aligned,
 * groups of four pixels with AltiVec, then the (width % 4) leftovers.  In the
 * single-pixel stages a source pixel whose alpha is zero leaves the
 * destination untouched.
 */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;

    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* indices 0,0,0,0,4,4,4,4,...: copies the first byte of each pixel into
       all four of that pixel's bytes */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    /* complement of the alpha mask: selects the three colour bytes */
    vpixelmask = vec_nor(valphamask, v0);
    /* source -> ARGB, ARGB -> destination, destination -> ARGB */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

        while ( height-- ) {
        width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
              ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* single pixels until dstp is 16-byte aligned (or the row is done) */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            /* vsrcPermute */
            /* vdstPermute */
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, v0, vsrcPermute);

                /* spread each pixel's alpha byte over its four lanes */
                valpha = vec_perm(vs, v0, valphaPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, v0, vsdstPermute);
                /* remember the destination alpha; the blend overwrites that lane */
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);
                vd = vec_perm(vd, v0, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* the block just loaded becomes the low half of the next realignment */
                vs = voverflow;

            }
            /* leftover pixels that did not fill a group of four */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
            srcp += srcskip;
            dstp += dstskip;
#undef ONE_PIXEL_BLEND
        }
}
1155
1156 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1157 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1158 {
1159         int width = info->d_width;
1160         int height = info->d_height;
1161         Uint32 *srcp = (Uint32 *)info->s_pixels;
1162         int srcskip = info->s_skip >> 2;
1163         Uint32 *dstp = (Uint32 *)info->d_pixels;
1164         int dstskip = info->d_skip >> 2;
1165     vector unsigned char mergePermute;
1166     vector unsigned char valphaPermute;
1167     vector unsigned char valphamask;
1168     vector unsigned char vpixelmask;
1169     vector unsigned char v0;
1170     vector unsigned short v1;
1171     vector unsigned short v8;
1172     v0 = vec_splat_u8(0);
1173     v1 = vec_splat_u16(1);
1174     v8 = vec_splat_u16(8);
1175     mergePermute = VEC_MERGE_PERMUTE();
1176     valphamask = VEC_ALPHA_MASK();
1177     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1178
1179
1180     vpixelmask = vec_nor(valphamask, v0);
1181         while(height--) {
1182         width = info->d_width;
1183 #define ONE_PIXEL_BLEND(condition, widthvar) \
1184         while ((condition)) { \
1185             Uint32 dalpha; \
1186             Uint32 d; \
1187             Uint32 s1; \
1188             Uint32 d1; \
1189             Uint32 s = *srcp; \
1190             Uint32 alpha = s >> 24; \
1191             if(alpha) { \
1192               if(alpha == SDL_ALPHA_OPAQUE) { \
1193                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1194               } else { \
1195                 d = *dstp; \
1196                 dalpha = d & 0xff000000; \
1197                 s1 = s & 0xff00ff; \
1198                 d1 = d & 0xff00ff; \
1199                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1200                 s &= 0xff00; \
1201                 d &= 0xff00; \
1202                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1203                 *dstp = d1 | d | dalpha; \
1204               } \
1205             } \
1206             ++srcp; \
1207             ++dstp; \
1208             widthvar--; \
1209             }
1210         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1211         if (width > 0) {
1212             int extrawidth = (width % 4);
1213             vector unsigned char valigner = VEC_ALIGNER(srcp);
1214             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1215             width -= extrawidth;
1216             while (width) {
1217                 vector unsigned char voverflow;
1218                 vector unsigned char vd;
1219                 vector unsigned char valpha;
1220                 vector unsigned char vdstalpha;
1221                 /* s = *srcp */
1222                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1223                 vs = vec_perm(vs, voverflow, valigner);
1224
1225                 valpha = vec_perm(vs, v0, valphaPermute);
1226
1227                 /* d = *dstp */
1228                 vd = (vector unsigned char)vec_ld(0, dstp);
1229                 vdstalpha = vec_and(vd, valphamask);
1230
1231                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1232
1233                 /* set the alpha to the dest alpha */
1234                 vd = vec_and(vd, vpixelmask);
1235                 vd = vec_or(vd, vdstalpha);
1236
1237                 /* *dstp = res */
1238                 vec_st((vector unsigned int)vd, 0, dstp);
1239
1240                 srcp += 4;
1241                 dstp += 4;
1242                 width -= 4;
1243                 vs = voverflow;
1244             }
1245             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1246         }
1247             srcp += srcskip;
1248             dstp += dstskip;
1249         }
1250 #undef ONE_PIXEL_BLEND
1251 }
1252
/*
 * Blend a 32-bit source onto a 32-bit destination of arbitrary channel order
 * using the per-surface alpha of the source.
 *
 * info  blit description; the fields read are s_pixels/s_skip,
 *       d_pixels/d_skip, d_width, d_height, src and dst.  The skips are
 *       shifted right by two because the pixel pointers are Uint32 pointers.
 *
 * Each row is processed as: single pixels until dstp is 16-byte aligned,
 * groups of four pixels with AltiVec, then the (width % 4) leftovers.  The
 * single-pixel stages write SDL_ALPHA_OPAQUE as destination alpha when the
 * destination format has an alpha mask and 0 otherwise.
 */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
        unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
        /* same field as alpha above; this copy feeds the single-pixel macro */
        unsigned sA = srcfmt->alpha;
        unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* source -> ARGB, ARGB -> destination, destination -> ARGB */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    /* all-ones bytes, used as the second operand of the final permute */
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* single pixels until dstp is 16-byte aligned (or the row is done) */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* the block just loaded becomes the low half of the next realignment */
                vs = voverflow;
            }
            /* leftover pixels that did not fill a group of four */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }

}
1347
1348
1349 /* fast RGB888->(A)RGB888 blending */
1350 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1351 {
1352         unsigned alpha = info->src->alpha;
1353     int height = info->d_height;
1354     Uint32 *srcp = (Uint32 *)info->s_pixels;
1355     int srcskip = info->s_skip >> 2;
1356     Uint32 *dstp = (Uint32 *)info->d_pixels;
1357     int dstskip = info->d_skip >> 2;
1358     vector unsigned char mergePermute;
1359     vector unsigned char valpha;
1360     vector unsigned char valphamask;
1361     vector unsigned short v1;
1362     vector unsigned short v8;
1363
1364     mergePermute = VEC_MERGE_PERMUTE();
1365     v1 = vec_splat_u16(1);
1366     v8 = vec_splat_u16(8);
1367
1368     /* set the alpha to 255 on the destination surf */
1369     valphamask = VEC_ALPHA_MASK();
1370
1371     /* set a vector full of alpha and 255-alpha */
1372     ((unsigned char *)&valpha)[0] = alpha;
1373     valpha = vec_splat(valpha, 0);
1374
1375     while(height--) {
1376         int width = info->d_width;
1377 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1378             Uint32 s = *srcp; \
1379             Uint32 d = *dstp; \
1380             Uint32 s1 = s & 0xff00ff; \
1381             Uint32 d1 = d & 0xff00ff; \
1382             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1383                  & 0xff00ff; \
1384             s &= 0xff00; \
1385             d &= 0xff00; \
1386             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1387             *dstp = d1 | d | 0xff000000; \
1388             ++srcp; \
1389             ++dstp; \
1390             widthvar--; \
1391         }
1392         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1393         if (width > 0) {
1394             int extrawidth = (width % 4);
1395             vector unsigned char valigner = VEC_ALIGNER(srcp);
1396             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1397             width -= extrawidth;
1398             while (width) {
1399                 vector unsigned char voverflow;
1400                 vector unsigned char vd;
1401
1402                 /* s = *srcp */
1403                 voverflow = (vector unsigned char)vec_ld(15, srcp);
1404                 vs = vec_perm(vs, voverflow, valigner);
1405
1406                 /* d = *dstp */
1407                 vd = (vector unsigned char)vec_ld(0, dstp);
1408
1409                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1410
1411                 /* set the alpha channel to full on */
1412                 vd = vec_or(vd, valphamask);
1413
1414                 /* *dstp = res */
1415                 vec_st((vector unsigned int)vd, 0, dstp);
1416
1417                 srcp += 4;
1418                 dstp += 4;
1419                 width -= 4;
1420                 vs = voverflow;
1421             }
1422             ONE_PIXEL_BLEND((extrawidth), extrawidth);
1423         }
1424 #undef ONE_PIXEL_BLEND
1425
1426         srcp += srcskip;
1427         dstp += dstskip;
1428     }
1429 }
1430 #if __MWERKS__
1431 #pragma altivec_model off
1432 #endif
1433 #endif /* SDL_ALTIVEC_BLITTERS */
1434
1435 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1436 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1437 {
1438         int width = info->d_width;
1439         int height = info->d_height;
1440         Uint32 *srcp = (Uint32 *)info->s_pixels;
1441         int srcskip = info->s_skip >> 2;
1442         Uint32 *dstp = (Uint32 *)info->d_pixels;
1443         int dstskip = info->d_skip >> 2;
1444
1445         while(height--) {
1446             DUFFS_LOOP4({
1447                     Uint32 s = *srcp++;
1448                     Uint32 d = *dstp;
1449                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1450                                + (s & d & 0x00010101)) | 0xff000000;
1451             }, width);
1452             srcp += srcskip;
1453             dstp += dstskip;
1454         }
1455 }
1456
1457 /* fast RGB888->(A)RGB888 blending with surface alpha */
1458 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1459 {
1460         unsigned alpha = info->src->alpha;
1461         if(alpha == 128) {
1462                 BlitRGBtoRGBSurfaceAlpha128(info);
1463         } else {
1464                 int width = info->d_width;
1465                 int height = info->d_height;
1466                 Uint32 *srcp = (Uint32 *)info->s_pixels;
1467                 int srcskip = info->s_skip >> 2;
1468                 Uint32 *dstp = (Uint32 *)info->d_pixels;
1469                 int dstskip = info->d_skip >> 2;
1470                 Uint32 s;
1471                 Uint32 d;
1472                 Uint32 s1;
1473                 Uint32 d1;
1474
1475                 while(height--) {
1476                         DUFFS_LOOP_DOUBLE2({
1477                                 /* One Pixel Blend */
1478                                 s = *srcp;
1479                                 d = *dstp;
1480                                 s1 = s & 0xff00ff;
1481                                 d1 = d & 0xff00ff;
1482                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
1483                                      & 0xff00ff;
1484                                 s &= 0xff00;
1485                                 d &= 0xff00;
1486                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1487                                 *dstp = d1 | d | 0xff000000;
1488                                 ++srcp;
1489                                 ++dstp;
1490                         },{
1491                                 /* Two Pixels Blend */
1492                                 s = *srcp;
1493                                 d = *dstp;
1494                                 s1 = s & 0xff00ff;
1495                                 d1 = d & 0xff00ff;
1496                                 d1 += (s1 - d1) * alpha >> 8;
1497                                 d1 &= 0xff00ff;
1498
1499                                 s = ((s & 0xff00) >> 8) |
1500                                         ((srcp[1] & 0xff00) << 8);
1501                                 d = ((d & 0xff00) >> 8) |
1502                                         ((dstp[1] & 0xff00) << 8);
1503                                 d += (s - d) * alpha >> 8;
1504                                 d &= 0x00ff00ff;
1505
1506                                 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1507                                 ++srcp;
1508
1509                                 s1 = *srcp;
1510                                 d1 = *dstp;
1511                                 s1 &= 0xff00ff;
1512                                 d1 &= 0xff00ff;
1513                                 d1 += (s1 - d1) * alpha >> 8;
1514                                 d1 &= 0xff00ff;
1515
1516                                 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1517                                 ++srcp;
1518                                 ++dstp;
1519                         }, width);
1520                         srcp += srcskip;
1521                         dstp += dstskip;
1522                 }
1523         }
1524 }
1525
1526 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
1527 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1528 {
1529         int width = info->d_width;
1530         int height = info->d_height;
1531         Uint32 *srcp = (Uint32 *)info->s_pixels;
1532         int srcskip = info->s_skip >> 2;
1533         Uint32 *dstp = (Uint32 *)info->d_pixels;
1534         int dstskip = info->d_skip >> 2;
1535
1536         while(height--) {
1537             DUFFS_LOOP4({
1538                 Uint32 dalpha;
1539                 Uint32 d;
1540                 Uint32 s1;
1541                 Uint32 d1;
1542                 Uint32 s = *srcp;
1543                 Uint32 alpha = s >> 24;
1544                 /* FIXME: Here we special-case opaque alpha since the
1545                    compositioning used (>>8 instead of /255) doesn't handle
1546                    it correctly. Also special-case alpha=0 for speed?
1547                    Benchmark this! */
1548                 if(alpha) {
1549                   if(alpha == SDL_ALPHA_OPAQUE) {
1550                     *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1551                   } else {
1552                     /*
1553                      * take out the middle component (green), and process
1554                      * the other two in parallel. One multiply less.
1555                      */
1556                     d = *dstp;
1557                     dalpha = d & 0xff000000;
1558                     s1 = s & 0xff00ff;
1559                     d1 = d & 0xff00ff;
1560                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1561                     s &= 0xff00;
1562                     d &= 0xff00;
1563                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1564                     *dstp = d1 | d | dalpha;
1565                   }
1566                 }
1567                 ++srcp;
1568                 ++dstp;
1569             }, width);
1570             srcp += srcskip;
1571             dstp += dstskip;
1572         }
1573 }
1574
1575 #if GCC_ASMBLIT
1576 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1577 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1578 {
1579         int width = info->d_width;
1580         int height = info->d_height;
1581         Uint32 *srcp = (Uint32 *)info->s_pixels;
1582         int srcskip = info->s_skip >> 2;
1583         Uint32 *dstp = (Uint32 *)info->d_pixels;
1584         int dstskip = info->d_skip >> 2;
1585         SDL_PixelFormat* sf = info->src;
1586         Uint32 amask = sf->Amask;
1587
1588         __asm__ (
1589         /* make mm6 all zeros. */
1590         "pxor       %%mm6, %%mm6\n"
1591
1592         /* Make a mask to preserve the alpha. */
1593         "movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1594         "punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1595         "pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1596         "movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1597         "pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1598
1599         /* form channel masks */
1600         "movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1601         "packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1602         "packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1603         "pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1604
1605         /* get alpha channel shift */
1606         "movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1607
1608           : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1609
1610         while(height--) {
1611
1612             DUFFS_LOOP4({
1613                 Uint32 alpha;
1614
1615                 __asm__ (
1616                 "prefetch 64(%0)\n"
1617                 "prefetch 64(%1)\n"
1618                         : : "r" (srcp), "r" (dstp) );
1619
1620                 alpha = *srcp & amask;
1621                 /* FIXME: Here we special-case opaque alpha since the
1622                    compositioning used (>>8 instead of /255) doesn't handle
1623                    it correctly. Also special-case alpha=0 for speed?
1624                    Benchmark this! */
1625                 if(alpha == 0) {
1626                     /* do nothing */
1627                 }
1628                 else if(alpha == amask) {
1629                         /* opaque alpha -- copy RGB, keep dst alpha */
1630                     /* using MMX here to free up regular registers for other things */
1631                             __asm__ (
1632                     "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1633                     "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1634                     "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1635                     "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1636                     "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1637                     "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1638
1639                      : : "r" (srcp), "r" (dstp) );
1640                 }
1641
1642                 else {
1643                             __asm__ (
1644                     /* load in the source, and dst. */
1645                     "movd      (%0), %%mm0\n"               /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1646                     "movd      (%1), %%mm1\n"               /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1647
1648                     /* Move the src alpha into mm2 */
1649
1650                     /* if supporting pshufw */
1651                     /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1652                     /*"psrlw     $8, %%mm2\n" */
1653
1654                     /* else: */
1655                     "movd       %2,    %%mm2\n"
1656                     "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1657                     "punpcklwd  %%mm2, %%mm2\n"             /* mm2 = 0 0 0 0 |  0 As  0  As */
1658                     "punpckldq  %%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1659                     "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1660
1661                     /* move the colors into words. */
1662                     "punpcklbw %%mm6, %%mm0\n"              /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1663                     "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1664
1665                     /* src - dst */
1666                     "psubw    %%mm1, %%mm0\n"               /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1667
1668                     /* A * (src-dst) */
1669                     "pmullw    %%mm2, %%mm0\n"              /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1670                     "psrlw     $8,    %%mm0\n"              /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1671                     "paddb     %%mm1, %%mm0\n"              /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1672
1673                     "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1674
1675                     "movd      %%mm0, (%1)\n"               /* result in mm0 */
1676
1677                      : : "r" (srcp), "r" (dstp), "r" (alpha) );
1678
1679                 }
1680                 ++srcp;
1681                 ++dstp;
1682             }, width);
1683             srcp += srcskip;
1684             dstp += dstskip;
1685         }
1686
1687         __asm__ (
1688         "emms\n"
1689                 :   );
1690 }
1691 /* End GCC_ASMBLIT*/
1692
1693 #elif MSVC_ASMBLIT
1694 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1695 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1696 {
1697         int width = info->d_width;
1698         int height = info->d_height;
1699         Uint32 *srcp = (Uint32 *)info->s_pixels;
1700         int srcskip = info->s_skip >> 2;
1701         Uint32 *dstp = (Uint32 *)info->d_pixels;
1702         int dstskip = info->d_skip >> 2;
1703         SDL_PixelFormat* sf = info->src;
1704         Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1705         Uint32 amask = sf->Amask;
1706         Uint32 ashift = sf->Ashift;
1707         Uint64 multmask;
1708
1709         __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1710
1711         mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1712         multmask = ~(0xFFFFi64 << (ashift * 2));
1713         dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1714
1715         while(height--) {
1716             DUFFS_LOOP4({
1717                 Uint32 alpha;
1718
1719                 _m_prefetch(srcp + 16);
1720                 _m_prefetch(dstp + 16);
1721
1722                 alpha = *srcp & amask;
1723                 if (alpha == 0) {
1724                         /* do nothing */
1725                 } else if (alpha == amask) {
1726                         /* copy RGB, keep dst alpha */
1727                         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1728                 } else {
1729                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1730                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1731
1732                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1733                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1734
1735                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1736                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1737                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1738                         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1739                         mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1740
1741                         /* blend */
1742                         src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1743                         src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1744                         src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1745                         dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1746                         dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1747
1748                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1749                 }
1750                 ++srcp;
1751                 ++dstp;
1752             }, width);
1753             srcp += srcskip;
1754             dstp += dstskip;
1755         }
1756         _mm_empty();
1757 }
1758 /* End MSVC_ASMBLIT */
1759
1760 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1761
1762 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1763
/*
 * blend a single 16 bit pixel at 50%
 *
 * mask selects every channel bit except the lowest bit of each channel, so
 * the masked sum can be halved with one shift; the dropped low bit is added
 * back when it is set in both s and d.
 *
 * Every argument is evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define BLEND16_50(d, s, mask)                                          \
        (((((s) & (mask)) + ((d) & (mask))) >> 1)                       \
         + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * d and s each hold two packed 16 bit pixels; mask is the same 16 bit mask
 * that BLEND16_50 takes and is replicated into both halves here.  The
 * replication is done in Uint32: shifting a promoted (signed int) 16 bit
 * mask such as 0xf7de left by 16 is not representable in int and is
 * therefore undefined behaviour.
 *
 * Every argument is evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define BLEND2x16_50(d, s, mask)                                             \
        ((((s) & ((mask) | ((Uint32)(mask) << 16))) >> 1)                    \
         + (((d) & ((mask) | ((Uint32)(mask) << 16))) >> 1)                  \
         + ((s) & (d) & (~((mask) | ((Uint32)(mask) << 16)))))
1772
1773 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1774 {
1775         int width = info->d_width;
1776         int height = info->d_height;
1777         Uint16 *srcp = (Uint16 *)info->s_pixels;
1778         int srcskip = info->s_skip >> 1;
1779         Uint16 *dstp = (Uint16 *)info->d_pixels;
1780         int dstskip = info->d_skip >> 1;
1781
1782         while(height--) {
1783                 if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1784                         /*
1785                          * Source and destination not aligned, pipeline it.
1786                          * This is mostly a win for big blits but no loss for
1787                          * small ones
1788                          */
1789                         Uint32 prev_sw;
1790                         int w = width;
1791
1792                         /* handle odd destination */
1793                         if((uintptr_t)dstp & 2) {
1794                                 Uint16 d = *dstp, s = *srcp;
1795                                 *dstp = BLEND16_50(d, s, mask);
1796                                 dstp++;
1797                                 srcp++;
1798                                 w--;
1799                         }
1800                         srcp++; /* srcp is now 32-bit aligned */
1801
1802                         /* bootstrap pipeline with first halfword */
1803                         prev_sw = ((Uint32 *)srcp)[-1];
1804
1805                         while(w > 1) {
1806                                 Uint32 sw, dw, s;
1807                                 sw = *(Uint32 *)srcp;
1808                                 dw = *(Uint32 *)dstp;
1809 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1810                                 s = (prev_sw << 16) + (sw >> 16);
1811 #else
1812                                 s = (prev_sw >> 16) + (sw << 16);
1813 #endif
1814                                 prev_sw = sw;
1815                                 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1816                                 dstp += 2;
1817                                 srcp += 2;
1818                                 w -= 2;
1819                         }
1820
1821                         /* final pixel if any */
1822                         if(w) {
1823                                 Uint16 d = *dstp, s;
1824 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
1825                                 s = (Uint16)prev_sw;
1826 #else
1827                                 s = (Uint16)(prev_sw >> 16);
1828 #endif
1829                                 *dstp = BLEND16_50(d, s, mask);
1830                                 srcp++;
1831                                 dstp++;
1832                         }
1833                         srcp += srcskip - 1;
1834                         dstp += dstskip;
1835                 } else {
1836                         /* source and destination are aligned */
1837                         int w = width;
1838
1839                         /* first odd pixel? */
1840                         if((uintptr_t)srcp & 2) {
1841                                 Uint16 d = *dstp, s = *srcp;
1842                                 *dstp = BLEND16_50(d, s, mask);
1843                                 srcp++;
1844                                 dstp++;
1845                                 w--;
1846                         }
1847                         /* srcp and dstp are now 32-bit aligned */
1848
1849                         while(w > 1) {
1850                                 Uint32 sw = *(Uint32 *)srcp;
1851                                 Uint32 dw = *(Uint32 *)dstp;
1852                                 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1853                                 srcp += 2;
1854                                 dstp += 2;
1855                                 w -= 2;
1856                         }
1857
1858                         /* last odd pixel? */
1859                         if(w) {
1860                                 Uint16 d = *dstp, s = *srcp;
1861                                 *dstp = BLEND16_50(d, s, mask);
1862                                 srcp++;
1863                                 dstp++;
1864                         }
1865                         srcp += srcskip;
1866                         dstp += dstskip;
1867                 }
1868         }
1869 }
1870
1871 #if GCC_ASMBLIT
/*
 * fast RGB565->RGB565 blending with surface alpha
 *
 * An alpha of exactly 128 is handed to Blit16to16SurfaceAlpha128() with
 * the mask 0xf7de.  Otherwise the low three bits of alpha are dropped and
 * every channel is computed as d + ((s - d) * alpha >> 5) with the
 * resulting 5-bit alpha.  Pixels are processed either one at a time in
 * plain C (green moved to the upper 16 bits so one multiply covers all
 * three channels) or four at a time with MMX.
 *
 * MMX register use inside the loop:
 *   mm0 = alpha << 3 replicated into all four 16-bit lanes,
 *   mm4 = green mask, mm7 = blue mask,
 *   mm2/mm3 = four source / destination pixels,
 *   mm5/mm6 = scratch, mm1 = accumulated result.
 *
 * info supplies the pixel pointers, the blit width/height, the per-row
 * skips (shifted right by 1 before being added to the Uint16 pointers)
 * and, through info->src->alpha, the alpha value.
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
        unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
        if(alpha == 128) {
                Blit16to16SurfaceAlpha128(info, 0xf7de);
        } else {
                int width = info->d_width;
                int height = info->d_height;
                Uint16 *srcp = (Uint16 *)info->s_pixels;
                int srcskip = info->s_skip >> 1;
                Uint16 *dstp = (Uint16 *)info->d_pixels;
                int dstskip = info->d_skip >> 1;
                Uint32 s, d;
                Uint64 load;

                alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
                load = alpha;           /* MMX path keeps the truncated, unshifted alpha */
                alpha >>= 3;            /* downscale alpha to 5 bits */

                movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
                punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
                punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
                /* position alpha to allow for mullo and mulhi on diff channels
                   to reduce the number of operations */
                psllq_i2r(3, mm0);

                /* Setup the 565 color channel masks */
                load = 0x07E007E007E007E0ULL;
                movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
                load = 0x001F001F001F001FULL;
                movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
                while(height--) {
                        /* NOTE(review): DUFFS_LOOP_QUATRO2 is defined outside
                           this file; presumably the first body handles one
                           leftover pixel, the second a leftover pair and the
                           third each group of four -- confirm in SDL_blit.h */
                        DUFFS_LOOP_QUATRO2(
                        {
                                /* one pixel, plain C */
                                s = *srcp++;
                                d = *dstp;
                                /*
                                 * shift out the middle component (green) to
                                 * the high 16 bits, and process all three RGB
                                 * components at the same time.
                                 */
                                s = (s | s << 16) & 0x07e0f81f;
                                d = (d | d << 16) & 0x07e0f81f;
                                d += (s - d) * alpha >> 5;
                                d &= 0x07e0f81f;
                                *dstp++ = d | d >> 16;
                        },{
                                /* two pixels, plain C: the single-pixel blend twice */
                                s = *srcp++;
                                d = *dstp;
                                /*
                                 * shift out the middle component (green) to
                                 * the high 16 bits, and process all three RGB
                                 * components at the same time.
                                 */
                                s = (s | s << 16) & 0x07e0f81f;
                                d = (d | d << 16) & 0x07e0f81f;
                                d += (s - d) * alpha >> 5;
                                d &= 0x07e0f81f;
                                *dstp++ = d | d >> 16;
                                s = *srcp++;
                                d = *dstp;
                                /*
                                 * shift out the middle component (green) to
                                 * the high 16 bits, and process all three RGB
                                 * components at the same time.
                                 */
                                s = (s | s << 16) & 0x07e0f81f;
                                d = (d | d << 16) & 0x07e0f81f;
                                d += (s - d) * alpha >> 5;
                                d &= 0x07e0f81f;
                                *dstp++ = d | d >> 16;
                        },{
                                /* four pixels, MMX */
                                movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
                                movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

                                /* red -- does not need a mask since the right shift clears
                                   the uninteresting bits */
                                movq_r2r(mm2, mm5); /* src -> mm5 */
                                movq_r2r(mm3, mm6); /* dst -> mm6 */
                                psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
                                psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

                                /* blend */
                                psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
                                pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
                                /* alpha used is actually 11 bits
                                   11 + 5 = 16 bits, so the sign bits are lost */
                                psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
                                paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
                                psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

                                movq_r2r(mm6, mm1); /* save new reds in dsts */

                                /* green -- process the bits in place */
                                movq_r2r(mm2, mm5); /* src -> mm5 */
                                movq_r2r(mm3, mm6); /* dst -> mm6 */
                                pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
                                pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

                                /* blend */
                                psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
                                pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
                                /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
                                   bits are gone and the sign bits present */
                                psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
                                paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

                                por_r2r(mm6, mm1); /* save new greens in dsts */

                                /* blue */
                                movq_r2r(mm2, mm5); /* src -> mm5 */
                                movq_r2r(mm3, mm6); /* dst -> mm6 */
                                pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
                                pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

                                /* blend */
                                psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
                                pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
                                /* 11 + 5 = 16 bits, so the sign bits are lost and
                                   the interesting bits will need to be MASKed */
                                psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
                                paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
                                pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

                                por_r2r(mm6, mm1); /* save new blues in dsts */

                                movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

                                srcp += 4;
                                dstp += 4;
                        }, width);
                        srcp += srcskip;
                        dstp += dstskip;
                }
                emms(); /* leave MMX state once all rows are done */
        }
}
2010
2011 /* fast RGB555->RGB555 blending with surface alpha */
2012 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2013 {
2014         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2015         if(alpha == 128) {
2016                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2017         } else {
2018                 int width = info->d_width;
2019                 int height = info->d_height;
2020                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2021                 int srcskip = info->s_skip >> 1;
2022                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2023                 int dstskip = info->d_skip >> 1;
2024                 Uint32 s, d;
2025                 Uint64 load;
2026
2027                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2028                 load = alpha;
2029                 alpha >>= 3;            /* downscale alpha to 5 bits */
2030
2031                 movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2032                 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2033                 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2034                 /* position alpha to allow for mullo and mulhi on diff channels
2035                    to reduce the number of operations */
2036                 psllq_i2r(3, mm0);
2037
2038                 /* Setup the 555 color channel masks */
2039                 load = 0x03E003E003E003E0ULL;
2040                 movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2041                 load = 0x001F001F001F001FULL;
2042                 movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2043                 while(height--) {
2044                         DUFFS_LOOP_QUATRO2(
2045                         {
2046                                 s = *srcp++;
2047                                 d = *dstp;
2048                                 /*
2049                                  * shift out the middle component (green) to
2050                                  * the high 16 bits, and process all three RGB
2051                                  * components at the same time.
2052                                  */
2053                                 s = (s | s << 16) & 0x03e07c1f;
2054                                 d = (d | d << 16) & 0x03e07c1f;
2055                                 d += (s - d) * alpha >> 5;
2056                                 d &= 0x03e07c1f;
2057                                 *dstp++ = d | d >> 16;
2058                         },{
2059                                 s = *srcp++;
2060                                 d = *dstp;
2061                                 /*
2062                                  * shift out the middle component (green) to
2063                                  * the high 16 bits, and process all three RGB
2064                                  * components at the same time.
2065                                  */
2066                                 s = (s | s << 16) & 0x03e07c1f;
2067                                 d = (d | d << 16) & 0x03e07c1f;
2068                                 d += (s - d) * alpha >> 5;
2069                                 d &= 0x03e07c1f;
2070                                 *dstp++ = d | d >> 16;
2071                                 s = *srcp++;
2072                                 d = *dstp;
2073                                 /*
2074                                  * shift out the middle component (green) to
2075                                  * the high 16 bits, and process all three RGB
2076                                  * components at the same time.
2077                                  */
2078                                 s = (s | s << 16) & 0x03e07c1f;
2079                                 d = (d | d << 16) & 0x03e07c1f;
2080                                 d += (s - d) * alpha >> 5;
2081                                 d &= 0x03e07c1f;
2082                                 *dstp++ = d | d >> 16;
2083                         },{
2084                                 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2085                                 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2086
2087                                 /* red -- process the bits in place */
2088                                 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2089                                         /* by reusing the GREEN mask we free up another mmx
2090                                            register to accumulate the result */
2091
2092                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2093                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2094                                 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2095                                 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2096
2097                                 /* blend */
2098                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2099                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2100                                 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2101                                    cleared by a MASK below */
2102                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2103                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2104                                 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2105
2106                                 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2107
2108                                 movq_r2r(mm6, mm1); /* save new reds in dsts */
2109
2110                                 /* green -- process the bits in place */
2111                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2112                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2113                                 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2114                                 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2115
2116                                 /* blend */
2117                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2118                                 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2119                                 /* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2120                                    bits are gone and the sign bits present */
2121                                 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2122                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2123
2124                                 por_r2r(mm6, mm1); /* save new greens in dsts */
2125
2126                                 /* blue */
2127                                 movq_r2r(mm2, mm5); /* src -> mm5 */
2128                                 movq_r2r(mm3, mm6); /* dst -> mm6 */
2129                                 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2130                                 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2131
2132                                 /* blend */
2133                                 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2134                                 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2135                                 /* 11 + 5 = 16 bits, so the sign bits are lost and
2136                                    the interesting bits will need to be MASKed */
2137                                 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2138                                 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2139                                 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2140
2141                                 por_r2r(mm6, mm1); /* save new blues in dsts */
2142
2143                                 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2144
2145                                 srcp += 4;
2146                                 dstp += 4;
2147                         }, width);
2148                         srcp += srcskip;
2149                         dstp += dstskip;
2150                 }
2151                 emms();
2152         }
2153 }
2154 /* End GCC_ASMBLIT */
2155
2156 #elif MSVC_ASMBLIT
2157 /* fast RGB565->RGB565 blending with surface alpha */
2158 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2159 {
2160         unsigned alpha = info->src->alpha;
2161         if(alpha == 128) {
2162                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2163         } else {
2164                 int width = info->d_width;
2165                 int height = info->d_height;
2166                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2167                 int srcskip = info->s_skip >> 1;
2168                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2169                 int dstskip = info->d_skip >> 1;
2170                 Uint32 s, d;
2171
2172                 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2173
2174                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2175                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2176                 alpha >>= 3;            /* downscale alpha to 5 bits */
2177
2178                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2179                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2180                 /* position alpha to allow for mullo and mulhi on diff channels
2181                    to reduce the number of operations */
2182                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2183
2184                 /* Setup the 565 color channel masks */
2185                 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2186                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2187
2188                 while(height--) {
2189                         DUFFS_LOOP_QUATRO2(
2190                         {
2191                                 s = *srcp++;
2192                                 d = *dstp;
2193                                 /*
2194                                  * shift out the middle component (green) to
2195                                  * the high 16 bits, and process all three RGB
2196                                  * components at the same time.
2197                                  */
2198                                 s = (s | s << 16) & 0x07e0f81f;
2199                                 d = (d | d << 16) & 0x07e0f81f;
2200                                 d += (s - d) * alpha >> 5;
2201                                 d &= 0x07e0f81f;
2202                                 *dstp++ = (Uint16)(d | d >> 16);
2203                         },{
2204                                 s = *srcp++;
2205                                 d = *dstp;
2206                                 /*
2207                                  * shift out the middle component (green) to
2208                                  * the high 16 bits, and process all three RGB
2209                                  * components at the same time.
2210                                  */
2211                                 s = (s | s << 16) & 0x07e0f81f;
2212                                 d = (d | d << 16) & 0x07e0f81f;
2213                                 d += (s - d) * alpha >> 5;
2214                                 d &= 0x07e0f81f;
2215                                 *dstp++ = (Uint16)(d | d >> 16);
2216                                 s = *srcp++;
2217                                 d = *dstp;
2218                                 /*
2219                                  * shift out the middle component (green) to
2220                                  * the high 16 bits, and process all three RGB
2221                                  * components at the same time.
2222                                  */
2223                                 s = (s | s << 16) & 0x07e0f81f;
2224                                 d = (d | d << 16) & 0x07e0f81f;
2225                                 d += (s - d) * alpha >> 5;
2226                                 d &= 0x07e0f81f;
2227                                 *dstp++ = (Uint16)(d | d >> 16);
2228                         },{
2229                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2230                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2231
2232                                 /* red */
2233                                 src2 = src1;
2234                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2235
2236                                 dst2 = dst1;
2237                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2238
2239                                 /* blend */
2240                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2241                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2242                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2243                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2244                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2245
2246                                 mm_res = dst2; /* RED -> mm_res */
2247
2248                                 /* green -- process the bits in place */
2249                                 src2 = src1;
2250                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2251
2252                                 dst2 = dst1;
2253                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2254
2255                                 /* blend */
2256                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2257                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2258                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2259                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2260
2261                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2262
2263                                 /* blue */
2264                                 src2 = src1;
2265                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2266
2267                                 dst2 = dst1;
2268                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2269
2270                                 /* blend */
2271                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2272                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2273                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2274                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2275                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2276
2277                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2278
2279                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2280
2281                                 srcp += 4;
2282                                 dstp += 4;
2283                         }, width);
2284                         srcp += srcskip;
2285                         dstp += dstskip;
2286                 }
2287                 _mm_empty();
2288         }
2289 }
2290
2291 /* fast RGB555->RGB555 blending with surface alpha */
2292 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2293 {
2294         unsigned alpha = info->src->alpha;
2295         if(alpha == 128) {
2296                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2297         } else {
2298                 int width = info->d_width;
2299                 int height = info->d_height;
2300                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2301                 int srcskip = info->s_skip >> 1;
2302                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2303                 int dstskip = info->d_skip >> 1;
2304                 Uint32 s, d;
2305
2306                 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2307
2308                 alpha &= ~(1+2+4);              /* cut alpha to get the exact same behaviour */
2309                 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2310                 alpha >>= 3;            /* downscale alpha to 5 bits */
2311
2312                 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2313                 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2314                 /* position alpha to allow for mullo and mulhi on diff channels
2315                    to reduce the number of operations */
2316                 mm_alpha = _mm_slli_si64(mm_alpha, 3);
2317
2318                 /* Setup the 555 color channel masks */
2319                 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2320                 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2321                 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2322
2323                 while(height--) {
2324                         DUFFS_LOOP_QUATRO2(
2325                         {
2326                                 s = *srcp++;
2327                                 d = *dstp;
2328                                 /*
2329                                  * shift out the middle component (green) to
2330                                  * the high 16 bits, and process all three RGB
2331                                  * components at the same time.
2332                                  */
2333                                 s = (s | s << 16) & 0x03e07c1f;
2334                                 d = (d | d << 16) & 0x03e07c1f;
2335                                 d += (s - d) * alpha >> 5;
2336                                 d &= 0x03e07c1f;
2337                                 *dstp++ = (Uint16)(d | d >> 16);
2338                         },{
2339                                 s = *srcp++;
2340                                 d = *dstp;
2341                                 /*
2342                                  * shift out the middle component (green) to
2343                                  * the high 16 bits, and process all three RGB
2344                                  * components at the same time.
2345                                  */
2346                                 s = (s | s << 16) & 0x03e07c1f;
2347                                 d = (d | d << 16) & 0x03e07c1f;
2348                                 d += (s - d) * alpha >> 5;
2349                                 d &= 0x03e07c1f;
2350                                 *dstp++ = (Uint16)(d | d >> 16);
2351                                 s = *srcp++;
2352                                 d = *dstp;
2353                                 /*
2354                                  * shift out the middle component (green) to
2355                                  * the high 16 bits, and process all three RGB
2356                                  * components at the same time.
2357                                  */
2358                                 s = (s | s << 16) & 0x03e07c1f;
2359                                 d = (d | d << 16) & 0x03e07c1f;
2360                                 d += (s - d) * alpha >> 5;
2361                                 d &= 0x03e07c1f;
2362                                 *dstp++ = (Uint16)(d | d >> 16);
2363                         },{
2364                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2365                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2366
2367                                 /* red -- process the bits in place */
2368                                 src2 = src1;
2369                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2370
2371                                 dst2 = dst1;
2372                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2373
2374                                 /* blend */
2375                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2376                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2377                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2378                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2379                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2380
2381                                 mm_res = dst2; /* RED -> mm_res */
2382
2383                                 /* green -- process the bits in place */
2384                                 src2 = src1;
2385                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2386
2387                                 dst2 = dst1;
2388                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2389
2390                                 /* blend */
2391                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2392                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2393                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2394                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2395
2396                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2397
2398                                 /* blue */
2399                                 src2 = src1; /* src -> src2 */
2400                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2401
2402                                 dst2 = dst1; /* dst -> dst2 */
2403                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2404
2405                                 /* blend */
2406                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2407                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2408                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2409                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2410                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2411
2412                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2413
2414                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2415
2416                                 srcp += 4;
2417                                 dstp += 4;
2418                         }, width);
2419                         srcp += srcskip;
2420                         dstp += dstskip;
2421                 }
2422                 _mm_empty();
2423         }
2424 }
2425 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2426
2427 /* fast RGB565->RGB565 blending with surface alpha */
2428 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2429 {
2430         unsigned alpha = info->src->alpha;
2431         if(alpha == 128) {
2432                 Blit16to16SurfaceAlpha128(info, 0xf7de);
2433         } else {
2434                 int width = info->d_width;
2435                 int height = info->d_height;
2436                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2437                 int srcskip = info->s_skip >> 1;
2438                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2439                 int dstskip = info->d_skip >> 1;
2440                 alpha >>= 3;    /* downscale alpha to 5 bits */
2441
2442                 while(height--) {
2443                         DUFFS_LOOP4({
2444                                 Uint32 s = *srcp++;
2445                                 Uint32 d = *dstp;
2446                                 /*
2447                                  * shift out the middle component (green) to
2448                                  * the high 16 bits, and process all three RGB
2449                                  * components at the same time.
2450                                  */
2451                                 s = (s | s << 16) & 0x07e0f81f;
2452                                 d = (d | d << 16) & 0x07e0f81f;
2453                                 d += (s - d) * alpha >> 5;
2454                                 d &= 0x07e0f81f;
2455                                 *dstp++ = (Uint16)(d | d >> 16);
2456                         }, width);
2457                         srcp += srcskip;
2458                         dstp += dstskip;
2459                 }
2460         }
2461 }
2462
2463 /* fast RGB555->RGB555 blending with surface alpha */
2464 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2465 {
2466         unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2467         if(alpha == 128) {
2468                 Blit16to16SurfaceAlpha128(info, 0xfbde);
2469         } else {
2470                 int width = info->d_width;
2471                 int height = info->d_height;
2472                 Uint16 *srcp = (Uint16 *)info->s_pixels;
2473                 int srcskip = info->s_skip >> 1;
2474                 Uint16 *dstp = (Uint16 *)info->d_pixels;
2475                 int dstskip = info->d_skip >> 1;
2476                 alpha >>= 3;            /* downscale alpha to 5 bits */
2477
2478                 while(height--) {
2479                         DUFFS_LOOP4({
2480                                 Uint32 s = *srcp++;
2481                                 Uint32 d = *dstp;
2482                                 /*
2483                                  * shift out the middle component (green) to
2484                                  * the high 16 bits, and process all three RGB
2485                                  * components at the same time.
2486                                  */
2487                                 s = (s | s << 16) & 0x03e07c1f;
2488                                 d = (d | d << 16) & 0x03e07c1f;
2489                                 d += (s - d) * alpha >> 5;
2490                                 d &= 0x03e07c1f;
2491                                 *dstp++ = (Uint16)(d | d >> 16);
2492                         }, width);
2493                         srcp += srcskip;
2494                         dstp += dstskip;
2495                 }
2496         }
2497 }
2498
2499 /* fast ARGB8888->RGB565 blending with pixel alpha */
2500 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2501 {
2502         int width = info->d_width;
2503         int height = info->d_height;
2504         Uint32 *srcp = (Uint32 *)info->s_pixels;
2505         int srcskip = info->s_skip >> 2;
2506         Uint16 *dstp = (Uint16 *)info->d_pixels;
2507         int dstskip = info->d_skip >> 1;
2508
2509         while(height--) {
2510             DUFFS_LOOP4({
2511                 Uint32 s = *srcp;
2512                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2513                 /* FIXME: Here we special-case opaque alpha since the
2514                    compositioning used (>>8 instead of /255) doesn't handle
2515                    it correctly. Also special-case alpha=0 for speed?
2516                    Benchmark this! */
2517                 if(alpha) {
2518                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2519                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2520                   } else {
2521                     Uint32 d = *dstp;
2522                     /*
2523                      * convert source and destination to G0RAB65565
2524                      * and blend all components at the same time
2525                      */
2526                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2527                       + (s >> 3 & 0x1f);
2528                     d = (d | d << 16) & 0x07e0f81f;
2529                     d += (s - d) * alpha >> 5;
2530                     d &= 0x07e0f81f;
2531                     *dstp = (Uint16)(d | d >> 16);
2532                   }
2533                 }
2534                 srcp++;
2535                 dstp++;
2536             }, width);
2537             srcp += srcskip;
2538             dstp += dstskip;
2539         }
2540 }
2541
2542 /* fast ARGB8888->RGB555 blending with pixel alpha */
2543 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2544 {
2545         int width = info->d_width;
2546         int height = info->d_height;
2547         Uint32 *srcp = (Uint32 *)info->s_pixels;
2548         int srcskip = info->s_skip >> 2;
2549         Uint16 *dstp = (Uint16 *)info->d_pixels;
2550         int dstskip = info->d_skip >> 1;
2551
2552         while(height--) {
2553             DUFFS_LOOP4({
2554                 unsigned alpha;
2555                 Uint32 s = *srcp;
2556                 alpha = s >> 27; /* downscale alpha to 5 bits */
2557                 /* FIXME: Here we special-case opaque alpha since the
2558                    compositioning used (>>8 instead of /255) doesn't handle
2559                    it correctly. Also special-case alpha=0 for speed?
2560                    Benchmark this! */
2561                 if(alpha) {
2562                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2563                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2564                   } else {
2565                     Uint32 d = *dstp;
2566                     /*
2567                      * convert source and destination to G0RAB65565
2568                      * and blend all components at the same time
2569                      */
2570                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2571                       + (s >> 3 & 0x1f);
2572                     d = (d | d << 16) & 0x03e07c1f;
2573                     d += (s - d) * alpha >> 5;
2574                     d &= 0x03e07c1f;
2575                     *dstp = (Uint16)(d | d >> 16);
2576                   }
2577                 }
2578                 srcp++;
2579                 dstp++;
2580             }, width);
2581             srcp += srcskip;
2582             dstp += dstskip;
2583         }
2584 }
2585
2586 /* General (slow) N->N blending with per-surface alpha */
2587 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2588 {
2589         int width = info->d_width;
2590         int height = info->d_height;
2591         Uint8 *src = info->s_pixels;
2592         int srcskip = info->s_skip;
2593         Uint8 *dst = info->d_pixels;
2594         int dstskip = info->d_skip;
2595         SDL_PixelFormat *srcfmt = info->src;
2596         SDL_PixelFormat *dstfmt = info->dst;
2597         int srcbpp = srcfmt->BytesPerPixel;
2598         int dstbpp = dstfmt->BytesPerPixel;
2599         unsigned sA = srcfmt->alpha;
2600         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2601
2602         if(sA) {
2603           while ( height-- ) {
2604             DUFFS_LOOP4(
2605             {
2606                 Uint32 Pixel;
2607                 unsigned sR;
2608                 unsigned sG;
2609                 unsigned sB;
2610                 unsigned dR;
2611                 unsigned dG;
2612                 unsigned dB;
2613                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2614                 DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2615                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2616                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2617                 src += srcbpp;
2618                 dst += dstbpp;
2619             },
2620             width);
2621             src += srcskip;
2622             dst += dstskip;
2623           }
2624         }
2625 }
2626
2627 /* General (slow) colorkeyed N->N blending with per-surface alpha */
2628 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2629 {
2630         int width = info->d_width;
2631         int height = info->d_height;
2632         Uint8 *src = info->s_pixels;
2633         int srcskip = info->s_skip;
2634         Uint8 *dst = info->d_pixels;
2635         int dstskip = info->d_skip;
2636         SDL_PixelFormat *srcfmt = info->src;
2637         SDL_PixelFormat *dstfmt = info->dst;
2638         Uint32 ckey = srcfmt->colorkey;
2639         int srcbpp = srcfmt->BytesPerPixel;
2640         int dstbpp = dstfmt->BytesPerPixel;
2641         unsigned sA = srcfmt->alpha;
2642         unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2643
2644         while ( height-- ) {
2645             DUFFS_LOOP4(
2646             {
2647                 Uint32 Pixel;
2648                 unsigned sR;
2649                 unsigned sG;
2650                 unsigned sB;
2651                 unsigned dR;
2652                 unsigned dG;
2653                 unsigned dB;
2654                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2655                 if(sA && Pixel != ckey) {
2656                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2657                     DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2658                     ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2659                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2660                 }
2661                 src += srcbpp;
2662                 dst += dstbpp;
2663             },
2664             width);
2665             src += srcskip;
2666             dst += dstskip;
2667         }
2668 }
2669
2670 /* General (slow) N->N blending with pixel alpha */
2671 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2672 {
2673         int width = info->d_width;
2674         int height = info->d_height;
2675         Uint8 *src = info->s_pixels;
2676         int srcskip = info->s_skip;
2677         Uint8 *dst = info->d_pixels;
2678         int dstskip = info->d_skip;
2679         SDL_PixelFormat *srcfmt = info->src;
2680         SDL_PixelFormat *dstfmt = info->dst;
2681
2682         int  srcbpp;
2683         int  dstbpp;
2684
2685         /* Set up some basic variables */
2686         srcbpp = srcfmt->BytesPerPixel;
2687         dstbpp = dstfmt->BytesPerPixel;
2688
2689         /* FIXME: for 8bpp source alpha, this doesn't get opaque values
2690            quite right. for <8bpp source alpha, it gets them very wrong
2691            (check all macros!)
2692            It is unclear whether there is a good general solution that doesn't
2693            need a branch (or a divide). */
2694         while ( height-- ) {
2695             DUFFS_LOOP4(
2696             {
2697                 Uint32 Pixel;
2698                 unsigned sR;
2699                 unsigned sG;
2700                 unsigned sB;
2701                 unsigned dR;
2702                 unsigned dG;
2703                 unsigned dB;
2704                 unsigned sA;
2705                 unsigned dA;
2706                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2707                 if(sA) {
2708                   DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2709                   ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2710                   ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2711                 }
2712                 src += srcbpp;
2713                 dst += dstbpp;
2714             },
2715             width);
2716             src += srcskip;
2717             dst += dstskip;
2718         }
2719 }
2720
2721
2722 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2723 {
2724     SDL_PixelFormat *sf = surface->format;
2725     SDL_PixelFormat *df = surface->map->dst->format;
2726
2727     if(sf->Amask == 0) {
2728         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2729             if(df->BytesPerPixel == 1)
2730                 return BlitNto1SurfaceAlphaKey;
2731             else
2732 #if SDL_ALTIVEC_BLITTERS
2733         if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2734             !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2735             return Blit32to32SurfaceAlphaKeyAltivec;
2736         else
2737 #endif
2738             return BlitNtoNSurfaceAlphaKey;
2739         } else {
2740             /* Per-surface alpha blits */
2741             switch(df->BytesPerPixel) {
2742             case 1:
2743                 return BlitNto1SurfaceAlpha;
2744
2745             case 2:
2746                 if(surface->map->identity) {
2747                     if(df->Gmask == 0x7e0)
2748                     {
2749 #if MMX_ASMBLIT
2750                 if(SDL_HasMMX())
2751                         return Blit565to565SurfaceAlphaMMX;
2752                 else
2753 #endif
2754                         return Blit565to565SurfaceAlpha;
2755                     }
2756                     else if(df->Gmask == 0x3e0)
2757                     {
2758 #if MMX_ASMBLIT
2759                 if(SDL_HasMMX())
2760                         return Blit555to555SurfaceAlphaMMX;
2761                 else
2762 #endif
2763                         return Blit555to555SurfaceAlpha;
2764                     }
2765                 }
2766                 return BlitNtoNSurfaceAlpha;
2767
2768             case 4:
2769                 if(sf->Rmask == df->Rmask
2770                    && sf->Gmask == df->Gmask
2771                    && sf->Bmask == df->Bmask
2772                    && sf->BytesPerPixel == 4)
2773                 {
2774 #if MMX_ASMBLIT
2775                         if(sf->Rshift % 8 == 0
2776                            && sf->Gshift % 8 == 0
2777                            && sf->Bshift % 8 == 0
2778                            && SDL_HasMMX())
2779                             return BlitRGBtoRGBSurfaceAlphaMMX;
2780 #endif
2781                         if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2782                         {
2783 #if SDL_ALTIVEC_BLITTERS
2784                                 if(!(surface->map->dst->flags & SDL_HWSURFACE)
2785                                         && SDL_HasAltiVec())
2786                                         return BlitRGBtoRGBSurfaceAlphaAltivec;
2787 #endif
2788                                 return BlitRGBtoRGBSurfaceAlpha;
2789                         }
2790                 }
2791 #if SDL_ALTIVEC_BLITTERS
2792                 if((sf->BytesPerPixel == 4) &&
2793                    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2794                         return Blit32to32SurfaceAlphaAltivec;
2795                 else
2796 #endif
2797                         return BlitNtoNSurfaceAlpha;
2798
2799             case 3:
2800             default:
2801                 return BlitNtoNSurfaceAlpha;
2802             }
2803         }
2804     } else {
2805         /* Per-pixel alpha blits */
2806         switch(df->BytesPerPixel) {
2807         case 1:
2808             return BlitNto1PixelAlpha;
2809
2810         case 2:
2811 #if SDL_ALTIVEC_BLITTERS
2812         if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2813            df->Gmask == 0x7e0 &&
2814            df->Bmask == 0x1f && SDL_HasAltiVec())
2815             return Blit32to565PixelAlphaAltivec;
2816         else
2817 #endif
2818             if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2819                && sf->Gmask == 0xff00
2820                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2821                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2822                 if(df->Gmask == 0x7e0)
2823                     return BlitARGBto565PixelAlpha;
2824                 else if(df->Gmask == 0x3e0)
2825                     return BlitARGBto555PixelAlpha;
2826             }
2827             return BlitNtoNPixelAlpha;
2828
2829         case 4:
2830             if(sf->Rmask == df->Rmask
2831                && sf->Gmask == df->Gmask
2832                && sf->Bmask == df->Bmask
2833                && sf->BytesPerPixel == 4)
2834             {
2835 #if MMX_ASMBLIT
2836                 if(sf->Rshift % 8 == 0
2837                    && sf->Gshift % 8 == 0
2838                    && sf->Bshift % 8 == 0
2839                    && sf->Ashift % 8 == 0
2840                    && sf->Aloss == 0)
2841                 {
2842                         if(SDL_Has3DNow())
2843                                 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2844                         if(SDL_HasMMX())
2845                                 return BlitRGBtoRGBPixelAlphaMMX;
2846                 }
2847 #endif
2848                 if(sf->Amask == 0xff000000)
2849                 {
2850 #if SDL_ALTIVEC_BLITTERS
2851                         if(!(surface->map->dst->flags & SDL_HWSURFACE)
2852                                 && SDL_HasAltiVec())
2853                                 return BlitRGBtoRGBPixelAlphaAltivec;
2854 #endif
2855                         return BlitRGBtoRGBPixelAlpha;
2856                 }
2857             }
2858 #if SDL_ALTIVEC_BLITTERS
2859             if (sf->Amask && sf->BytesPerPixel == 4 &&
2860                 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2861                 return Blit32to32PixelAlphaAltivec;
2862             else
2863 #endif
2864                 return BlitNtoNPixelAlpha;
2865
2866         case 3:
2867         default:
2868             return BlitNtoNPixelAlpha;
2869         }
2870     }
2871 }
2872