SDL-1.2.14
[sdl_omap.git] / src / video / SDL_RLEaccel.c
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2009 Sam Lantinga
4
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 /*
25  * RLE encoding for software colorkey and alpha-channel acceleration
26  *
27  * Original version by Sam Lantinga
28  *
29  * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
30  * decoder. Added per-surface alpha blitter. Added per-pixel alpha
31  * format, encoder and blitter.
32  *
33  * Many thanks to Xark and johns for hints, benchmarks and useful comments
34  * leading to this code.
35  *
36  * Welcome to Macro Mayhem.
37  */
38
39 /*
40  * The encoding translates the image data to a stream of segments of the form
41  *
42  * <skip> <run> <data>
43  *
44  * where <skip> is the number of transparent pixels to skip,
45  *       <run>  is the number of opaque pixels to blit,
46  * and   <data> are the pixels themselves.
47  *
48  * This basic structure is used both for colorkeyed surfaces, used for simple
49  * binary transparency and for per-surface alpha blending, and for surfaces
50  * with per-pixel alpha. The details differ, however:
51  *
52  * Encoding of colorkeyed surfaces:
53  *
54  *   Encoded pixels always have the same format as the target surface.
55  *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
56  *   where they are 16 bit. This makes the pixel data aligned at all times.
57  *   Segments never wrap around from one scan line to the next.
58  *
59  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
60  *   beginning of a line.
61  *
62  * Encoding of surfaces with per-pixel alpha:
63  *
64  *   The sequence begins with a struct RLEDestFormat describing the target
65  *   pixel format, to provide reliable un-encoding.
66  *
67  *   Each scan line is encoded twice: First all completely opaque pixels,
68  *   encoded in the target format as described above, and then all
69  *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
70  *   in the following 32-bit format:
71  *
72  *   For 32-bit targets, each pixel has the target RGB format but with
73  *   the alpha value occupying the highest 8 bits. The <skip> and <run>
74  *   counts are 16 bit.
75  * 
76  *   For 16-bit targets, each pixel has the target RGB format, but with
77  *   the middle component (usually green) shifted 16 steps to the left,
78  *   and the hole filled with the 5 most significant bits of the alpha value.
79  *   i.e. if the target has the format         rrrrrggggggbbbbb,
80  *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
81  *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
82  *   for the translucent lines. Two padding bytes may be inserted
83  *   before each translucent line to keep them 32-bit aligned.
84  *
85  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
86  *   beginning of an opaque line.
87  */
88
89 #include "SDL_video.h"
90 #include "SDL_sysvideo.h"
91 #include "SDL_blit.h"
92 #include "SDL_RLEaccel_c.h"
93
/*
 * Turn on the MMX blitters only when all of these hold: the compiler
 * identifies itself as GNU C, the target is 32- or 64-bit x86, and
 * SDL_ASSEMBLY_ROUTINES evaluates to non-zero.
 */
#if defined(__GNUC__) && SDL_ASSEMBLY_ROUTINES
# if defined(__x86_64__) || defined(__i386__)
#  define MMX_ASMBLIT
# endif
#endif
97
98 #ifdef MMX_ASMBLIT
99 #include "mmx.h"
100 #include "SDL_cpuinfo.h"
101 #endif
102
/*
 * Fallback two-argument maximum / minimum, defined only when the platform
 * headers have not already provided them.  Each argument may be evaluated
 * more than once, so do not pass expressions with side effects.
 */
#if !defined(MAX)
#define MAX(a, b) ((b) < (a) ? (a) : (b))
#endif
#if !defined(MIN)
#define MIN(a, b) ((b) > (a) ? (a) : (b))
#endif
109
/*
 * Copy `len` pixels of `bpp` bytes each from `from` to `to`.
 *
 * 4-byte pixels are handed to SDL_memcpy4 together with the pixel count;
 * every other depth goes through SDL_memcpy with a byte count of
 * len * bpp.  `bpp` is parenthesized in the comparison so that an
 * expression argument (e.g. `flags & 7`) is compared as a whole.
 */
#define PIXEL_COPY(to, from, len, bpp)                  \
do {                                                    \
    if((bpp) == 4) {                                    \
        SDL_memcpy4(to, from, (size_t)(len));           \
    } else {                                            \
        SDL_memcpy(to, from, (size_t)(len) * (bpp));    \
    }                                                   \
} while(0)
118
119 /*
120  * Various colorkey blit methods, for opaque and per-surface alpha
121  */
122
/*
 * Fully opaque run: nothing to blend, so just copy the pixels.  The last
 * argument is accepted only so this macro takes the same five arguments as
 * the ALPHA_BLIT* macros; it is ignored.
 */
#define OPAQUE_BLIT(dst_px, src_px, count, pixel_bytes, unused_alpha)   \
    PIXEL_COPY(dst_px, src_px, count, pixel_bytes)
125
126 #ifdef MMX_ASMBLIT
127
/*
 * MMX constant-alpha blend of `length` 32-bit pixels from `from` into `to`.
 *
 * Every byte of a pixel is widened to a 16-bit word and blended as
 *     dst = dst + ((src - dst) * alpha >> 8)
 * after which the top byte of each written pixel is cleared.
 *
 * Register use:
 *     mm3 = 0x00FF00FF in both halves (keeps the low byte of each word)
 *     mm4 = alpha replicated into all four words
 *     mm7 = 0xFF000000 in both halves
 *     mm5 = ~mm7, i.e. 0x00FFFFFF in both halves (clears the top byte)
 *
 * An odd count is handled by blending one pixel first; the main loop then
 * consumes two pixels per iteration.  `bpp` is unused.  `alpha` and
 * `length` are parenthesized so expression arguments are taken as a whole.
 * emms() is issued before leaving.
 */
#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)       \
    do {                                                        \
        Uint32 *srcp = (Uint32 *)(from);                        \
        Uint32 *dstp = (Uint32 *)(to);                          \
        int i = 0x00FF00FF;                                     \
        movd_m2r(*(&i), mm3);                                   \
        punpckldq_r2r(mm3, mm3);                                \
        i = 0xFF000000;                                         \
        movd_m2r(*(&i), mm7);                                   \
        punpckldq_r2r(mm7, mm7);                                \
        i = (alpha) | (alpha) << 16;                            \
        movd_m2r(*(&i), mm4);                                   \
        punpckldq_r2r(mm4, mm4);                                \
        pcmpeqd_r2r(mm5,mm5); /* set mm5 to all ones */         \
        pxor_r2r(mm7, mm5); /* make clear alpha mask */         \
        i = (int)(length);                                      \
        if(i & 1) {                                             \
          /* odd count: blend a single pixel first */           \
          movd_m2r((*srcp), mm1); /* src -> mm1 */              \
          punpcklbw_r2r(mm1, mm1);                              \
          pand_r2r(mm3, mm1);                                   \
          movd_m2r((*dstp), mm2); /* dst -> mm2 */              \
          punpcklbw_r2r(mm2, mm2);                              \
          pand_r2r(mm3, mm2);                                   \
          psubw_r2r(mm2, mm1);                                  \
          pmullw_r2r(mm4, mm1);                                 \
          psrlw_i2r(8, mm1);                                    \
          paddw_r2r(mm1, mm2);                                  \
          pand_r2r(mm3, mm2);                                   \
          packuswb_r2r(mm2, mm2);                               \
          pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */             \
          movd_r2m(mm2, *dstp);                                 \
          ++srcp;                                               \
          ++dstp;                                               \
          i--;                                                  \
        }                                                       \
        /* two pixels per pass: i drops by 2 (--i plus i--) */  \
        for(; i > 0; --i) {                                     \
          movq_m2r((*srcp), mm0);                               \
          movq_r2r(mm0, mm1);                                   \
          punpcklbw_r2r(mm0, mm0);                              \
          movq_m2r((*dstp), mm2);                               \
          punpckhbw_r2r(mm1, mm1);                              \
          movq_r2r(mm2, mm6);                                   \
          pand_r2r(mm3, mm0);                                   \
          punpcklbw_r2r(mm2, mm2);                              \
          pand_r2r(mm3, mm1);                                   \
          punpckhbw_r2r(mm6, mm6);                              \
          pand_r2r(mm3, mm2);                                   \
          psubw_r2r(mm2, mm0);                                  \
          pmullw_r2r(mm4, mm0);                                 \
          pand_r2r(mm3, mm6);                                   \
          psubw_r2r(mm6, mm1);                                  \
          pmullw_r2r(mm4, mm1);                                 \
          psrlw_i2r(8, mm0);                                    \
          paddw_r2r(mm0, mm2);                                  \
          psrlw_i2r(8, mm1);                                    \
          paddw_r2r(mm1, mm6);                                  \
          pand_r2r(mm3, mm2);                                   \
          pand_r2r(mm3, mm6);                                   \
          packuswb_r2r(mm2, mm2);                               \
          packuswb_r2r(mm6, mm6);                               \
          psrlq_i2r(32, mm2);                                   \
          psllq_i2r(32, mm6);                                   \
          por_r2r(mm6, mm2);                                    \
          pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */             \
          movq_r2m(mm2, *dstp);                                 \
          srcp += 2;                                            \
          dstp += 2;                                            \
          i--;                                                  \
        }                                                       \
        emms();                                                 \
    } while(0)
199
/*
 * MMX constant-alpha blend of `length` 16-bit RGB565 pixels from `from`
 * into `to`.
 *
 * Register use:
 *     mm1 = 0xF800 (red mask)   in all four words
 *     mm4 = 0x07E0 (green mask) in all four words
 *     mm7 = 0x001F (blue mask)  in all four words
 *     mm0 = alpha with its low 3 bits cleared, in all four words
 *
 * The first (length & 3) pixels are blended one at a time in plain C with
 * a 5-bit alpha; the remainder is blended four pixels per iteration, one
 * colour component after another.  `bpp` is unused.
 *
 * The macro works on a private copy of `alpha`: the argument is no longer
 * modified and does not have to be an lvalue.  emms() is issued before
 * leaving.
 */
#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)       \
    do {                                                \
        int i, n = 0;                                   \
        Uint16 *srcp = (Uint16 *)(from);                \
        Uint16 *dstp = (Uint16 *)(to);                  \
        /* private alpha, low 3 bits dropped */         \
        Uint32 alpha8 = (Uint32)(alpha) & ~(Uint32)(1+2+4); \
        Uint32 ALPHA = 0xF800;                          \
        movd_m2r(*(&ALPHA), mm1);                       \
        punpcklwd_r2r(mm1, mm1);                        \
        punpcklwd_r2r(mm1, mm1);                        \
        ALPHA = 0x07E0;                                 \
        movd_m2r(*(&ALPHA), mm4);                       \
        punpcklwd_r2r(mm4, mm4);                        \
        punpcklwd_r2r(mm4, mm4);                        \
        ALPHA = 0x001F;                                 \
        movd_m2r(*(&ALPHA), mm7);                       \
        punpcklwd_r2r(mm7, mm7);                        \
        punpcklwd_r2r(mm7, mm7);                        \
        i = alpha8 | alpha8 << 16;                      \
        movd_m2r(*(&i), mm0);                           \
        punpckldq_r2r(mm0, mm0);                        \
        ALPHA = alpha8 >> 3;                            \
        i = ((int)(length) & 3);                        \
        /* scalar head: length & 3 pixels */            \
        for(; i > 0; --i) {                             \
            Uint32 s = *srcp++;                         \
            Uint32 d = *dstp;                           \
            s = (s | s << 16) & 0x07e0f81f;             \
            d = (d | d << 16) & 0x07e0f81f;             \
            d += (s - d) * ALPHA >> 5;                  \
            d &= 0x07e0f81f;                            \
            *dstp++ = d | d >> 16;                      \
            n++;                                        \
        }                                               \
        i = (int)(length) - n;                          \
        /* four pixels per pass: i drops by 4 */        \
        for(; i > 0; --i) {                             \
          movq_m2r((*dstp), mm3);                       \
          movq_m2r((*srcp), mm2);                       \
          /* red */                                     \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm1 , mm5);                          \
          psrlq_i2r(11, mm5);                           \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm1 , mm6);                          \
          psrlq_i2r(11, mm6);                           \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          psllq_i2r(11, mm6);                           \
          pand_r2r(mm1, mm6);                           \
          movq_r2r(mm4, mm5);                           \
          por_r2r(mm7, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          /* green */                                   \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm4 , mm5);                          \
          psrlq_i2r(5, mm5);                            \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm4 , mm6);                          \
          psrlq_i2r(5, mm6);                            \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          psllq_i2r(5, mm6);                            \
          pand_r2r(mm4, mm6);                           \
          movq_r2r(mm1, mm5);                           \
          por_r2r(mm7, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          /* blue */                                    \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm7 , mm5);                          \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm7 , mm6);                          \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          pand_r2r(mm7, mm6);                           \
          movq_r2r(mm1, mm5);                           \
          por_r2r(mm4, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          movq_r2m(mm3, *dstp);                         \
          srcp += 4;                                    \
          dstp += 4;                                    \
          i -= 3;                                       \
        }                                               \
        emms();                                         \
    } while(0)
289
/*
 * MMX constant-alpha blend of `length` 16-bit RGB555 pixels from `from`
 * into `to`.
 *
 * Register use:
 *     mm1 = 0x7C00 (red mask)   in all four words
 *     mm4 = 0x03E0 (green mask) in all four words
 *     mm7 = 0x001F (blue mask)  in all four words
 *     mm0 = alpha with its low 3 bits cleared, in all four words
 *
 * The first (length & 3) pixels are blended one at a time in plain C with
 * a 5-bit alpha; the remainder is blended four pixels per iteration, one
 * colour component after another.  `bpp` is unused.
 *
 * The macro works on a private copy of `alpha`: the argument is no longer
 * modified and does not have to be an lvalue.  emms() is issued before
 * leaving.
 */
#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)       \
    do {                                                \
        int i, n = 0;                                   \
        Uint16 *srcp = (Uint16 *)(from);                \
        Uint16 *dstp = (Uint16 *)(to);                  \
        /* private alpha, low 3 bits dropped */         \
        Uint32 alpha8 = (Uint32)(alpha) & ~(Uint32)(1+2+4); \
        Uint32 ALPHA = 0x7C00;                          \
        movd_m2r(*(&ALPHA), mm1);                       \
        punpcklwd_r2r(mm1, mm1);                        \
        punpcklwd_r2r(mm1, mm1);                        \
        ALPHA = 0x03E0;                                 \
        movd_m2r(*(&ALPHA), mm4);                       \
        punpcklwd_r2r(mm4, mm4);                        \
        punpcklwd_r2r(mm4, mm4);                        \
        ALPHA = 0x001F;                                 \
        movd_m2r(*(&ALPHA), mm7);                       \
        punpcklwd_r2r(mm7, mm7);                        \
        punpcklwd_r2r(mm7, mm7);                        \
        i = alpha8 | alpha8 << 16;                      \
        movd_m2r(*(&i), mm0);                           \
        punpckldq_r2r(mm0, mm0);                        \
        i = ((int)(length) & 3);                        \
        ALPHA = alpha8 >> 3;                            \
        /* scalar head: length & 3 pixels */            \
        for(; i > 0; --i) {                             \
            Uint32 s = *srcp++;                         \
            Uint32 d = *dstp;                           \
            s = (s | s << 16) & 0x03e07c1f;             \
            d = (d | d << 16) & 0x03e07c1f;             \
            d += (s - d) * ALPHA >> 5;                  \
            d &= 0x03e07c1f;                            \
            *dstp++ = d | d >> 16;                      \
            n++;                                        \
        }                                               \
        i = (int)(length) - n;                          \
        /* four pixels per pass: i drops by 4 */        \
        for(; i > 0; --i) {                             \
          movq_m2r((*dstp), mm3);                       \
          movq_m2r((*srcp), mm2);                       \
          /* red */                                     \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm1 , mm5);                          \
          psrlq_i2r(10, mm5);                           \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm1 , mm6);                          \
          psrlq_i2r(10, mm6);                           \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          psllq_i2r(10, mm6);                           \
          pand_r2r(mm1, mm6);                           \
          movq_r2r(mm4, mm5);                           \
          por_r2r(mm7, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          /* green */                                   \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm4 , mm5);                          \
          psrlq_i2r(5, mm5);                            \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm4 , mm6);                          \
          psrlq_i2r(5, mm6);                            \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          psllq_i2r(5, mm6);                            \
          pand_r2r(mm4, mm6);                           \
          movq_r2r(mm1, mm5);                           \
          por_r2r(mm7, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          /* blue */                                    \
          movq_r2r(mm2, mm5);                           \
          pand_r2r(mm7 , mm5);                          \
          movq_r2r(mm3, mm6);                           \
          pand_r2r(mm7 , mm6);                          \
          psubw_r2r(mm6, mm5);                          \
          pmullw_r2r(mm0, mm5);                         \
          psrlw_i2r(8, mm5);                            \
          paddw_r2r(mm5, mm6);                          \
          pand_r2r(mm7, mm6);                           \
          movq_r2r(mm1, mm5);                           \
          por_r2r(mm4, mm5);                            \
          pand_r2r(mm5, mm3);                           \
          por_r2r(mm6, mm3);                            \
          movq_r2m(mm3, *dstp);                         \
          srcp += 4;                                    \
          dstp += 4;                                    \
          i -= 3;                                       \
        }                                               \
        emms();                                         \
    } while(0)
379
380 #endif
381
/*
 * For 32bpp pixels of the form 0x00rrggbb:
 * If we treat the middle component separately, we can process the two
 * remaining in parallel. This is safe to do because of the gap to the left
 * of each component, so the bits from the multiplication don't collide.
 * This can be used for any RGB permutation of course.
 *
 * Blends `length` pixels from `from` into `to` as
 *     dst = dst + ((src - dst) * alpha >> 8)
 * per component; the top byte of each written pixel is zero.  `bpp` is
 * unused.  `alpha` is read once into a local, so an expression argument
 * (e.g. `a + b`) is used as a whole rather than being split by operator
 * precedence.
 */
#define ALPHA_BLIT32_888(to, from, length, bpp, alpha)          \
    do {                                                        \
        int i;                                                  \
        Uint32 *src = (Uint32 *)(from);                         \
        Uint32 *dst = (Uint32 *)(to);                           \
        Uint32 ALPHA = (alpha);                                 \
        for(i = 0; i < (int)(length); i++) {                    \
            Uint32 s = *src++;                                  \
            Uint32 d = *dst;                                    \
            /* outer two components, blended together */        \
            Uint32 s1 = s & 0xff00ff;                           \
            Uint32 d1 = d & 0xff00ff;                           \
            d1 = (d1 + ((s1 - d1) * ALPHA >> 8)) & 0xff00ff;    \
            /* middle component on its own */                   \
            s &= 0xff00;                                        \
            d &= 0xff00;                                        \
            d = (d + ((s - d) * ALPHA >> 8)) & 0xff00;          \
            *dst++ = d1 | d;                                    \
        }                                                       \
    } while(0)
406
/*
 * For 16bpp pixels we can go a step further: put the middle component
 * in the high 16 bits of a 32 bit word, and process all three RGB
 * components at the same time. Since the smallest gap is here just
 * 5 bits, we have to scale alpha down to 5 bits as well.
 *
 * Blends `length` RGB565 pixels from `from` into `to`.  `bpp` is unused.
 * `alpha` is parenthesized before the shift so that an expression argument
 * (e.g. `a | b`) is scaled as a whole.
 */
#define ALPHA_BLIT16_565(to, from, length, bpp, alpha)  \
    do {                                                \
        int i;                                          \
        Uint16 *src = (Uint16 *)(from);                 \
        Uint16 *dst = (Uint16 *)(to);                   \
        Uint32 ALPHA = (alpha) >> 3;                    \
        for(i = 0; i < (int)(length); i++) {            \
            Uint32 s = *src++;                          \
            Uint32 d = *dst;                            \
            /* spread to 00000gggggg00000rrrrr000000bbbbb */ \
            s = (s | s << 16) & 0x07e0f81f;             \
            d = (d | d << 16) & 0x07e0f81f;             \
            d += (s - d) * ALPHA >> 5;                  \
            d &= 0x07e0f81f;                            \
            /* fold the middle component back down */   \
            *dst++ = (Uint16)(d | d >> 16);             \
        }                                               \
    } while(0)
429
/*
 * RGB555 variant of the 16bpp blender: the middle component is moved to
 * the high 16 bits of a 32-bit word so that all three components can be
 * blended with one multiply, using alpha scaled down to 5 bits.
 *
 * Blends `length` pixels from `from` into `to`.  `bpp` is unused.
 * `alpha` is parenthesized before the shift so that an expression argument
 * (e.g. `a | b`) is scaled as a whole.
 */
#define ALPHA_BLIT16_555(to, from, length, bpp, alpha)  \
    do {                                                \
        int i;                                          \
        Uint16 *src = (Uint16 *)(from);                 \
        Uint16 *dst = (Uint16 *)(to);                   \
        Uint32 ALPHA = (alpha) >> 3;                    \
        for(i = 0; i < (int)(length); i++) {            \
            Uint32 s = *src++;                          \
            Uint32 d = *dst;                            \
            /* spread to 000000ggggg000000rrrrr00000bbbbb */ \
            s = (s | s << 16) & 0x03e07c1f;             \
            d = (d | d << 16) & 0x03e07c1f;             \
            d += (s - d) * ALPHA >> 5;                  \
            d &= 0x03e07c1f;                            \
            /* fold the middle component back down */   \
            *dst++ = (Uint16)(d | d >> 16);             \
        }                                               \
    } while(0)
446
/*
 * The general slow catch-all function, for remaining depths and formats.
 *
 * Blends `length` pixels of `bpp` bytes (2, 3 or 4) from `from` into `to`:
 * each pixel is split into components with RGB_FROM_PIXEL, blended as
 *     c = c + ((src_c - c) * alpha >> 8)
 * and packed again with PIXEL_FROM_RGB.  A pixel-format pointer named
 * `fmt` must be in scope where the macro is expanded; it is not a macro
 * parameter.
 *
 * Notes:
 *  - The blend is done in unsigned arithmetic, so when the source
 *    component is smaller than the destination one the intermediate sum
 *    carries a stray high bit.  Each result is therefore masked to 8 bits
 *    before packing (this assumes RGB_FROM_PIXEL yields 8-bit components,
 *    as the >> 8 alpha scaling suggests -- confirm against SDL_blit.h).
 *  - Any other `bpp` loads zeros and stores nothing, instead of reading
 *    uninitialized variables.
 *  - `alpha` is read once into a local so expression arguments are used
 *    as a whole.
 */
#define ALPHA_BLIT_ANY(to, from, length, bpp, alpha)                    \
    do {                                                                \
        int i;                                                          \
        Uint8 *src = (from);                                            \
        Uint8 *dst = (to);                                              \
        const unsigned ALPHA = (unsigned)(alpha);                       \
        for(i = 0; i < (int)(length); i++) {                            \
            Uint32 s, d;                                                \
            unsigned rs, gs, bs, rd, gd, bd;                            \
            /* fetch one source and one destination pixel */            \
            switch(bpp) {                                               \
            case 2:                                                     \
                s = *(Uint16 *)src;                                     \
                d = *(Uint16 *)dst;                                     \
                break;                                                  \
            case 3:                                                     \
                if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {                   \
                    s = (src[0] << 16) | (src[1] << 8) | src[2];        \
                    d = (dst[0] << 16) | (dst[1] << 8) | dst[2];        \
                } else {                                                \
                    s = (src[2] << 16) | (src[1] << 8) | src[0];        \
                    d = (dst[2] << 16) | (dst[1] << 8) | dst[0];        \
                }                                                       \
                break;                                                  \
            case 4:                                                     \
                s = *(Uint32 *)src;                                     \
                d = *(Uint32 *)dst;                                     \
                break;                                                  \
            default:                                                    \
                s = d = 0;                                              \
                break;                                                  \
            }                                                           \
            RGB_FROM_PIXEL(s, fmt, rs, gs, bs);                         \
            RGB_FROM_PIXEL(d, fmt, rd, gd, bd);                         \
            /* mask drops the carry left by a negative difference */    \
            rd = (rd + ((rs - rd) * ALPHA >> 8)) & 0xff;                \
            gd = (gd + ((gs - gd) * ALPHA >> 8)) & 0xff;                \
            bd = (bd + ((bs - bd) * ALPHA >> 8)) & 0xff;                \
            PIXEL_FROM_RGB(d, fmt, rd, gd, bd);                         \
            /* store the blended pixel */                               \
            switch(bpp) {                                               \
            case 2:                                                     \
                *(Uint16 *)dst = (Uint16)d;                             \
                break;                                                  \
            case 3:                                                     \
                if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {                   \
                    dst[0] = (Uint8)(d >> 16);                          \
                    dst[1] = (Uint8)(d >> 8);                           \
                    dst[2] = (Uint8)(d);                                \
                } else {                                                \
                    dst[0] = (Uint8)d;                                  \
                    dst[1] = (Uint8)(d >> 8);                           \
                    dst[2] = (Uint8)(d >> 16);                          \
                }                                                       \
                break;                                                  \
            case 4:                                                     \
                *(Uint32 *)dst = d;                                     \
                break;                                                  \
            default:                                                    \
                break;                                                  \
            }                                                           \
            src += (bpp);                                               \
            dst += (bpp);                                               \
        }                                                               \
    } while(0)
506
507 #ifdef MMX_ASMBLIT
508
/*
 * MMX 50% blend of `length` 32-bit pixels from `from` into `to`:
 *     dst = (((src & 0x00fefefe) + (dst & 0x00fefefe)) >> 1)
 *           + (src & dst & 0x00010101)
 *
 * mm4 holds 0x00fefefe and mm3 holds 0x00010101, each in both halves.
 * An odd count is handled by blending one pixel in plain C first; the
 * loop then blends two pixels per pass.  `bpp` and `alpha` are unused.
 * emms() is issued before leaving.
 */
#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)            \
    do {                                                                \
        Uint32 *sp = (Uint32 *)(from);                                  \
        Uint32 *dp = (Uint32 *)(to);                                    \
        int k = 0x00fefefe;                                             \
        movd_m2r(*(&k), mm4);                                           \
        punpckldq_r2r(mm4, mm4);                                        \
        k = 0x00010101;                                                 \
        movd_m2r(*(&k), mm3);                                           \
        punpckldq_r2r(mm3, mm3);                                        \
        k = (int)(length);                                              \
        if((k & 1) != 0) {                                              \
            /* leftover pixel of an odd run */                          \
            const Uint32 s = *sp;                                       \
            const Uint32 d = *dp;                                       \
            *dp = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)          \
                  + (s & d & 0x00010101);                               \
            ++sp;                                                       \
            ++dp;                                                       \
            --k;                                                        \
        }                                                               \
        while(k > 0) {                                                  \
            movq_m2r((*dp), mm2);  /* dst -> mm2 */                     \
            movq_r2r(mm2, mm6);    /* dst -> mm6 */                     \
            movq_m2r((*sp), mm1);  /* src -> mm1 */                     \
            movq_r2r(mm1, mm5);    /* src -> mm5 */                     \
            pand_r2r(mm4, mm6);    /* dst & 0x00fefefe -> mm6 */        \
            pand_r2r(mm4, mm5);    /* src & 0x00fefefe -> mm5 */        \
            paddd_r2r(mm6, mm5);   /* masked src + masked dst -> mm5 */ \
            psrld_i2r(1, mm5);     /* halve the sum */                  \
            pand_r2r(mm1, mm2);    /* src & dst -> mm2 */               \
            pand_r2r(mm3, mm2);    /* src & dst & 0x00010101 -> mm2 */  \
            paddd_r2r(mm5, mm2);                                        \
            movq_r2m(mm2, (*dp));                                       \
            dp += 2;                                                    \
            sp += 2;                                                    \
            k -= 2;                                                     \
        }                                                               \
        emms();                                                         \
    } while(0)
546
547 #endif
548     
/*
 * Special case: 50% alpha (alpha=128).
 * Worth its own blitter because it is cheap and covers many uses of
 * semi-translucency.  All three components are averaged in one go:
 * clearing the lowest bit of every component leaves room for the sum,
 * which is then halved; the bit that was dropped is added back wherever
 * both source and destination had it set.
 *
 * Blends `length` 32-bit pixels from `from` into `to`.  `bpp` and `alpha`
 * are unused.
 */
#define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha)               \
    do {                                                                \
        const Uint32 *sp = (Uint32 *)(from);                            \
        Uint32 *dp = (Uint32 *)(to);                                    \
        int remaining;                                                  \
        for(remaining = (int)(length); remaining > 0; --remaining) {    \
            const Uint32 s = *sp++;                                     \
            const Uint32 d = *dp;                                       \
            const Uint32 halved =                                       \
                ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1;             \
            const Uint32 shared_low = s & d & 0x00010101;               \
            *dp++ = halved + shared_low;                                \
        }                                                               \
    } while(0)
569
570 /*
571  * For 16bpp, we can actually blend two pixels in parallel, if we take
572  * care to shift before we add, not after.
573  */
574
/*
 * helper: blend a single 16 bit pixel at 50%
 *
 * dst, src: pixel pointers (modifiable lvalues); both are advanced by one
 *           pixel
 * mask:     the bits to keep when adding (the callers in this file pass a
 *           constant with the lowest bit of every component cleared)
 *
 * Every use of an argument is parenthesized, so `mask` may be an arbitrary
 * expression such as `a | b`. The temporaries carry a prefix so they cannot
 * capture identifiers that appear in the arguments.
 */
#define BLEND16_50(dst, src, mask)                                      \
    do {                                                                \
        Uint32 blend16_s = *(src)++;                                    \
        Uint32 blend16_d = *(dst);                                      \
        *(dst)++ = (Uint16)((((blend16_s & (mask))                      \
                              + (blend16_d & (mask))) >> 1)             \
                            + (blend16_s & blend16_d                    \
                               & (~(mask) & 0xffff)));                  \
    } while(0)
583
/*
 * basic 16bpp blender. mask is the pixels to keep when adding.
 *
 * to, from:   destination and source buffers holding 16 bit pixels
 * length:     number of pixels to blend; zero is allowed and does nothing
 * bpp, alpha: not used, present so all span blitters share one signature
 * mask:       16 bit mask handed to BLEND16_50; may be any expression
 *
 * When source and destination differ in their alignment modulo 4 the
 * pixels are blended one at a time. Otherwise an optional leading pixel
 * brings both pointers to a 4-byte boundary, pixel pairs are blended with
 * one 32 bit access each, and an optional trailing pixel finishes the span.
 *
 * The mask is widened to Uint32 before it is shifted: shifting the plain
 * int constant (e.g. 0xf7de << 16) would overflow a signed int.
 */
#define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask)             \
    do {                                                                \
        unsigned blit16_n = (length);                                   \
        Uint16 *blit16_src = (Uint16 *)(from);                          \
        Uint16 *blit16_dst = (Uint16 *)(to);                            \
        const Uint32 blit16_m = (Uint32)(mask);                         \
        if(((uintptr_t)blit16_src ^ (uintptr_t)blit16_dst) & 3) {       \
            /* source and destination not in phase, blit one by one */  \
            while(blit16_n--)                                           \
                BLEND16_50(blit16_dst, blit16_src, blit16_m);           \
        } else {                                                        \
            /* the same mask for both pixels of a 32 bit word */        \
            const Uint32 blit16_mm = blit16_m | (blit16_m << 16);       \
            if(blit16_n && ((uintptr_t)blit16_src & 3)) {               \
                /* first odd pixel (never taken for an empty span) */   \
                BLEND16_50(blit16_dst, blit16_src, blit16_m);           \
                blit16_n--;                                             \
            }                                                           \
            for(; blit16_n > 1; blit16_n -= 2) {                        \
                Uint32 blit16_s = *(Uint32 *)blit16_src;                \
                Uint32 blit16_d = *(Uint32 *)blit16_dst;                \
                /* shift before adding so the halves cannot mix */      \
                *(Uint32 *)blit16_dst = ((blit16_s & blit16_mm) >> 1)   \
                                      + ((blit16_d & blit16_mm) >> 1)   \
                                      + (blit16_s & blit16_d            \
                                         & ~blit16_mm);                 \
                blit16_src += 2;                                        \
                blit16_dst += 2;                                        \
            }                                                           \
            if(blit16_n)                                                \
                BLEND16_50(blit16_dst, blit16_src, blit16_m); /* last odd pixel */ \
        }                                                               \
    } while(0)
613
/*
 * 50% blenders for the two common 16bpp layouts. Each one forwards to
 * ALPHA_BLIT16_50 with the mask for its layout:
 *   0xf7de clears bits 0, 5 and 11,
 *   0xfbde clears bits 0, 5 and 10.
 * NOTE(review): 0xfbde also keeps bit 15, which is not covered by the
 * 0x7fff colour masks this blender is selected for - presumably that bit is
 * always zero in such pixels; confirm.
 */
#define RLE_HALF_MASK_565 0xf7de
#define RLE_HALF_MASK_555 0xfbde

#define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha)               \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, RLE_HALF_MASK_565)

#define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)               \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, RLE_HALF_MASK_555)
619
620 #ifdef MMX_ASMBLIT
621
/*
 * Pick the span blitter for a per-surface alpha value and a pixel format
 * and instantiate `blitter(bpp, CountType, do_blit)` with it (MMX build).
 *
 *   alpha == 255: OPAQUE_BLIT, whatever the pixel size
 *   8bpp:         nothing is instantiated for any other alpha
 *   16bpp:        565 and 555 layouts get a dedicated 50% blender for
 *                 alpha == 128, else the MMX blender if SDL_HasMMX()
 *                 reports support, else the plain C one; every other
 *                 layout uses ALPHA_BLIT_ANY
 *   24bpp:        ALPHA_BLIT_ANY
 *   32bpp:        the 888 blenders (50% and general, MMX or plain C) when
 *                 the colour masks cover exactly 0x00ffffff and one of
 *                 them is 0xff00; ALPHA_BLIT_ANY otherwise
 *
 * CountType is Uint8 for 1 to 3 bytes per pixel and Uint16 for 4.
 * Each (bpp, do_blit) combination is instantiated exactly once, so blitter
 * bodies that contain labels remain valid.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt)                                \
    do {                                                                \
        if((alpha) == 255) {                                            \
            switch((fmt)->BytesPerPixel) {                              \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break;              \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break;              \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break;              \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break;             \
            }                                                           \
        } else if((fmt)->BytesPerPixel == 2) {                          \
            if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask) == 0xffff   \
               && ((fmt)->Gmask == 0x07e0 || (fmt)->Rmask == 0x07e0     \
                   || (fmt)->Bmask == 0x07e0)) {                        \
                /* 16 colour bits with a 6 bit middle component */      \
                if((alpha) == 128) {                                    \
                    blitter(2, Uint8, ALPHA_BLIT16_565_50);             \
                } else if(SDL_HasMMX()) {                               \
                    blitter(2, Uint8, ALPHA_BLIT16_565MMX);             \
                } else {                                                \
                    blitter(2, Uint8, ALPHA_BLIT16_565);                \
                }                                                       \
            } else if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask)      \
                          == 0x7fff                                     \
                      && ((fmt)->Gmask == 0x03e0                        \
                          || (fmt)->Rmask == 0x03e0                     \
                          || (fmt)->Bmask == 0x03e0)) {                 \
                /* 15 colour bits with a 5 bit middle component */      \
                if((alpha) == 128) {                                    \
                    blitter(2, Uint8, ALPHA_BLIT16_555_50);             \
                } else if(SDL_HasMMX()) {                               \
                    blitter(2, Uint8, ALPHA_BLIT16_555MMX);             \
                } else {                                                \
                    blitter(2, Uint8, ALPHA_BLIT16_555);                \
                }                                                       \
            } else {                                                    \
                blitter(2, Uint8, ALPHA_BLIT_ANY);                      \
            }                                                           \
        } else if((fmt)->BytesPerPixel == 3) {                          \
            blitter(3, Uint8, ALPHA_BLIT_ANY);                          \
        } else if((fmt)->BytesPerPixel == 4) {                          \
            if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask)             \
                   != 0x00ffffff                                        \
               || ((fmt)->Gmask != 0xff00 && (fmt)->Rmask != 0xff00     \
                   && (fmt)->Bmask != 0xff00)) {                        \
                blitter(4, Uint16, ALPHA_BLIT_ANY);                     \
            } else if((alpha) == 128) {                                 \
                if(SDL_HasMMX()) {                                      \
                    blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);         \
                } else {                                                \
                    blitter(4, Uint16, ALPHA_BLIT32_888_50);            \
                }                                                       \
            } else {                                                    \
                if(SDL_HasMMX()) {                                      \
                    blitter(4, Uint16, ALPHA_BLIT32_888MMX);            \
                } else {                                                \
                    blitter(4, Uint16, ALPHA_BLIT32_888);               \
                }                                                       \
            }                                                           \
        }                                                               \
    } while(0)
705
706 #else
707         
/*
 * Pick the span blitter for a per-surface alpha value and a pixel format
 * and instantiate `blitter(bpp, CountType, do_blit)` with it (build
 * without the MMX blitters).
 *
 *   alpha == 255: OPAQUE_BLIT, whatever the pixel size
 *   8bpp:         nothing is instantiated for any other alpha
 *   16bpp:        565 and 555 layouts get a dedicated 50% blender for
 *                 alpha == 128 and a layout specific blender otherwise;
 *                 every other layout uses ALPHA_BLIT_ANY
 *   24bpp:        ALPHA_BLIT_ANY
 *   32bpp:        the 888 blenders when the colour masks cover exactly
 *                 0x00ffffff and one of them is 0xff00; ALPHA_BLIT_ANY
 *                 otherwise
 *
 * CountType is Uint8 for 1 to 3 bytes per pixel and Uint16 for 4.
 * Each (bpp, do_blit) combination is instantiated exactly once, so blitter
 * bodies that contain labels remain valid.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt)                                \
    do {                                                                \
        if((alpha) == 255) {                                            \
            switch((fmt)->BytesPerPixel) {                              \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break;              \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break;              \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break;              \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break;             \
            }                                                           \
        } else if((fmt)->BytesPerPixel == 2) {                          \
            if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask) == 0xffff   \
               && ((fmt)->Gmask == 0x07e0 || (fmt)->Rmask == 0x07e0     \
                   || (fmt)->Bmask == 0x07e0)) {                        \
                /* 16 colour bits with a 6 bit middle component */      \
                if((alpha) == 128) {                                    \
                    blitter(2, Uint8, ALPHA_BLIT16_565_50);             \
                } else {                                                \
                    blitter(2, Uint8, ALPHA_BLIT16_565);                \
                }                                                       \
            } else if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask)      \
                          == 0x7fff                                     \
                      && ((fmt)->Gmask == 0x03e0                        \
                          || (fmt)->Rmask == 0x03e0                     \
                          || (fmt)->Bmask == 0x03e0)) {                 \
                /* 15 colour bits with a 5 bit middle component */      \
                if((alpha) == 128) {                                    \
                    blitter(2, Uint8, ALPHA_BLIT16_555_50);             \
                } else {                                                \
                    blitter(2, Uint8, ALPHA_BLIT16_555);                \
                }                                                       \
            } else {                                                    \
                blitter(2, Uint8, ALPHA_BLIT_ANY);                      \
            }                                                           \
        } else if((fmt)->BytesPerPixel == 3) {                          \
            blitter(3, Uint8, ALPHA_BLIT_ANY);                          \
        } else if((fmt)->BytesPerPixel == 4) {                          \
            if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask)             \
                   != 0x00ffffff                                        \
               || ((fmt)->Gmask != 0xff00 && (fmt)->Rmask != 0xff00     \
                   && (fmt)->Bmask != 0xff00)) {                        \
                blitter(4, Uint16, ALPHA_BLIT_ANY);                     \
            } else if((alpha) == 128) {                                 \
                blitter(4, Uint16, ALPHA_BLIT32_888_50);                \
            } else {                                                    \
                blitter(4, Uint16, ALPHA_BLIT32_888);                   \
            }                                                           \
        }                                                               \
    } while(0)
775
776 #endif
777
778 /*
779  * This takes care of the case when the surface is clipped on the left and/or
780  * right. Top clipping has already been taken care of.
781  */
782 static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
783                         Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
784 {
785     SDL_PixelFormat *fmt = dst->format;
786
787 #define RLECLIPBLIT(bpp, Type, do_blit)                                    \
788     do {                                                                   \
789         int linecount = srcrect->h;                                        \
790         int ofs = 0;                                                       \
791         int left = srcrect->x;                                             \
792         int right = left + srcrect->w;                                     \
793         dstbuf -= left * bpp;                                              \
794         for(;;) {                                                          \
795             int run;                                                       \
796             ofs += *(Type *)srcbuf;                                        \
797             run = ((Type *)srcbuf)[1];                                     \
798             srcbuf += 2 * sizeof(Type);                                    \
799             if(run) {                                                      \
800                 /* clip to left and right borders */                       \
801                 if(ofs < right) {                                          \
802                     int start = 0;                                         \
803                     int len = run;                                         \
804                     int startcol;                                          \
805                     if(left - ofs > 0) {                                   \
806                         start = left - ofs;                                \
807                         len -= start;                                      \
808                         if(len <= 0)                                       \
809                             goto nocopy ## bpp ## do_blit;                 \
810                     }                                                      \
811                     startcol = ofs + start;                                \
812                     if(len > right - startcol)                             \
813                         len = right - startcol;                            \
814                     do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
815                             len, bpp, alpha);                              \
816                 }                                                          \
817             nocopy ## bpp ## do_blit:                                      \
818                 srcbuf += run * bpp;                                       \
819                 ofs += run;                                                \
820             } else if(!ofs)                                                \
821                 break;                                                     \
822             if(ofs == w) {                                                 \
823                 ofs = 0;                                                   \
824                 dstbuf += dst->pitch;                                      \
825                 if(!--linecount)                                           \
826                     break;                                                 \
827             }                                                              \
828         }                                                                  \
829     } while(0)
830
831     CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
832
833 #undef RLECLIPBLIT
834
835 }
836
837
838 /* blit a colorkeyed RLE surface */
839 int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
840                 SDL_Surface *dst, SDL_Rect *dstrect)
841 {
842         Uint8 *dstbuf;
843         Uint8 *srcbuf;
844         int x, y;
845         int w = src->w;
846         unsigned alpha;
847
848         /* Lock the destination if necessary */
849         if ( SDL_MUSTLOCK(dst) ) {
850                 if ( SDL_LockSurface(dst) < 0 ) {
851                         return(-1);
852                 }
853         }
854
855         /* Set up the source and destination pointers */
856         x = dstrect->x;
857         y = dstrect->y;
858         dstbuf = (Uint8 *)dst->pixels
859                  + y * dst->pitch + x * src->format->BytesPerPixel;
860         srcbuf = (Uint8 *)src->map->sw_data->aux_data;
861
862         {
863             /* skip lines at the top if neccessary */
864             int vskip = srcrect->y;
865             int ofs = 0;
866             if(vskip) {
867
868 #define RLESKIP(bpp, Type)                      \
869                 for(;;) {                       \
870                     int run;                    \
871                     ofs += *(Type *)srcbuf;     \
872                     run = ((Type *)srcbuf)[1];  \
873                     srcbuf += sizeof(Type) * 2; \
874                     if(run) {                   \
875                         srcbuf += run * bpp;    \
876                         ofs += run;             \
877                     } else if(!ofs)             \
878                         goto done;              \
879                     if(ofs == w) {              \
880                         ofs = 0;                \
881                         if(!--vskip)            \
882                             break;              \
883                     }                           \
884                 }
885
886                 switch(src->format->BytesPerPixel) {
887                 case 1: RLESKIP(1, Uint8); break;
888                 case 2: RLESKIP(2, Uint8); break;
889                 case 3: RLESKIP(3, Uint8); break;
890                 case 4: RLESKIP(4, Uint16); break;
891                 }
892
893 #undef RLESKIP
894
895             }
896         }
897
898         alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
899                 ? src->format->alpha : 255;
900         /* if left or right edge clipping needed, call clip blit */
901         if ( srcrect->x || srcrect->w != src->w ) {
902             RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
903         } else {
904             SDL_PixelFormat *fmt = src->format;
905
906 #define RLEBLIT(bpp, Type, do_blit)                                           \
907             do {                                                              \
908                 int linecount = srcrect->h;                                   \
909                 int ofs = 0;                                                  \
910                 for(;;) {                                                     \
911                     unsigned run;                                             \
912                     ofs += *(Type *)srcbuf;                                   \
913                     run = ((Type *)srcbuf)[1];                                \
914                     srcbuf += 2 * sizeof(Type);                               \
915                     if(run) {                                                 \
916                         do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
917                         srcbuf += run * bpp;                                  \
918                         ofs += run;                                           \
919                     } else if(!ofs)                                           \
920                         break;                                                \
921                     if(ofs == w) {                                            \
922                         ofs = 0;                                              \
923                         dstbuf += dst->pitch;                                 \
924                         if(!--linecount)                                      \
925                             break;                                            \
926                     }                                                         \
927                 }                                                             \
928             } while(0)
929
930             CHOOSE_BLIT(RLEBLIT, alpha, fmt);
931
932 #undef RLEBLIT
933         }
934
935 done:
936         /* Unlock the destination if necessary */
937         if ( SDL_MUSTLOCK(dst) ) {
938                 SDL_UnlockSurface(dst);
939         }
940         return(0);
941 }
942
943 #undef OPAQUE_BLIT
944
945 /*
946  * Per-pixel blitting macros for translucent pixels:
947  * These use the same techniques as the per-surface blitting macros
948  */
949
/*
 * For 32bpp pixels the alpha value sits in the top 8 bits of the source
 * pixel. Two components (mask 0xff00ff) are blended with a single
 * multiplication, the middle one (mask 0xff00) with a second:
 *     result = d + (s - d) * alpha / 256     for each component
 * The subtraction relies on unsigned wrap-around; the final masks remove
 * whatever spilled outside a component. The top byte of the stored pixel
 * is zero.
 *
 * src: source pixel value, dst: destination pixel (a modifiable lvalue)
 */
#define BLIT_TRANSL_888(src, dst)                               \
    do {                                                        \
        const Uint32 t888_s = (src);                            \
        const Uint32 t888_d = (dst);                            \
        const unsigned t888_a = t888_s >> 24;                   \
        const Uint32 t888_s_rb = t888_s & 0xff00ff;             \
        const Uint32 t888_s_g = t888_s & 0xff00;                \
        Uint32 t888_rb = t888_d & 0xff00ff;                     \
        Uint32 t888_g = t888_d & 0xff00;                        \
        t888_rb += (t888_s_rb - t888_rb) * t888_a >> 8;         \
        t888_g += (t888_s_g - t888_g) * t888_a >> 8;            \
        (dst) = (t888_rb & 0xff00ff) | (t888_g & 0xff00);       \
    } while(0)
967
/*
 * For 16bpp pixels, the 5 most significant alpha bits are stored in
 * bits 5-9 of the 32 bit source value (extracted with 0x3e0). The colour
 * components of the source are already spread out over the 32 bit word
 * (mask 0x07e0f81f); the destination pixel is spread the same way, so all
 * 3 components are blended with one multiplication:
 *     result = d + (s - d) * alpha / 32
 * and then folded back into 16 bits.
 *
 * src: 32 bit source value, dst: 16 bit destination pixel (a modifiable
 * lvalue)
 */
#define BLIT_TRANSL_565(src, dst)                               \
    do {                                                        \
        const Uint32 t565_in = (src);                           \
        const unsigned t565_a = (t565_in & 0x3e0) >> 5;         \
        const Uint32 t565_s = t565_in & 0x07e0f81f;             \
        Uint32 t565_d = (dst);                                  \
        t565_d = (t565_d | t565_d << 16) & 0x07e0f81f;          \
        t565_d = (t565_d + ((t565_s - t565_d) * t565_a >> 5))   \
                 & 0x07e0f81f;                                  \
        (dst) = (Uint16)(t565_d | t565_d >> 16);                \
    } while(0)
983
/*
 * Same scheme for the 555 layout: alpha is taken from bits 5-9 of the
 * 32 bit source value, the components are spread out with the mask
 * 0x03e07c1f, blended as d + (s - d) * alpha / 32 in one multiplication
 * and folded back into 16 bits.
 *
 * src: 32 bit source value, dst: 16 bit destination pixel (a modifiable
 * lvalue)
 */
#define BLIT_TRANSL_555(src, dst)                               \
    do {                                                        \
        const Uint32 t555_in = (src);                           \
        const unsigned t555_a = (t555_in & 0x3e0) >> 5;         \
        const Uint32 t555_s = t555_in & 0x03e07c1f;             \
        Uint32 t555_d = (dst);                                  \
        t555_d = (t555_d | t555_d << 16) & 0x03e07c1f;          \
        t555_d = (t555_d + ((t555_s - t555_d) * t555_a >> 5))   \
                 & 0x03e07c1f;                                  \
        (dst) = (Uint16)(t555_d | t555_d >> 16);                \
    } while(0)
995
996 /* used to save the destination format in the encoding. Designed to be
997    macro-compatible with SDL_PixelFormat but without the unneeded fields */
998 typedef struct {
999         Uint8  BytesPerPixel;
1000         Uint8  Rloss;
1001         Uint8  Gloss;
1002         Uint8  Bloss;
1003         Uint8  Rshift;
1004         Uint8  Gshift;
1005         Uint8  Bshift;
1006         Uint8  Ashift;
1007         Uint32 Rmask;
1008         Uint32 Gmask;
1009         Uint32 Bmask;
1010         Uint32 Amask;
1011 } RLEDestFormat;
1012
1013 /* blit a pixel-alpha RLE surface clipped at the right and/or left edges */
1014 static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
1015                              Uint8 *dstbuf, SDL_Rect *srcrect)
1016 {
1017     SDL_PixelFormat *df = dst->format;
1018     /*
1019      * clipped blitter: Ptype is the destination pixel type,
1020      * Ctype the translucent count type, and do_blend the macro
1021      * to blend one pixel.
1022      */
1023 #define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)                          \
1024     do {                                                                  \
1025         int linecount = srcrect->h;                                       \
1026         int left = srcrect->x;                                            \
1027         int right = left + srcrect->w;                                    \
1028         dstbuf -= left * sizeof(Ptype);                                   \
1029         do {                                                              \
1030             int ofs = 0;                                                  \
1031             /* blit opaque pixels on one line */                          \
1032             do {                                                          \
1033                 unsigned run;                                             \
1034                 ofs += ((Ctype *)srcbuf)[0];                              \
1035                 run = ((Ctype *)srcbuf)[1];                               \
1036                 srcbuf += 2 * sizeof(Ctype);                              \
1037                 if(run) {                                                 \
1038                     /* clip to left and right borders */                  \
1039                     int cofs = ofs;                                       \
1040                     int crun = run;                                       \
1041                     if(left - cofs > 0) {                                 \
1042                         crun -= left - cofs;                              \
1043                         cofs = left;                                      \
1044                     }                                                     \
1045                     if(crun > right - cofs)                               \
1046                         crun = right - cofs;                              \
1047                     if(crun > 0)                                          \
1048                         PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),         \
1049                                    srcbuf + (cofs - ofs) * sizeof(Ptype), \
1050                                    (unsigned)crun, sizeof(Ptype));        \
1051                     srcbuf += run * sizeof(Ptype);                        \
1052                     ofs += run;                                           \
1053                 } else if(!ofs)                                           \
1054                     return;                                               \
1055             } while(ofs < w);                                             \
1056             /* skip padding if necessary */                               \
1057             if(sizeof(Ptype) == 2)                                        \
1058                 srcbuf += (uintptr_t)srcbuf & 2;                          \
1059             /* blit translucent pixels on the same line */                \
1060             ofs = 0;                                                      \
1061             do {                                                          \
1062                 unsigned run;                                             \
1063                 ofs += ((Uint16 *)srcbuf)[0];                             \
1064                 run = ((Uint16 *)srcbuf)[1];                              \
1065                 srcbuf += 4;                                              \
1066                 if(run) {                                                 \
1067                     /* clip to left and right borders */                  \
1068                     int cofs = ofs;                                       \
1069                     int crun = run;                                       \
1070                     if(left - cofs > 0) {                                 \
1071                         crun -= left - cofs;                              \
1072                         cofs = left;                                      \
1073                     }                                                     \
1074                     if(crun > right - cofs)                               \
1075                         crun = right - cofs;                              \
1076                     if(crun > 0) {                                        \
1077                         Ptype *dst = (Ptype *)dstbuf + cofs;              \
1078                         Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);    \
1079                         int i;                                            \
1080                         for(i = 0; i < crun; i++)                         \
1081                             do_blend(src[i], dst[i]);                     \
1082                     }                                                     \
1083                     srcbuf += run * 4;                                    \
1084                     ofs += run;                                           \
1085                 }                                                         \
1086             } while(ofs < w);                                             \
1087             dstbuf += dst->pitch;                                         \
1088         } while(--linecount);                                             \
1089     } while(0)
1090
1091     switch(df->BytesPerPixel) {
1092     case 2:
1093         if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1094            || df->Bmask == 0x07e0)
1095             RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
1096         else
1097             RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
1098         break;
1099     case 4:
1100         RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
1101         break;
1102     }
1103 }
1104
1105 /* blit a pixel-alpha RLE surface */
1106 int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
1107                      SDL_Surface *dst, SDL_Rect *dstrect)
1108 {
1109     int x, y;
1110     int w = src->w;
1111     Uint8 *srcbuf, *dstbuf;
1112     SDL_PixelFormat *df = dst->format;
1113
1114     /* Lock the destination if necessary */
1115     if ( SDL_MUSTLOCK(dst) ) {
1116         if ( SDL_LockSurface(dst) < 0 ) {
1117             return -1;
1118         }
1119     }
1120
1121     x = dstrect->x;
1122     y = dstrect->y;
1123     dstbuf = (Uint8 *)dst->pixels
1124              + y * dst->pitch + x * df->BytesPerPixel;
1125     srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);
1126
1127     {
1128         /* skip lines at the top if necessary */
1129         int vskip = srcrect->y;
1130         if(vskip) {
1131             int ofs;
1132             if(df->BytesPerPixel == 2) {
1133                 /* the 16/32 interleaved format */
1134                 do {
1135                     /* skip opaque line */
1136                     ofs = 0;
1137                     do {
1138                         int run;
1139                         ofs += srcbuf[0];
1140                         run = srcbuf[1];
1141                         srcbuf += 2;
1142                         if(run) {
1143                             srcbuf += 2 * run;
1144                             ofs += run;
1145                         } else if(!ofs)
1146                             goto done;
1147                     } while(ofs < w);
1148
1149                     /* skip padding */
1150                     srcbuf += (uintptr_t)srcbuf & 2;
1151
1152                     /* skip translucent line */
1153                     ofs = 0;
1154                     do {
1155                         int run;
1156                         ofs += ((Uint16 *)srcbuf)[0];
1157                         run = ((Uint16 *)srcbuf)[1];
1158                         srcbuf += 4 * (run + 1);
1159                         ofs += run;
1160                     } while(ofs < w);
1161                 } while(--vskip);
1162             } else {
1163                 /* the 32/32 interleaved format */
1164                 vskip <<= 1;    /* opaque and translucent have same format */
1165                 do {
1166                     ofs = 0;
1167                     do {
1168                         int run;
1169                         ofs += ((Uint16 *)srcbuf)[0];
1170                         run = ((Uint16 *)srcbuf)[1];
1171                         srcbuf += 4;
1172                         if(run) {
1173                             srcbuf += 4 * run;
1174                             ofs += run;
1175                         } else if(!ofs)
1176                             goto done;
1177                     } while(ofs < w);
1178                 } while(--vskip);
1179             }
1180         }
1181     }
1182
1183     /* if left or right edge clipping needed, call clip blit */
1184     if(srcrect->x || srcrect->w != src->w) {
1185         RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
1186     } else {
1187
1188         /*
1189          * non-clipped blitter. Ptype is the destination pixel type,
1190          * Ctype the translucent count type, and do_blend the
1191          * macro to blend one pixel.
1192          */
1193 #define RLEALPHABLIT(Ptype, Ctype, do_blend)                             \
1194         do {                                                             \
1195             int linecount = srcrect->h;                                  \
1196             do {                                                         \
1197                 int ofs = 0;                                             \
1198                 /* blit opaque pixels on one line */                     \
1199                 do {                                                     \
1200                     unsigned run;                                        \
1201                     ofs += ((Ctype *)srcbuf)[0];                         \
1202                     run = ((Ctype *)srcbuf)[1];                          \
1203                     srcbuf += 2 * sizeof(Ctype);                         \
1204                     if(run) {                                            \
1205                         PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
1206                                    run, sizeof(Ptype));                  \
1207                         srcbuf += run * sizeof(Ptype);                   \
1208                         ofs += run;                                      \
1209                     } else if(!ofs)                                      \
1210                         goto done;                                       \
1211                 } while(ofs < w);                                        \
1212                 /* skip padding if necessary */                          \
1213                 if(sizeof(Ptype) == 2)                                   \
1214                     srcbuf += (uintptr_t)srcbuf & 2;                     \
1215                 /* blit translucent pixels on the same line */           \
1216                 ofs = 0;                                                 \
1217                 do {                                                     \
1218                     unsigned run;                                        \
1219                     ofs += ((Uint16 *)srcbuf)[0];                        \
1220                     run = ((Uint16 *)srcbuf)[1];                         \
1221                     srcbuf += 4;                                         \
1222                     if(run) {                                            \
1223                         Ptype *dst = (Ptype *)dstbuf + ofs;              \
1224                         unsigned i;                                      \
1225                         for(i = 0; i < run; i++) {                       \
1226                             Uint32 src = *(Uint32 *)srcbuf;              \
1227                             do_blend(src, *dst);                         \
1228                             srcbuf += 4;                                 \
1229                             dst++;                                       \
1230                         }                                                \
1231                         ofs += run;                                      \
1232                     }                                                    \
1233                 } while(ofs < w);                                        \
1234                 dstbuf += dst->pitch;                                    \
1235             } while(--linecount);                                        \
1236         } while(0)
1237
1238         switch(df->BytesPerPixel) {
1239         case 2:
1240             if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1241                || df->Bmask == 0x07e0)
1242                 RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
1243             else
1244                 RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
1245             break;
1246         case 4:
1247             RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
1248             break;
1249         }
1250     }
1251
1252  done:
1253     /* Unlock the destination if necessary */
1254     if ( SDL_MUSTLOCK(dst) ) {
1255         SDL_UnlockSurface(dst);
1256     }
1257     return 0;
1258 }
1259
1260 /*
1261  * Auxiliary functions:
1262  * The encoding functions take 32bpp rgb + a, and
1263  * return the number of bytes copied to the destination.
1264  * The decoding functions copy to 32bpp rgb + a, and
1265  * return the number of bytes copied from the source.
1266  * These are only used in the encoder and un-RLE code and are therefore not
1267  * highly optimised.
1268  */
1269
1270 /* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
1271 static int copy_opaque_16(void *dst, Uint32 *src, int n,
1272                           SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1273 {
1274     int i;
1275     Uint16 *d = dst;
1276     for(i = 0; i < n; i++) {
1277         unsigned r, g, b;
1278         RGB_FROM_PIXEL(*src, sfmt, r, g, b);
1279         PIXEL_FROM_RGB(*d, dfmt, r, g, b);
1280         src++;
1281         d++;
1282     }
1283     return n * 2;
1284 }
1285
1286 /* decode opaque pixels from 16bpp to 32bpp rgb + a */
1287 static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
1288                             RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1289 {
1290     int i;
1291     Uint16 *s = src;
1292     unsigned alpha = dfmt->Amask ? 255 : 0;
1293     for(i = 0; i < n; i++) {
1294         unsigned r, g, b;
1295         RGB_FROM_PIXEL(*s, sfmt, r, g, b);
1296         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
1297         s++;
1298         dst++;
1299     }
1300     return n * 2;
1301 }
1302
1303
1304
1305 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
1306 static int copy_transl_565(void *dst, Uint32 *src, int n,
1307                            SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1308 {
1309     int i;
1310     Uint32 *d = dst;
1311     for(i = 0; i < n; i++) {
1312         unsigned r, g, b, a;
1313         Uint16 pix;
1314         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1315         PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1316         *d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
1317         src++;
1318         d++;
1319     }
1320     return n * 4;
1321 }
1322
1323 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
1324 static int copy_transl_555(void *dst, Uint32 *src, int n,
1325                            SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1326 {
1327     int i;
1328     Uint32 *d = dst;
1329     for(i = 0; i < n; i++) {
1330         unsigned r, g, b, a;
1331         Uint16 pix;
1332         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1333         PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1334         *d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
1335         src++;
1336         d++;
1337     }
1338     return n * 4;
1339 }
1340
1341 /* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
1342 static int uncopy_transl_16(Uint32 *dst, void *src, int n,
1343                             RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1344 {
1345     int i;
1346     Uint32 *s = src;
1347     for(i = 0; i < n; i++) {
1348         unsigned r, g, b, a;
1349         Uint32 pix = *s++;
1350         a = (pix & 0x3e0) >> 2;
1351         pix = (pix & ~0x3e0) | pix >> 16;
1352         RGB_FROM_PIXEL(pix, sfmt, r, g, b);
1353         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1354         dst++;
1355     }
1356     return n * 4;
1357 }
1358
1359 /* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1360 static int copy_32(void *dst, Uint32 *src, int n,
1361                    SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1362 {
1363     int i;
1364     Uint32 *d = dst;
1365     for(i = 0; i < n; i++) {
1366         unsigned r, g, b, a;
1367         Uint32 pixel;
1368         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1369         PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
1370         *d++ = pixel | a << 24;
1371         src++;
1372     }
1373     return n * 4;
1374 }
1375
1376 /* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1377 static int uncopy_32(Uint32 *dst, void *src, int n,
1378                      RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1379 {
1380     int i;
1381     Uint32 *s = src;
1382     for(i = 0; i < n; i++) {
1383         unsigned r, g, b, a;
1384         Uint32 pixel = *s++;
1385         RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
1386         a = pixel >> 24;
1387         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1388         dst++;
1389     }
1390     return n * 4;
1391 }
1392
/*
 * Alpha classification of a pixel in format `fmt` (a pointer to a structure
 * with Amask and Ashift members).  Both arguments are parenthesized so any
 * pointer expression may be passed for `fmt`.
 */

/* true when the pixel's alpha component is exactly 255 */
#define ISOPAQUE(pixel, fmt) \
    ((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) == 255)

/* true when the pixel's alpha component lies in 1..254; the unsigned
   subtraction wraps an alpha of 0 around so it fails the comparison */
#define ISTRANSL(pixel, fmt)    \
    ((unsigned)((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) - 1U) < 254U)
1397
1398 /* convert surface to be quickly alpha-blittable onto dest, if possible */
1399 static int RLEAlphaSurface(SDL_Surface *surface)
1400 {
1401     SDL_Surface *dest;
1402     SDL_PixelFormat *df;
1403     int maxsize = 0;
1404     int max_opaque_run;
1405     int max_transl_run = 65535;
1406     unsigned masksum;
1407     Uint8 *rlebuf, *dst;
1408     int (*copy_opaque)(void *, Uint32 *, int,
1409                        SDL_PixelFormat *, SDL_PixelFormat *);
1410     int (*copy_transl)(void *, Uint32 *, int,
1411                        SDL_PixelFormat *, SDL_PixelFormat *);
1412
1413     dest = surface->map->dst;
1414     if(!dest)
1415         return -1;
1416     df = dest->format;
1417     if(surface->format->BitsPerPixel != 32)
1418         return -1;              /* only 32bpp source supported */
1419
1420     /* find out whether the destination is one we support,
1421        and determine the max size of the encoded result */
1422     masksum = df->Rmask | df->Gmask | df->Bmask;
1423     switch(df->BytesPerPixel) {
1424     case 2:
1425         /* 16bpp: only support 565 and 555 formats */
1426         switch(masksum) {
1427         case 0xffff:
1428             if(df->Gmask == 0x07e0
1429                || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
1430                 copy_opaque = copy_opaque_16;
1431                 copy_transl = copy_transl_565;
1432             } else
1433                 return -1;
1434             break;
1435         case 0x7fff:
1436             if(df->Gmask == 0x03e0
1437                || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
1438                 copy_opaque = copy_opaque_16;
1439                 copy_transl = copy_transl_555;
1440             } else
1441                 return -1;
1442             break;
1443         default:
1444             return -1;
1445         }
1446         max_opaque_run = 255;   /* runs stored as bytes */
1447
1448         /* worst case is alternating opaque and translucent pixels,
1449            with room for alignment padding between lines */
1450         maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
1451         break;
1452     case 4:
1453         if(masksum != 0x00ffffff)
1454             return -1;          /* requires unused high byte */
1455         copy_opaque = copy_32;
1456         copy_transl = copy_32;
1457         max_opaque_run = 255;   /* runs stored as short ints */
1458
1459         /* worst case is alternating opaque and translucent pixels */
1460         maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
1461         break;
1462     default:
1463         return -1;              /* anything else unsupported right now */
1464     }
1465
1466     maxsize += sizeof(RLEDestFormat);
1467     rlebuf = (Uint8 *)SDL_malloc(maxsize);
1468     if(!rlebuf) {
1469         SDL_OutOfMemory();
1470         return -1;
1471     }
1472     {
1473         /* save the destination format so we can undo the encoding later */
1474         RLEDestFormat *r = (RLEDestFormat *)rlebuf;
1475         r->BytesPerPixel = df->BytesPerPixel;
1476         r->Rloss = df->Rloss;
1477         r->Gloss = df->Gloss;
1478         r->Bloss = df->Bloss;
1479         r->Rshift = df->Rshift;
1480         r->Gshift = df->Gshift;
1481         r->Bshift = df->Bshift;
1482         r->Ashift = df->Ashift;
1483         r->Rmask = df->Rmask;
1484         r->Gmask = df->Gmask;
1485         r->Bmask = df->Bmask;
1486         r->Amask = df->Amask;
1487     }
1488     dst = rlebuf + sizeof(RLEDestFormat);
1489
1490     /* Do the actual encoding */
1491     {
1492         int x, y;
1493         int h = surface->h, w = surface->w;
1494         SDL_PixelFormat *sf = surface->format;
1495         Uint32 *src = (Uint32 *)surface->pixels;
1496         Uint8 *lastline = dst;  /* end of last non-blank line */
1497
1498         /* opaque counts are 8 or 16 bits, depending on target depth */
1499 #define ADD_OPAQUE_COUNTS(n, m)                 \
1500         if(df->BytesPerPixel == 4) {            \
1501             ((Uint16 *)dst)[0] = n;             \
1502             ((Uint16 *)dst)[1] = m;             \
1503             dst += 4;                           \
1504         } else {                                \
1505             dst[0] = n;                         \
1506             dst[1] = m;                         \
1507             dst += 2;                           \
1508         }
1509
1510         /* translucent counts are always 16 bit */
1511 #define ADD_TRANSL_COUNTS(n, m)         \
1512         (((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)
1513
1514         for(y = 0; y < h; y++) {
1515             int runstart, skipstart;
1516             int blankline = 0;
1517             /* First encode all opaque pixels of a scan line */
1518             x = 0;
1519             do {
1520                 int run, skip, len;
1521                 skipstart = x;
1522                 while(x < w && !ISOPAQUE(src[x], sf))
1523                     x++;
1524                 runstart = x;
1525                 while(x < w && ISOPAQUE(src[x], sf))
1526                     x++;
1527                 skip = runstart - skipstart;
1528                 if(skip == w)
1529                     blankline = 1;
1530                 run = x - runstart;
1531                 while(skip > max_opaque_run) {
1532                     ADD_OPAQUE_COUNTS(max_opaque_run, 0);
1533                     skip -= max_opaque_run;
1534                 }
1535                 len = MIN(run, max_opaque_run);
1536                 ADD_OPAQUE_COUNTS(skip, len);
1537                 dst += copy_opaque(dst, src + runstart, len, sf, df);
1538                 runstart += len;
1539                 run -= len;
1540                 while(run) {
1541                     len = MIN(run, max_opaque_run);
1542                     ADD_OPAQUE_COUNTS(0, len);
1543                     dst += copy_opaque(dst, src + runstart, len, sf, df);
1544                     runstart += len;
1545                     run -= len;
1546                 }
1547             } while(x < w);
1548
1549             /* Make sure the next output address is 32-bit aligned */
1550             dst += (uintptr_t)dst & 2;
1551
1552             /* Next, encode all translucent pixels of the same scan line */
1553             x = 0;
1554             do {
1555                 int run, skip, len;
1556                 skipstart = x;
1557                 while(x < w && !ISTRANSL(src[x], sf))
1558                     x++;
1559                 runstart = x;
1560                 while(x < w && ISTRANSL(src[x], sf))
1561                     x++;
1562                 skip = runstart - skipstart;
1563                 blankline &= (skip == w);
1564                 run = x - runstart;
1565                 while(skip > max_transl_run) {
1566                     ADD_TRANSL_COUNTS(max_transl_run, 0);
1567                     skip -= max_transl_run;
1568                 }
1569                 len = MIN(run, max_transl_run);
1570                 ADD_TRANSL_COUNTS(skip, len);
1571                 dst += copy_transl(dst, src + runstart, len, sf, df);
1572                 runstart += len;
1573                 run -= len;
1574                 while(run) {
1575                     len = MIN(run, max_transl_run);
1576                     ADD_TRANSL_COUNTS(0, len);
1577                     dst += copy_transl(dst, src + runstart, len, sf, df);
1578                     runstart += len;
1579                     run -= len;
1580                 }
1581                 if(!blankline)
1582                     lastline = dst;
1583             } while(x < w);
1584
1585             src += surface->pitch >> 2;
1586         }
1587         dst = lastline;         /* back up past trailing blank lines */
1588         ADD_OPAQUE_COUNTS(0, 0);
1589     }
1590
1591 #undef ADD_OPAQUE_COUNTS
1592 #undef ADD_TRANSL_COUNTS
1593
1594     /* Now that we have it encoded, release the original pixels */
1595     if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1596        && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1597         SDL_free( surface->pixels );
1598         surface->pixels = NULL;
1599     }
1600
1601     /* realloc the buffer to release unused memory */
1602     {
1603         Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
1604         if(!p)
1605             p = rlebuf;
1606         surface->map->sw_data->aux_data = p;
1607     }
1608
1609     return 0;
1610 }
1611
1612 static Uint32 getpix_8(Uint8 *srcbuf)
1613 {
1614     return *srcbuf;
1615 }
1616
1617 static Uint32 getpix_16(Uint8 *srcbuf)
1618 {
1619     return *(Uint16 *)srcbuf;
1620 }
1621
1622 static Uint32 getpix_24(Uint8 *srcbuf)
1623 {
1624 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
1625     return srcbuf[0] + (srcbuf[1] << 8) + (srcbuf[2] << 16);
1626 #else
1627     return (srcbuf[0] << 16) + (srcbuf[1] << 8) + srcbuf[2];
1628 #endif
1629 }
1630
1631 static Uint32 getpix_32(Uint8 *srcbuf)
1632 {
1633     return *(Uint32 *)srcbuf;
1634 }
1635
1636 typedef Uint32 (*getpix_func)(Uint8 *);
1637
1638 static getpix_func getpixes[4] = {
1639     getpix_8, getpix_16, getpix_24, getpix_32
1640 };
1641
1642 static int RLEColorkeySurface(SDL_Surface *surface)
1643 {
1644         Uint8 *rlebuf, *dst;
1645         int maxn;
1646         int y;
1647         Uint8 *srcbuf, *curbuf, *lastline;
1648         int maxsize = 0;
1649         int skip, run;
1650         int bpp = surface->format->BytesPerPixel;
1651         getpix_func getpix;
1652         Uint32 ckey, rgbmask;
1653         int w, h;
1654
1655         /* calculate the worst case size for the compressed surface */
1656         switch(bpp) {
1657         case 1:
1658             /* worst case is alternating opaque and transparent pixels,
1659                starting with an opaque pixel */
1660             maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
1661             break;
1662         case 2:
1663         case 3:
1664             /* worst case is solid runs, at most 255 pixels wide */
1665             maxsize = surface->h * (2 * (surface->w / 255 + 1)
1666                                     + surface->w * bpp) + 2;
1667             break;
1668         case 4:
1669             /* worst case is solid runs, at most 65535 pixels wide */
1670             maxsize = surface->h * (4 * (surface->w / 65535 + 1)
1671                                     + surface->w * 4) + 4;
1672             break;
1673         }
1674
1675         rlebuf = (Uint8 *)SDL_malloc(maxsize);
1676         if ( rlebuf == NULL ) {
1677                 SDL_OutOfMemory();
1678                 return(-1);
1679         }
1680
1681         /* Set up the conversion */
1682         srcbuf = (Uint8 *)surface->pixels;
1683         curbuf = srcbuf;
1684         maxn = bpp == 4 ? 65535 : 255;
1685         skip = run = 0;
1686         dst = rlebuf;
1687         rgbmask = ~surface->format->Amask;
1688         ckey = surface->format->colorkey & rgbmask;
1689         lastline = dst;
1690         getpix = getpixes[bpp - 1];
1691         w = surface->w;
1692         h = surface->h;
1693
1694 #define ADD_COUNTS(n, m)                        \
1695         if(bpp == 4) {                          \
1696             ((Uint16 *)dst)[0] = n;             \
1697             ((Uint16 *)dst)[1] = m;             \
1698             dst += 4;                           \
1699         } else {                                \
1700             dst[0] = n;                         \
1701             dst[1] = m;                         \
1702             dst += 2;                           \
1703         }
1704
1705         for(y = 0; y < h; y++) {
1706             int x = 0;
1707             int blankline = 0;
1708             do {
1709                 int run, skip, len;
1710                 int runstart;
1711                 int skipstart = x;
1712
1713                 /* find run of transparent, then opaque pixels */
1714                 while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) == ckey)
1715                     x++;
1716                 runstart = x;
1717                 while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) != ckey)
1718                     x++;
1719                 skip = runstart - skipstart;
1720                 if(skip == w)
1721                     blankline = 1;
1722                 run = x - runstart;
1723
1724                 /* encode segment */
1725                 while(skip > maxn) {
1726                     ADD_COUNTS(maxn, 0);
1727                     skip -= maxn;
1728                 }
1729                 len = MIN(run, maxn);
1730                 ADD_COUNTS(skip, len);
1731                 SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
1732                 dst += len * bpp;
1733                 run -= len;
1734                 runstart += len;
1735                 while(run) {
1736                     len = MIN(run, maxn);
1737                     ADD_COUNTS(0, len);
1738                     SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
1739                     dst += len * bpp;
1740                     runstart += len;
1741                     run -= len;
1742                 }
1743                 if(!blankline)
1744                     lastline = dst;
1745             } while(x < w);
1746
1747             srcbuf += surface->pitch;
1748         }
1749         dst = lastline;         /* back up bast trailing blank lines */
1750         ADD_COUNTS(0, 0);
1751
1752 #undef ADD_COUNTS
1753
1754         /* Now that we have it encoded, release the original pixels */
1755         if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1756            && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1757             SDL_free( surface->pixels );
1758             surface->pixels = NULL;
1759         }
1760
1761         /* realloc the buffer to release unused memory */
1762         {
1763             /* If realloc returns NULL, the original block is left intact */
1764             Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
1765             if(!p)
1766                 p = rlebuf;
1767             surface->map->sw_data->aux_data = p;
1768         }
1769
1770         return(0);
1771 }
1772
1773 int SDL_RLESurface(SDL_Surface *surface)
1774 {
1775         int retcode;
1776
1777         /* Clear any previous RLE conversion */
1778         if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
1779                 SDL_UnRLESurface(surface, 1);
1780         }
1781
1782         /* We don't support RLE encoding of bitmaps */
1783         if ( surface->format->BitsPerPixel < 8 ) {
1784                 return(-1);
1785         }
1786
1787         /* Lock the surface if it's in hardware */
1788         if ( SDL_MUSTLOCK(surface) ) {
1789                 if ( SDL_LockSurface(surface) < 0 ) {
1790                         return(-1);
1791                 }
1792         }
1793
1794         /* Encode */
1795         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1796             retcode = RLEColorkeySurface(surface);
1797         } else {
1798             if((surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
1799                && surface->format->Amask != 0)
1800                 retcode = RLEAlphaSurface(surface);
1801             else
1802                 retcode = -1;   /* no RLE for per-surface alpha sans ckey */
1803         }
1804
1805         /* Unlock the surface if it's in hardware */
1806         if ( SDL_MUSTLOCK(surface) ) {
1807                 SDL_UnlockSurface(surface);
1808         }
1809
1810         if(retcode < 0)
1811             return -1;
1812
1813         /* The surface is now accelerated */
1814         surface->flags |= SDL_RLEACCEL;
1815
1816         return(0);
1817 }
1818
1819 /*
1820  * Un-RLE a surface with pixel alpha
1821  * This may not give back exactly the image before RLE-encoding; all
1822  * completely transparent pixels will be lost, and colour and alpha depth
1823  * may have been reduced (when encoding for 16bpp targets).
1824  */
1825 static SDL_bool UnRLEAlpha(SDL_Surface *surface)
1826 {
1827     Uint8 *srcbuf;
1828     Uint32 *dst;
1829     SDL_PixelFormat *sf = surface->format;
1830     RLEDestFormat *df = surface->map->sw_data->aux_data;
1831     int (*uncopy_opaque)(Uint32 *, void *, int,
1832                          RLEDestFormat *, SDL_PixelFormat *);
1833     int (*uncopy_transl)(Uint32 *, void *, int,
1834                          RLEDestFormat *, SDL_PixelFormat *);
1835     int w = surface->w;
1836     int bpp = df->BytesPerPixel;
1837
1838     if(bpp == 2) {
1839         uncopy_opaque = uncopy_opaque_16;
1840         uncopy_transl = uncopy_transl_16;
1841     } else {
1842         uncopy_opaque = uncopy_transl = uncopy_32;
1843     }
1844
1845     surface->pixels = SDL_malloc(surface->h * surface->pitch);
1846     if ( !surface->pixels ) {
1847         return(SDL_FALSE);
1848     }
1849     /* fill background with transparent pixels */
1850     SDL_memset(surface->pixels, 0, surface->h * surface->pitch);
1851
1852     dst = surface->pixels;
1853     srcbuf = (Uint8 *)(df + 1);
1854     for(;;) {
1855         /* copy opaque pixels */
1856         int ofs = 0;
1857         do {
1858             unsigned run;
1859             if(bpp == 2) {
1860                 ofs += srcbuf[0];
1861                 run = srcbuf[1];
1862                 srcbuf += 2;
1863             } else {
1864                 ofs += ((Uint16 *)srcbuf)[0];
1865                 run = ((Uint16 *)srcbuf)[1];
1866                 srcbuf += 4;
1867             }
1868             if(run) {
1869                 srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);
1870                 ofs += run;
1871             } else if(!ofs)
1872                 return(SDL_TRUE);
1873         } while(ofs < w);
1874
1875         /* skip padding if needed */
1876         if(bpp == 2)
1877             srcbuf += (uintptr_t)srcbuf & 2;
1878         
1879         /* copy translucent pixels */
1880         ofs = 0;
1881         do {
1882             unsigned run;
1883             ofs += ((Uint16 *)srcbuf)[0];
1884             run = ((Uint16 *)srcbuf)[1];
1885             srcbuf += 4;
1886             if(run) {
1887                 srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);
1888                 ofs += run;
1889             }
1890         } while(ofs < w);
1891         dst += surface->pitch >> 2;
1892     }
1893     /* Make the compiler happy */
1894     return(SDL_TRUE);
1895 }
1896
1897 void SDL_UnRLESurface(SDL_Surface *surface, int recode)
1898 {
1899     if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
1900         surface->flags &= ~SDL_RLEACCEL;
1901
1902         if(recode && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1903            && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1904             if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1905                 SDL_Rect full;
1906                 unsigned alpha_flag;
1907
1908                 /* re-create the original surface */
1909                 surface->pixels = SDL_malloc(surface->h * surface->pitch);
1910                 if ( !surface->pixels ) {
1911                         /* Oh crap... */
1912                         surface->flags |= SDL_RLEACCEL;
1913                         return;
1914                 }
1915
1916                 /* fill it with the background colour */
1917                 SDL_FillRect(surface, NULL, surface->format->colorkey);
1918
1919                 /* now render the encoded surface */
1920                 full.x = full.y = 0;
1921                 full.w = surface->w;
1922                 full.h = surface->h;
1923                 alpha_flag = surface->flags & SDL_SRCALPHA;
1924                 surface->flags &= ~SDL_SRCALPHA; /* opaque blit */
1925                 SDL_RLEBlit(surface, &full, surface, &full);
1926                 surface->flags |= alpha_flag;
1927             } else {
1928                 if ( !UnRLEAlpha(surface) ) {
1929                     /* Oh crap... */
1930                     surface->flags |= SDL_RLEACCEL;
1931                     return;
1932                 }
1933             }
1934         }
1935
1936         if ( surface->map && surface->map->sw_data->aux_data ) {
1937             SDL_free(surface->map->sw_data->aux_data);
1938             surface->map->sw_data->aux_data = NULL;
1939         }
1940     }
1941 }
1942
1943