SDL-1.2.14
[sdl_omap.git] / src / hermes / mmxp2_32.asm
CommitLineData
e14743d1 1;
2; pII-optimised MMX format converters for HERMES
3; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5; This source code is licensed under the GNU LGPL
6;
7; Please refer to the file COPYING.LIB contained in the distribution for
8; licensing conditions
9;
10; COPYRIGHT NOTICE
11;
12; This file partly contains code that is (c) Intel Corporation, specifically
13; the mode detection routine, and the converter to 15 bit (8 pixel
14; conversion routine from the mmx programming tutorial pages).
15;
16;
17; These routines aren't exactly pII optimised - it's just that as they
18; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
19; optimise them for p5 MMXs..
20
21BITS 32
22
23%include "common.inc"
24
25SDL_FUNC _ConvertMMXpII32_24RGB888
26SDL_FUNC _ConvertMMXpII32_16RGB565
27SDL_FUNC _ConvertMMXpII32_16BGR565
28SDL_FUNC _ConvertMMXpII32_16RGB555
29SDL_FUNC _ConvertMMXpII32_16BGR555
30
31;; Macros for conversion routines
32
;; ---------------------------------------------------------------------
;; Immediate-quadword helpers
;;
;; MMX has no immediate operands, so a 64-bit constant is built on the
;; stack by pushing the same 32-bit value twice and using [esp] as the
;; memory operand.
;;
;; IMPORTANT: load_immq and pand_immq each leave 8 bytes on the stack.
;; The caller must release them with CLEANUP_IMMQ_LOADS(n), where n is the
;; number of load_immq/pand_immq expansions since the last cleanup, before
;; returning or touching anything else that lives on the stack.
;; ---------------------------------------------------------------------

; _push_immq_mask imm32
;   Pushes imm32 twice, forming the qword imm32:imm32 at [esp].
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Releases the stack space left behind by `num` immq loads (8 bytes
;   each).  `num` is parenthesised so that an expression argument such as
;   (a+b) is scaled as a whole.  The adjustment is encoded as a signed
;   byte, so 8*num must stay below 128.  Clobbers flags.
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
50
;; Per-dword masks and multipliers.  Each value is replicated into both
;; halves of an MMX register by load_immq / pand_immq.

; keep the low three bytes of a 32-bit pixel, drop the top byte
%define mmx32_rgb888_mask       0x00ffffff

; 5:6:5 field selectors: top 5 bits of byte 0, top 6 bits of byte 1,
; top 5 bits of byte 2
%define mmx32_rgb565_b          0x000000f8
%define mmx32_rgb565_g          0x0000fc00
%define mmx32_rgb565_r          0x00f80000

; 5:5:5 field selectors: top 5 bits of bytes 0 and 2 together, and the
; top 5 bits of byte 1
%define mmx32_rgb555_rb         0x00f800f8
%define mmx32_rgb555_g          0x0000f800

; pmaddwd word multipliers (high word : low word) applied to the
; mmx32_rgb555_rb-masked pixel; the two variants swap which of the two
; 5-bit fields ends up above the other
%define mmx32_rgb555_mul        0x20000008
%define mmx32_bgr555_mul        0x00082000
60
61SECTION .text
62
;; ---------------------------------------------------------------------
;; _ConvertMMXpII32_24RGB888
;;
;; Packs 4-byte source pixels into 3-byte destination pixels by dropping
;; the top byte of every source dword.
;;
;; In:    esi = source pointer
;;        edi = destination pointer
;;        ecx = number of pixels
;; Out:   esi / edi advanced past the pixels that were converted
;; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
;; Note:  no emms is executed here.
;;
;; Four pixels (16 source bytes -> 12 destination bytes) are handled per
;; MMX iteration; the remaining 0-3 pixels are copied byte by byte.
;; ---------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        ; set up mm6 as the mask, mm7 as zero
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7

        mov     edx, ecx                ; save ecx
        and     ecx, 0fffffffch         ; clear lower two bits
        jnz     .L1
        jmp     .L2                     ; fewer than four pixels: tail only

.L1:

        movq    mm0, [esi]              ; A R G B a r g b
        pand    mm0, mm6                ; 0 R G B 0 r g b
        movq    mm1, [esi+8]            ; A R G B a r g b
        pand    mm1, mm6                ; 0 R G B 0 r g b

        movq    mm2, mm0                ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
        por     mm0, mm2                ; 0 0 R G B r g b

        movq    mm3, mm1                ; 0 R G B 0 r g b
        psllq   mm3, 48                 ; g b 0 0 0 0 0 0
        por     mm0, mm3                ; g b R G B r g b

        movq    mm4, mm1                ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r
        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
        por     mm1, mm4                ; 0 0 0 0 R G B r

        movq    [edi], mm0              ; destination bytes 0-7
        add     esi, BYTE 16
        movd    [edi+8], mm1            ; destination bytes 8-11
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .L1

.L2:
        mov     ecx, edx                ; ecx = original count mod 4
        and     ecx, BYTE 3
        jz      .L4
.L3:
        mov     al, [esi]               ; copy the three low bytes,
        mov     bl, [esi+1]             ; skip the fourth
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .L3
.L4:
        ; This used to read `return`, which is not an instruction: unless a
        ; macro of that name exists it assembles as a bare label and control
        ; runs on into whatever code follows.  Return explicitly.
        retn
123
124
125
;; ---------------------------------------------------------------------
;; _ConvertMMXpII32_16RGB565
;;
;; Converts 4-byte source pixels to 16-bit 5:6:5 pixels: byte 2 of the
;; source supplies the top 5 bits of the result, byte 1 the middle 6 bits
;; and byte 0 the low 5 bits.
;;
;; In:    esi = source pointer
;;        edi = destination pointer
;;        ecx = number of pixels
;; Out:   esi / edi advanced past the pixels that were converted
;; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;; Note:  no emms is executed here.
;; ---------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; mm5 / mm6 / mm7 = byte-0 / byte-1 / byte-2 field masks
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = original pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jnz     .quads
        jmp     .leftover               ; fewer than four pixels

.quads:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 0 0 b
        pand    mm3, mm7                ; 0 r 0 0   (kept for the pack below)
        pslld   mm1, 2                  ; 000000bb bbb00000
        por     mm0, mm1                ; ggggggbb bbb00000
        psrld   mm0, 5                  ; 00000ggg gggbbbbb

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 0 0 b
        pand    mm1, mm7                ; 0 r 0 0
        pslld   mm2, 2                  ; 000000bb bbb00000
        por     mm4, mm2                ; ggggggbb bbb00000
        psrld   mm4, 5                  ; 00000ggg gggbbbbb

        ; --- merge into four 16-bit pixels ---
        packuswb mm3, mm1               ; each dword 00rr0000 -> word rr00
        packssdw mm0, mm4               ; each dword -> word (green+blue bits)
        por     mm0, mm3                ; rrrrrggg gggbbbbb x 4
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quads

.leftover:
        mov     ecx, edx
        and     ecx, BYTE 3             ; ecx = original count mod 4
        jz      .done
.single:
        mov     al, [esi]               ; al = byte 0
        mov     bh, [esi+1]             ; bh = byte 1
        mov     ah, [esi+2]             ; ah = byte 2
        shr     al, 3                   ; byte 0 -> low 5 bits
        and     eax, 0F81Fh             ; keep top 5 bits of ah, low 5 of al
        shr     ebx, 5                  ; byte 1 -> bits 3..10
        and     ebx, 07E0h              ; keep its top 6 bits at bits 5..10
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        retn
192
193
;; ---------------------------------------------------------------------
;; _ConvertMMXpII32_16BGR565
;;
;; Converts 4-byte source pixels to 16-bit 5:6:5 pixels with the outer
;; fields exchanged relative to the RGB565 routine: byte 0 of the source
;; supplies the top 5 bits of the result, byte 1 the middle 6 bits and
;; byte 2 the low 5 bits.
;;
;; In:    esi = source pointer
;;        edi = destination pointer
;;        ecx = number of pixels
;; Out:   esi / edi advanced past the pixels that were converted
;; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;; Note:  no emms is executed here.
;; ---------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; mm5 / mm6 / mm7 = byte-2 / byte-1 / byte-0 field masks
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = original pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jnz     .quads
        jmp     .leftover               ; fewer than four pixels

.quads:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0   (kept for the pack below)
        psrld   mm1, 14                 ; 000000rr rrr00000
        por     mm0, mm1                ; ggggggrr rrr00000
        psrld   mm0, 5                  ; 00000ggg gggrrrrr

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 000000rr rrr00000
        por     mm4, mm2                ; ggggggrr rrr00000
        psrld   mm4, 5                  ; 00000ggg gggrrrrr

        ; --- merge into four 16-bit pixels ---
        packuswb mm3, mm1               ; bbbbb000 00000000 per word
        packssdw mm0, mm4               ; 00000ggg gggrrrrr per word
        por     mm0, mm3                ; bbbbbggg gggrrrrr x 4
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quads

.leftover:
        and     edx, BYTE 3             ; edx = original count mod 4
        jz      .done
.single:
        mov     al, [esi+2]             ; al = byte 2
        mov     bh, [esi+1]             ; bh = byte 1
        mov     ah, [esi]               ; ah = byte 0
        shr     al, 3                   ; byte 2 -> low 5 bits
        and     eax, 0F81Fh             ; keep top 5 bits of ah, low 5 of al
        shr     ebx, 5                  ; byte 1 -> bits 3..10
        and     ebx, 07E0h              ; keep its top 6 bits at bits 5..10
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        retn
262
;; ---------------------------------------------------------------------
;; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;;
;; Convert 4-byte source pixels to 16-bit 5:5:5 pixels.  Both entry points
;; share one body; they differ only in the pmaddwd multiplier placed in
;; mm7, which decides whether the byte-0 field or the byte-2 field of the
;; source lands in the low 5 bits of the result.
;;
;; In:    esi = source pointer
;;        edi = destination pointer
;;        ecx = number of pixels
;; Out:   esi / edi advanced past the pixels that were converted
;; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;; Note:  no emms is executed here.
;;
;; How the MMX path works (per dword): the pixel is masked with
;; mmx32_rgb555_rb, leaving a 5-bit field in each 16-bit word; pmaddwd
;; multiplies each word by its own factor and adds the two products, which
;; shifts both fields to their final position +6.  The byte-1 field is
;; OR-ed in from the mmx32_rgb555_g-masked copy and the whole dword is
;; shifted right by 6.
;; ---------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g
        CLEANUP_IMMQ_LOADS(2)           ; releases the mm7 and mm6 loads

        mov     edx, ecx                ; Save ecx

        and     ecx, DWORD 0fffffff8h   ; clear lower three bits
        jnz     .L_OK
        jmp     near .L2                ; fewer than eight pixels: tail only

.L_OK:
        ; Prologue of the software-pipelined loop: start work on the first
        ; four pixels so that .L1 can finish them while starting the next.

        movq    mm2, [esi+8]

        movq    mm0, [esi]
        movq    mm3, mm2

        pand_immq mm3, mmx32_rgb555_rb
        movq    mm1, mm0

        pand_immq mm1, mmx32_rgb555_rb
        pmaddwd mm3, mm7

        CLEANUP_IMMQ_LOADS(2)

        pmaddwd mm1, mm7
        pand    mm2, mm6

.L1:
        ; Each pass stores eight converted pixels (16 bytes) and pre-loads
        ; the first four pixels of the next group.  The instruction order
        ; is deliberate; do not reorder.
        ;
        ; NOTE(review): the pre-load at [esi] / [esi+8] after `add esi, 32`
        ; also happens on the final pass, i.e. 16 bytes beyond the last
        ; group of eight are read (never written).  When the pixel count
        ; is a multiple of 8 those bytes lie past the converted range -
        ; confirm that source buffers tolerate this.
        movq    mm4, [esi+24]
        pand    mm0, mm6

        movq    mm5, [esi+16]
        por     mm3, mm2

        psrld   mm3, 6
        por     mm1, mm0

        movq    mm0, mm4
        psrld   mm1, 6

        pand_immq mm0, mmx32_rgb555_rb
        packssdw mm1, mm3

        movq    mm3, mm5
        pmaddwd mm0, mm7

        pand_immq mm3, mmx32_rgb555_rb
        pand    mm4, mm6

        movq    [edi], mm1              ; pixels 0-3 of this group
        pmaddwd mm3, mm7

        add     esi, BYTE 32
        por     mm4, mm0

        pand    mm5, mm6
        psrld   mm4, 6

        movq    mm2, [esi+8]            ; pre-load next group
        por     mm5, mm3

        movq    mm0, [esi]
        psrld   mm5, 6

        movq    mm3, mm2
        movq    mm1, mm0

        pand_immq mm3, mmx32_rgb555_rb
        packssdw mm5, mm4

        pand_immq mm1, mmx32_rgb555_rb
        pand    mm2, mm6

        CLEANUP_IMMQ_LOADS(4)           ; four pand_immq since .L1

        movq    [edi+8], mm5            ; pixels 4-7 of this group
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        add     edi, BYTE 16

        sub     ecx, BYTE 8
        jz      .L2
        jmp     .L1


.L2:
        mov     ecx, edx

        and     ecx, BYTE 7             ; ecx = original count mod 8
        jz      .L4

        ; The scalar tail has to produce the same field order as the MMX
        ; path.  mm7 still holds the multiplier chosen at entry (the loop
        ; only reads it), so it tells us which entry point was used.
        ; Previously the RGB tail below was used for both entry points,
        ; which exchanged the outer fields of the trailing pixels for the
        ; BGR555 converter.
        movd    eax, mm7
        cmp     eax, mmx32_bgr555_mul
        je      .L3_bgr

.L3:
        ; RGB555 tail: byte 0 -> bits 0-4, byte 1 -> bits 5-9,
        ; byte 2 -> bits 10-14
        mov     ebx, [esi]
        add     esi, BYTE 4

        mov     eax, ebx
        mov     edx, ebx

        shr     eax, 3
        shr     edx, 6

        and     eax, BYTE 0000000000011111b
        and     edx, 0000001111100000b

        shr     ebx, 9

        or      eax, edx

        and     ebx, 0111110000000000b

        or      eax, ebx

        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .L3

.L4:
        retn

.L3_bgr:
        ; BGR555 tail: byte 2 -> bits 0-4, byte 1 -> bits 5-9,
        ; byte 0 -> bits 10-14
        mov     ebx, [esi]
        add     esi, BYTE 4

        mov     eax, ebx
        mov     edx, ebx

        shr     eax, 19                 ; top 5 bits of byte 2 -> bits 0-4
        shr     edx, 6                  ; top 5 bits of byte 1 -> bits 5-9

        and     eax, BYTE 0000000000011111b
        and     edx, 0000001111100000b

        shl     ebx, 7                  ; top 5 bits of byte 0 -> bits 10-14

        or      eax, edx

        and     ebx, 0111110000000000b

        or      eax, ebx

        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .L3_bgr

        retn
402
; Emit an empty .note.GNU-stack section (noalloc, noexec, nowrite) when
; assembling to 32-bit ELF.  %ifidn compares the output format name
; textually, so both spellings of the 32-bit ELF format are covered;
; at most one of the two conditions can hold.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif