;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
; optimise them for p5 MMXs..

BITS 32

%include "common.inc"

SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555
31 | ;; Macros for conversion routines |
32 | |
33 | %macro _push_immq_mask 1 |
34 | push dword %1 |
35 | push dword %1 |
36 | %endmacro |
37 | |
38 | %macro load_immq 2 |
39 | _push_immq_mask %2 |
40 | movq %1, [esp] |
41 | %endmacro |
42 | |
43 | %macro pand_immq 2 |
44 | _push_immq_mask %2 |
45 | pand %1, [esp] |
46 | %endmacro |
47 | |
48 | %define CLEANUP_IMMQ_LOADS(num) \ |
49 | add esp, byte 8 * num |
50 | |
51 | %define mmx32_rgb888_mask 00ffffffh |
52 | %define mmx32_rgb565_b 000000f8h |
53 | %define mmx32_rgb565_g 0000fc00h |
54 | %define mmx32_rgb565_r 00f80000h |
55 | |
56 | %define mmx32_rgb555_rb 00f800f8h |
57 | %define mmx32_rgb555_g 0000f800h |
58 | %define mmx32_rgb555_mul 20000008h |
59 | %define mmx32_bgr555_mul 00082000h |
60 | |
61 | SECTION .text |
62 | |
63 | _ConvertMMXpII32_24RGB888: |
64 | |
65 | ; set up mm6 as the mask, mm7 as zero |
66 | load_immq mm6, mmx32_rgb888_mask |
67 | CLEANUP_IMMQ_LOADS(1) |
68 | pxor mm7, mm7 |
69 | |
70 | mov edx, ecx ; save ecx |
71 | and ecx, 0fffffffch ; clear lower two bits |
72 | jnz .L1 |
73 | jmp .L2 |
74 | |
75 | .L1: |
76 | |
77 | movq mm0, [esi] ; A R G B a r g b |
78 | pand mm0, mm6 ; 0 R G B 0 r g b |
79 | movq mm1, [esi+8] ; A R G B a r g b |
80 | pand mm1, mm6 ; 0 R G B 0 r g b |
81 | |
82 | movq mm2, mm0 ; 0 R G B 0 r g b |
83 | punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B |
84 | punpckldq mm0, mm7 ; 0 0 0 0 0 r g b |
85 | psllq mm2, 24 ; 0 0 R G B 0 0 0 |
86 | por mm0, mm2 ; 0 0 R G B r g b |
87 | |
88 | movq mm3, mm1 ; 0 R G B 0 r g b |
89 | psllq mm3, 48 ; g b 0 0 0 0 0 0 |
90 | por mm0, mm3 ; g b R G B r g b |
91 | |
92 | movq mm4, mm1 ; 0 R G B 0 r g b |
93 | punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B |
94 | punpckldq mm1, mm7 ; 0 0 0 0 0 r g b |
95 | psrlq mm1, 16 ; 0 0 0 R G B 0 r |
96 | psllq mm4, 8 ; 0 0 0 0 R G B 0 |
97 | por mm1, mm4 ; 0 0 0 0 R G B r |
98 | |
99 | movq [edi], mm0 |
100 | add esi, BYTE 16 |
101 | movd [edi+8], mm1 |
102 | add edi, BYTE 12 |
103 | sub ecx, BYTE 4 |
104 | jnz .L1 |
105 | |
106 | .L2: |
107 | mov ecx, edx |
108 | and ecx, BYTE 3 |
109 | jz .L4 |
110 | .L3: |
111 | mov al, [esi] |
112 | mov bl, [esi+1] |
113 | mov dl, [esi+2] |
114 | mov [edi], al |
115 | mov [edi+1], bl |
116 | mov [edi+2], dl |
117 | add esi, BYTE 4 |
118 | add edi, BYTE 3 |
119 | dec ecx |
120 | jnz .L3 |
121 | .L4: |
122 | return |
123 | |
124 | |
125 | |
126 | _ConvertMMXpII32_16RGB565: |
127 | |
128 | ; set up masks |
129 | load_immq mm5, mmx32_rgb565_b |
130 | load_immq mm6, mmx32_rgb565_g |
131 | load_immq mm7, mmx32_rgb565_r |
132 | CLEANUP_IMMQ_LOADS(3) |
133 | |
134 | mov edx, ecx |
135 | shr ecx, 2 |
136 | jnz .L1 |
137 | jmp .L2 ; not necessary at the moment, but doesn't hurt (much) |
138 | |
139 | .L1: |
140 | movq mm0, [esi] ; argb |
141 | movq mm1, mm0 ; argb |
142 | pand mm0, mm6 ; 00g0 |
143 | movq mm3, mm1 ; argb |
144 | pand mm1, mm5 ; 000b |
145 | pand mm3, mm7 ; 0r00 |
146 | pslld mm1, 2 ; 0 0 000000bb bbb00000 |
147 | por mm0, mm1 ; 0 0 ggggggbb bbb00000 |
148 | psrld mm0, 5 ; 0 0 00000ggg gggbbbbb |
149 | |
150 | movq mm4, [esi+8] ; argb |
151 | movq mm2, mm4 ; argb |
152 | pand mm4, mm6 ; 00g0 |
153 | movq mm1, mm2 ; argb |
154 | pand mm2, mm5 ; 000b |
155 | pand mm1, mm7 ; 0r00 |
156 | pslld mm2, 2 ; 0 0 000000bb bbb00000 |
157 | por mm4, mm2 ; 0 0 ggggggbb bbb00000 |
158 | psrld mm4, 5 ; 0 0 00000ggg gggbbbbb |
159 | |
160 | packuswb mm3, mm1 ; R 0 r 0 |
161 | packssdw mm0, mm4 ; as above.. ish |
162 | por mm0, mm3 ; done. |
163 | movq [edi], mm0 |
164 | |
165 | add esi, 16 |
166 | add edi, 8 |
167 | dec ecx |
168 | jnz .L1 |
169 | |
170 | .L2: |
171 | mov ecx, edx |
172 | and ecx, BYTE 3 |
173 | jz .L4 |
174 | .L3: |
175 | mov al, [esi] |
176 | mov bh, [esi+1] |
177 | mov ah, [esi+2] |
178 | shr al, 3 |
179 | and eax, 0F81Fh ; BYTE? |
180 | shr ebx, 5 |
181 | and ebx, 07E0h ; BYTE? |
182 | add eax, ebx |
183 | mov [edi], al |
184 | mov [edi+1], ah |
185 | add esi, BYTE 4 |
186 | add edi, BYTE 2 |
187 | dec ecx |
188 | jnz .L3 |
189 | |
190 | .L4: |
191 | retn |
192 | |
193 | |
194 | _ConvertMMXpII32_16BGR565: |
195 | |
196 | load_immq mm5, mmx32_rgb565_r |
197 | load_immq mm6, mmx32_rgb565_g |
198 | load_immq mm7, mmx32_rgb565_b |
199 | CLEANUP_IMMQ_LOADS(3) |
200 | |
201 | mov edx, ecx |
202 | shr ecx, 2 |
203 | jnz .L1 |
204 | jmp .L2 |
205 | |
206 | .L1: |
207 | movq mm0, [esi] ; a r g b |
208 | movq mm1, mm0 ; a r g b |
209 | pand mm0, mm6 ; 0 0 g 0 |
210 | movq mm3, mm1 ; a r g b |
211 | pand mm1, mm5 ; 0 r 0 0 |
212 | pand mm3, mm7 ; 0 0 0 b |
213 | |
214 | psllq mm3, 16 ; 0 b 0 0 |
215 | psrld mm1, 14 ; 0 0 000000rr rrr00000 |
216 | por mm0, mm1 ; 0 0 ggggggrr rrr00000 |
217 | psrld mm0, 5 ; 0 0 00000ggg gggrrrrr |
218 | |
219 | movq mm4, [esi+8] ; a r g b |
220 | movq mm2, mm4 ; a r g b |
221 | pand mm4, mm6 ; 0 0 g 0 |
222 | movq mm1, mm2 ; a r g b |
223 | pand mm2, mm5 ; 0 r 0 0 |
224 | pand mm1, mm7 ; 0 0 0 b |
225 | |
226 | psllq mm1, 16 ; 0 b 0 0 |
227 | psrld mm2, 14 ; 0 0 000000rr rrr00000 |
228 | por mm4, mm2 ; 0 0 ggggggrr rrr00000 |
229 | psrld mm4, 5 ; 0 0 00000ggg gggrrrrr |
230 | |
231 | packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000 |
232 | packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR |
233 | por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr |
234 | movq [edi], mm0 |
235 | |
236 | add esi, BYTE 16 |
237 | add edi, BYTE 8 |
238 | dec ecx |
239 | jnz .L1 |
240 | |
241 | .L2: |
242 | and edx, BYTE 3 |
243 | jz .L4 |
244 | .L3: |
245 | mov al, [esi+2] |
246 | mov bh, [esi+1] |
247 | mov ah, [esi] |
248 | shr al, 3 |
249 | and eax, 0F81Fh ; BYTE ? |
250 | shr ebx, 5 |
251 | and ebx, 07E0h ; BYTE ? |
252 | add eax, ebx |
253 | mov [edi], al |
254 | mov [edi+1], ah |
255 | add esi, BYTE 4 |
256 | add edi, BYTE 2 |
257 | dec edx |
258 | jnz .L3 |
259 | |
260 | .L4: |
261 | retn |
262 | |
263 | _ConvertMMXpII32_16BGR555: |
264 | |
265 | ; the 16BGR555 converter is identical to the RGB555 one, |
266 | ; except it uses a different multiplier for the pmaddwd |
267 | ; instruction. cool huh. |
268 | |
269 | load_immq mm7, mmx32_bgr555_mul |
270 | jmp _convert_bgr555_cheat |
271 | |
272 | ; This is the same as the Intel version.. they obviously went to |
273 | ; much more trouble to expand/coil the loop than I did, so theirs |
274 | ; would almost certainly be faster, even if only a little. |
275 | ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is |
276 | ; (I think) a more accurate name.. |
277 | _ConvertMMXpII32_16RGB555: |
278 | |
279 | load_immq mm7, mmx32_rgb555_mul |
280 | _convert_bgr555_cheat: |
281 | load_immq mm6, mmx32_rgb555_g |
282 | CLEANUP_IMMQ_LOADS(2) |
283 | |
284 | mov edx,ecx ; Save ecx |
285 | |
286 | and ecx,DWORD 0fffffff8h ; clear lower three bits |
287 | jnz .L_OK |
288 | jmp near .L2 |
289 | |
290 | .L_OK: |
291 | |
292 | movq mm2,[esi+8] |
293 | |
294 | movq mm0,[esi] |
295 | movq mm3,mm2 |
296 | |
297 | pand_immq mm3, mmx32_rgb555_rb |
298 | movq mm1,mm0 |
299 | |
300 | pand_immq mm1, mmx32_rgb555_rb |
301 | pmaddwd mm3,mm7 |
302 | |
303 | CLEANUP_IMMQ_LOADS(2) |
304 | |
305 | pmaddwd mm1,mm7 |
306 | pand mm2,mm6 |
307 | |
308 | .L1: |
309 | movq mm4,[esi+24] |
310 | pand mm0,mm6 |
311 | |
312 | movq mm5,[esi+16] |
313 | por mm3,mm2 |
314 | |
315 | psrld mm3,6 |
316 | por mm1,mm0 |
317 | |
318 | movq mm0,mm4 |
319 | psrld mm1,6 |
320 | |
321 | pand_immq mm0, mmx32_rgb555_rb |
322 | packssdw mm1,mm3 |
323 | |
324 | movq mm3,mm5 |
325 | pmaddwd mm0,mm7 |
326 | |
327 | pand_immq mm3, mmx32_rgb555_rb |
328 | pand mm4,mm6 |
329 | |
330 | movq [edi],mm1 |
331 | pmaddwd mm3,mm7 |
332 | |
333 | add esi,BYTE 32 |
334 | por mm4,mm0 |
335 | |
336 | pand mm5,mm6 |
337 | psrld mm4,6 |
338 | |
339 | movq mm2,[esi+8] |
340 | por mm5,mm3 |
341 | |
342 | movq mm0,[esi] |
343 | psrld mm5,6 |
344 | |
345 | movq mm3,mm2 |
346 | movq mm1,mm0 |
347 | |
348 | pand_immq mm3, mmx32_rgb555_rb |
349 | packssdw mm5,mm4 |
350 | |
351 | pand_immq mm1, mmx32_rgb555_rb |
352 | pand mm2,mm6 |
353 | |
354 | CLEANUP_IMMQ_LOADS(4) |
355 | |
356 | movq [edi+8],mm5 |
357 | pmaddwd mm3,mm7 |
358 | |
359 | pmaddwd mm1,mm7 |
360 | add edi,BYTE 16 |
361 | |
362 | sub ecx,BYTE 8 |
363 | jz .L2 |
364 | jmp .L1 |
365 | |
366 | |
367 | .L2: |
368 | mov ecx,edx |
369 | |
370 | and ecx,BYTE 7 |
371 | jz .L4 |
372 | |
373 | .L3: |
374 | mov ebx,[esi] |
375 | add esi,BYTE 4 |
376 | |
377 | mov eax,ebx |
378 | mov edx,ebx |
379 | |
380 | shr eax,3 |
381 | shr edx,6 |
382 | |
383 | and eax,BYTE 0000000000011111b |
384 | and edx, 0000001111100000b |
385 | |
386 | shr ebx,9 |
387 | |
388 | or eax,edx |
389 | |
390 | and ebx, 0111110000000000b |
391 | |
392 | or eax,ebx |
393 | |
394 | mov [edi],ax |
395 | add edi,BYTE 2 |
396 | |
397 | dec ecx |
398 | jnz .L3 |
399 | |
400 | .L4: |
401 | retn |
402 | |
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif