2 ; pII-optimised MMX format converters for HERMES
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5 ; This source code is licensed under the GNU LGPL
7 ; Please refer to the file COPYING.LIB contained in the distribution for
12 ; This file partly contains code that is (c) Intel Corporation, specifically
13 ; the mode detection routine, and the converter to 15 bit (8 pixel
14 ; conversion routine from the mmx programming tutorial pages).
17 ; These routines aren't exactly pII optimised - it's just that as they
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
19 ; optimise them for p5 MMXs..
; Conversion entry points implemented in this file, one per label below.
; NOTE(review): SDL_FUNC is defined elsewhere; presumably it declares each
; label as a global symbol - confirm against its definition.
25 SDL_FUNC _ConvertMMXpII32_24RGB888
26 SDL_FUNC _ConvertMMXpII32_16RGB565
27 SDL_FUNC _ConvertMMXpII32_16BGR565
28 SDL_FUNC _ConvertMMXpII32_16RGB555
29 SDL_FUNC _ConvertMMXpII32_16BGR555
31 ;; Macros for conversion routines
33 %macro _push_immq_mask 1
48 %define CLEANUP_IMMQ_LOADS(num) \
; 32-bit constants used by the converters below. Source pixels are laid out
; A R G B from the most significant byte to the least significant byte.

; rgb888: keeps the low 24 bits (R, G, B) and clears the top (alpha) byte.
51 %define mmx32_rgb888_mask 00ffffffh
; rgb565 field masks: top 5 bits of the B byte, top 6 bits of the G byte,
; top 5 bits of the R byte.
52 %define mmx32_rgb565_b 000000f8h
53 %define mmx32_rgb565_g 0000fc00h
54 %define mmx32_rgb565_r 00f80000h

; rgb555 field masks: top 5 bits of the R and B bytes in a single mask,
; and top 5 bits of the G byte.
56 %define mmx32_rgb555_rb 00f800f8h
57 %define mmx32_rgb555_g 0000f800h
; Multiplier pairs loaded into mm7 by the 555 converters for pmaddwd.
; Each holds two 16-bit words: high word 2000h / low word 0008h for RGB555,
; and the same two words swapped for BGR555.
58 %define mmx32_rgb555_mul 20000008h
59 %define mmx32_bgr555_mul 00082000h
; _ConvertMMXpII32_24RGB888
; Packs 32-bit A R G B pixels read from [esi] into 24-bit R G B triples.
; The alpha byte of every pixel is masked off with mm6; the remaining three
; bytes of consecutive pixels are then shifted and OR-ed together.
;
; Byte diagrams in the comments run from the most significant byte (left)
; to the least significant byte (right). Upper case marks the pixel held in
; the high dword of a register, lower case the pixel in the low dword.
; NOTE(review): ecx is presumably the pixel count - confirm against caller.
63 _ConvertMMXpII32_24RGB888:
65 ; set up mm6 as the mask, mm7 as zero
66 load_immq mm6, mmx32_rgb888_mask
70 mov edx, ecx ; save ecx (edx keeps the unmodified value)
71 and ecx, 0fffffffch ; clear lower two bits (round down to a multiple of 4)
; Load four pixels (two per qword) and strip their alpha bytes.
77 movq mm0, [esi] ; A R G B a r g b
78 pand mm0, mm6 ; 0 R G B 0 r g b
79 movq mm1, [esi+8] ; A R G B a r g b
80 pand mm1, mm6 ; 0 R G B 0 r g b
; Close the gap between the first two pixels: six contiguous bytes in mm0.
82 movq mm2, mm0 ; 0 R G B 0 r g b
83 punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
84 punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
85 psllq mm2, 24 ; 0 0 R G B 0 0 0
86 por mm0, mm2 ; 0 0 R G B r g b
; Fill the top two bytes of mm0 with the g and b bytes of the third pixel.
88 movq mm3, mm1 ; 0 R G B 0 r g b
89 psllq mm3, 48 ; g b 0 0 0 0 0 0
90 por mm0, mm3 ; g b R G B r g b
; Remaining four bytes: the third pixel's r byte followed by the fourth
; pixel's R G B.
92 movq mm4, mm1 ; 0 R G B 0 r g b
93 punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
94 punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
95 psrlq mm1, 16 ; 0 0 0 0 0 0 0 r
96 psllq mm4, 8 ; 0 0 0 0 R G B 0
97 por mm1, mm4 ; 0 0 0 0 R G B r
; _ConvertMMXpII32_16RGB565
; Converts 32-bit A R G B pixels read from [esi] to 16-bit 5-6-5 pixels.
; mm5, mm6 and mm7 are loaded with the blue, green and red field masks.
; Each movq below fetches two source pixels.
; NOTE(review): the argument of CLEANUP_IMMQ_LOADS appears to match the
; number of load_immq uses that precede it - confirm against the macro.
126 _ConvertMMXpII32_16RGB565:
129 load_immq mm5, mmx32_rgb565_b
130 load_immq mm6, mmx32_rgb565_g
131 load_immq mm7, mmx32_rgb565_r
132 CLEANUP_IMMQ_LOADS(3)
137 jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
; First pixel pair: merge the masked green and blue fields, then shift them
; down into the low 11 bits of each dword.
140 movq mm0, [esi] ; argb
146 pslld mm1, 2 ; 0 0 000000bb bbb00000
147 por mm0, mm1 ; 0 0 ggggggbb bbb00000
148 psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
; Second pixel pair, same steps using mm4/mm2.
150 movq mm4, [esi+8] ; argb
156 pslld mm2, 2 ; 0 0 000000bb bbb00000
157 por mm4, mm2 ; 0 0 ggggggbb bbb00000
158 psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
160 packuswb mm3, mm1 ; R 0 r 0
161 packssdw mm0, mm4 ; four words of 00000ggg gggbbbbb (mm0 pair low, mm4 pair high)
179 and eax, 0F81Fh ; keeps bits 15-11 and 4-0. BYTE? - no, the mask does not fit a sign-extended 8-bit immediate
181 and ebx, 07E0h ; keeps bits 10-5. BYTE? - no, the mask does not fit a sign-extended 8-bit immediate
; _ConvertMMXpII32_16BGR565
; Converts 32-bit A R G B pixels read from [esi] to 16-bit 5-6-5 pixels with
; blue in the top field and red in the bottom field.
; mm5 holds the red mask, mm6 the green mask and mm7 the blue mask; the
; red and blue masks trade registers relative to _ConvertMMXpII32_16RGB565.
; Each movq below fetches two source pixels.
194 _ConvertMMXpII32_16BGR565:
196 load_immq mm5, mmx32_rgb565_r
197 load_immq mm6, mmx32_rgb565_g
198 load_immq mm7, mmx32_rgb565_b
199 CLEANUP_IMMQ_LOADS(3)
; First pixel pair: split into green (mm0), red (mm1) and blue (mm3).
207 movq mm0, [esi] ; a r g b
208 movq mm1, mm0 ; a r g b
209 pand mm0, mm6 ; 0 0 g 0
210 movq mm3, mm1 ; a r g b
211 pand mm1, mm5 ; 0 r 0 0
212 pand mm3, mm7 ; 0 0 0 b
; Blue moves up to byte 2 of each dword; red drops to bits 9-5 so that
; green+red can be shifted down together into the low 11 bits.
214 psllq mm3, 16 ; 0 b 0 0
215 psrld mm1, 14 ; 0 0 000000rr rrr00000
216 por mm0, mm1 ; 0 0 ggggggrr rrr00000
217 psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
; Second pixel pair: green (mm4), red (mm2) and blue (mm1), same steps.
219 movq mm4, [esi+8] ; a r g b
220 movq mm2, mm4 ; a r g b
221 pand mm4, mm6 ; 0 0 g 0
222 movq mm1, mm2 ; a r g b
223 pand mm2, mm5 ; 0 r 0 0
224 pand mm1, mm7 ; 0 0 0 b
226 psllq mm1, 16 ; 0 b 0 0
227 psrld mm2, 14 ; 0 0 000000rr rrr00000
228 por mm4, mm2 ; 0 0 ggggggrr rrr00000
229 psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
; Pack all four pixels into 16-bit words and merge blue with green+red.
; Each diagram below shows two of the four resulting words.
231 packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
232 packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
233 por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
249 and eax, 0F81Fh ; keeps bits 15-11 and 4-0. BYTE ? - no, the mask does not fit a sign-extended 8-bit immediate
251 and ebx, 07E0h ; keeps bits 10-5. BYTE ? - no, the mask does not fit a sign-extended 8-bit immediate
; _ConvertMMXpII32_16BGR555
; Entry point for the 32-bit to 16-bit BGR 5-5-5 conversion. It only loads
; the BGR multiplier into mm7 and then jumps into the body shared with
; _ConvertMMXpII32_16RGB555 at _convert_bgr555_cheat.
263 _ConvertMMXpII32_16BGR555:
265 ; the 16BGR555 converter is identical to the RGB555 one,
266 ; except it uses a different multiplier for the pmaddwd
267 ; instruction. cool huh.
; mm7 = BGR555 multiplier; everything after this point is shared code.
269 load_immq mm7, mmx32_bgr555_mul
270 jmp _convert_bgr555_cheat
272 ; This is the same as the Intel version.. they obviously went to
273 ; much more trouble to expand/coil the loop than I did, so theirs
274 ; would almost certainly be faster, even if only a little.
275 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
276 ; (I think) a more accurate name..
;
; _ConvertMMXpII32_16RGB555
; Entry point for the 32-bit to 16-bit RGB 5-5-5 conversion. It loads the
; RGB multiplier into mm7 and falls through into _convert_bgr555_cheat,
; which _ConvertMMXpII32_16BGR555 also jumps to with its own multiplier
; already in mm7. From that label on, mm7 is the only difference between
; the two converters.
277 _ConvertMMXpII32_16RGB555:
279 load_immq mm7, mmx32_rgb555_mul
280 _convert_bgr555_cheat:
; mm6 = green field mask (shared by both entry points).
281 load_immq mm6, mmx32_rgb555_g
; NOTE(review): the argument of each CLEANUP_IMMQ_LOADS appears to match
; the number of load_immq/pand_immq uses since the previous cleanup -
; confirm against the macro definition.
282 CLEANUP_IMMQ_LOADS(2)
284 mov edx,ecx ; Save ecx (edx keeps the unmodified value)
286 and ecx,DWORD 0fffffff8h ; clear lower three bits (round down to a multiple of 8)
; Each pand_immq below applies the combined red/blue field mask
; (mmx32_rgb555_rb) to the named register.
297 pand_immq mm3, mmx32_rgb555_rb
300 pand_immq mm1, mmx32_rgb555_rb
303 CLEANUP_IMMQ_LOADS(2)
321 pand_immq mm0, mmx32_rgb555_rb
327 pand_immq mm3, mmx32_rgb555_rb
348 pand_immq mm3, mmx32_rgb555_rb
351 pand_immq mm1, mmx32_rgb555_rb
354 CLEANUP_IMMQ_LOADS(4)
; Integer masks selecting the three 5-bit fields of a 5-5-5 pixel:
; bits 4-0, bits 9-5 and bits 14-10.
383 and eax,BYTE 0000000000011111b
384 and edx, 0000001111100000b
390 and ebx, 0111110000000000b
; When assembling for ELF output, emit an empty .note.GNU-stack section
; (no alloc, no exec, no write). GNU toolchains read this as a statement
; that the object does not need an executable stack.
403 %ifidn __OUTPUT_FORMAT__,elf
404 section .note.GNU-stack noalloc noexec nowrite progbits