src/hermes/mmxp2_32.asm

   1 ;
   2 ; pII-optimised MMX format converters for HERMES
   3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
   4 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
   5 ; This source code is licensed under the GNU LGPL
   6 ;
   7 ; Please refer to the file COPYING.LIB contained in the distribution for
   8 ; licensing conditions
   9 ;
  10 ; COPYRIGHT NOTICE
  11 ;
  12 ; This file partly contains code that is (c) Intel Corporation, specifically
  13 ; the mode detection routine, and the converter to 15 bit (8 pixel
  14 ; conversion routine from the mmx programming tutorial pages).
  15 ;
  16 ;
  17 ; These routines aren't exactly pII optimised - it's just that as they
  18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
  19 ; optimise them for p5 MMXs..
  20
  21 BITS 32
  22
  23 %include "common.inc"
  24
  25 SDL_FUNC _ConvertMMXpII32_24RGB888
  26 SDL_FUNC _ConvertMMXpII32_16RGB565
  27 SDL_FUNC _ConvertMMXpII32_16BGR565
  28 SDL_FUNC _ConvertMMXpII32_16RGB555
  29 SDL_FUNC _ConvertMMXpII32_16BGR555
  30
  31 ;; Macros for conversion routines
  32
  33 %macro _push_immq_mask 1
  34         push dword %1
  35         push dword %1
  36 %endmacro
  37
  38 %macro load_immq 2
  39         _push_immq_mask %2
  40         movq %1, [esp]
  41 %endmacro
  42
  43 %macro pand_immq 2
  44         _push_immq_mask %2
  45         pand %1, [esp]
  46 %endmacro
  47
  48 %define CLEANUP_IMMQ_LOADS(num) \
  49         add esp, byte 8 * num
  50
  51 %define mmx32_rgb888_mask 00ffffffh
  52 %define mmx32_rgb565_b 000000f8h
  53 %define mmx32_rgb565_g 0000fc00h
  54 %define mmx32_rgb565_r 00f80000h
  55
  56 %define mmx32_rgb555_rb 00f800f8h
  57 %define mmx32_rgb555_g 0000f800h
  58 %define mmx32_rgb555_mul 20000008h
  59 %define mmx32_bgr555_mul 00082000h
  60
  61 SECTION .text
  62
  63 _ConvertMMXpII32_24RGB888:
  64
  65         ; set up mm6 as the mask, mm7 as zero
  66         load_immq mm6, mmx32_rgb888_mask
  67         CLEANUP_IMMQ_LOADS(1)
  68         pxor mm7, mm7
  69
  70         mov edx, ecx                    ; save ecx
  71         and ecx, 0fffffffch             ; clear lower two bits
  72         jnz .L1
  73         jmp .L2
  74
  75 .L1:
  76
  77         movq mm0, [esi]                 ; A R G B a r g b
  78         pand mm0, mm6                   ; 0 R G B 0 r g b
  79         movq mm1, [esi+8]               ; A R G B a r g b
  80         pand mm1, mm6                   ; 0 R G B 0 r g b
  81
  82         movq mm2, mm0                   ; 0 R G B 0 r g b
  83         punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
  84         punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
  85         psllq mm2, 24                   ; 0 0 R G B 0 0 0
  86         por mm0, mm2                    ; 0 0 R G B r g b
  87
  88         movq mm3, mm1                   ; 0 R G B 0 r g b
  89         psllq mm3, 48                   ; g b 0 0 0 0 0 0
  90         por mm0, mm3                    ; g b R G B r g b
  91
  92         movq mm4, mm1                   ; 0 R G B 0 r g b
  93         punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
  94         punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
  95         psrlq mm1, 16                   ; 0 0 0 R G B 0 r
  96         psllq mm4, 8                    ; 0 0 0 0 R G B 0
  97         por mm1, mm4                    ; 0 0 0 0 R G B r
  98
  99         movq [edi], mm0
 100         add esi, BYTE 16
 101         movd [edi+8], mm1
 102         add edi, BYTE 12
 103         sub ecx, BYTE 4
 104         jnz .L1
 105
 106 .L2:
 107         mov ecx, edx
 108         and ecx, BYTE 3
 109         jz .L4
 110 .L3:
 111         mov al, [esi]
 112         mov bl, [esi+1]
 113         mov dl, [esi+2]
 114         mov [edi], al
 115         mov [edi+1], bl
 116         mov [edi+2], dl
 117         add esi, BYTE 4
 118         add edi, BYTE 3
 119         dec ecx
 120         jnz .L3
 121 .L4:
 122         return
 123
 124
 125
 126 _ConvertMMXpII32_16RGB565:
 127
 128         ; set up masks
 129         load_immq mm5, mmx32_rgb565_b
 130         load_immq mm6, mmx32_rgb565_g
 131         load_immq mm7, mmx32_rgb565_r
 132         CLEANUP_IMMQ_LOADS(3)
 133
 134         mov edx, ecx
 135         shr ecx, 2
 136         jnz .L1
 137         jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
 138
 139 .L1:
 140         movq mm0, [esi]         ; argb
 141         movq mm1, mm0           ; argb
 142         pand mm0, mm6           ; 00g0
 143         movq mm3, mm1           ; argb
 144         pand mm1, mm5           ; 000b
 145         pand mm3, mm7           ; 0r00
 146         pslld mm1, 2            ; 0 0 000000bb bbb00000
 147         por mm0, mm1            ; 0 0 ggggggbb bbb00000
 148         psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
 149
 150         movq mm4, [esi+8]       ; argb
 151         movq mm2, mm4           ; argb
 152         pand mm4, mm6           ; 00g0
 153         movq mm1, mm2           ; argb
 154         pand mm2, mm5           ; 000b
 155         pand mm1, mm7           ; 0r00
 156         pslld mm2, 2            ; 0 0 000000bb bbb00000
 157         por mm4, mm2            ; 0 0 ggggggbb bbb00000
 158         psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
 159
 160         packuswb mm3, mm1       ; R 0 r 0
 161         packssdw mm0, mm4       ; as above.. ish
 162         por mm0, mm3            ; done.
 163         movq [edi], mm0
 164
 165         add esi, 16
 166         add edi, 8
 167         dec ecx
 168         jnz .L1
 169
 170 .L2:
 171         mov ecx, edx
 172         and ecx, BYTE 3
 173         jz .L4
 174 .L3:
 175         mov al, [esi]
 176         mov bh, [esi+1]
 177         mov ah, [esi+2]
 178         shr al, 3
 179         and eax, 0F81Fh            ; BYTE?
 180         shr ebx, 5
 181         and ebx, 07E0h             ; BYTE?
 182         add eax, ebx
 183         mov [edi], al
 184         mov [edi+1], ah
 185         add esi, BYTE 4
 186         add edi, BYTE 2
 187         dec ecx
 188         jnz .L3
 189
 190 .L4:
 191         retn
 192
 193
 194 _ConvertMMXpII32_16BGR565:
 195
 196         load_immq mm5, mmx32_rgb565_r
 197         load_immq mm6, mmx32_rgb565_g
 198         load_immq mm7, mmx32_rgb565_b
 199         CLEANUP_IMMQ_LOADS(3)
 200
 201         mov edx, ecx
 202         shr ecx, 2
 203         jnz .L1
 204         jmp .L2
 205
 206 .L1:
 207         movq mm0, [esi]                 ; a r g b
 208         movq mm1, mm0                   ; a r g b
 209         pand mm0, mm6                   ; 0 0 g 0
 210         movq mm3, mm1                   ; a r g b
 211         pand mm1, mm5                   ; 0 r 0 0
 212         pand mm3, mm7                   ; 0 0 0 b
 213
 214         psllq mm3, 16                   ; 0 b 0 0
 215         psrld mm1, 14                   ; 0 0 000000rr rrr00000
 216         por mm0, mm1                    ; 0 0 ggggggrr rrr00000
 217         psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
 218
 219         movq mm4, [esi+8]               ; a r g b
 220         movq mm2, mm4                   ; a r g b
 221         pand mm4, mm6                   ; 0 0 g 0
 222         movq mm1, mm2                   ; a r g b
 223         pand mm2, mm5                   ; 0 r 0 0
 224         pand mm1, mm7                   ; 0 0 0 b
 225
 226         psllq mm1, 16                   ; 0 b 0 0
 227         psrld mm2, 14                   ; 0 0 000000rr rrr00000
 228         por mm4, mm2                    ; 0 0 ggggggrr rrr00000
 229         psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
 230
 231         packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
 232         packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
 233         por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
 234         movq [edi], mm0
 235
 236         add esi, BYTE 16
 237         add edi, BYTE 8
 238         dec ecx
 239         jnz .L1
 240
 241 .L2:
 242         and edx, BYTE 3
 243         jz .L4
 244 .L3:
 245         mov al, [esi+2]
 246         mov bh, [esi+1]
 247         mov ah, [esi]
 248         shr al, 3
 249         and eax, 0F81Fh                    ; BYTE ?
 250         shr ebx, 5
 251         and ebx, 07E0h                     ; BYTE ?
 252         add eax, ebx
 253         mov [edi], al
 254         mov [edi+1], ah
 255         add esi, BYTE 4
 256         add edi, BYTE 2
 257         dec edx
 258         jnz .L3
 259
 260 .L4:
 261         retn
 262
 263 _ConvertMMXpII32_16BGR555:
 264
 265         ; the 16BGR555 converter is identical to the RGB555 one,
 266         ; except it uses a different multiplier for the pmaddwd
 267         ; instruction.  cool huh.
 268
 269         load_immq mm7, mmx32_bgr555_mul
 270         jmp _convert_bgr555_cheat
 271
 272 ; This is the same as the Intel version.. they obviously went to
 273 ; much more trouble to expand/coil the loop than I did, so theirs
 274 ; would almost certainly be faster, even if only a little.
 275 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
 276 ; (I think) a more accurate name..
 277 _ConvertMMXpII32_16RGB555:
 278
 279         load_immq mm7, mmx32_rgb555_mul
 280 _convert_bgr555_cheat:
 281         load_immq mm6, mmx32_rgb555_g
 282         CLEANUP_IMMQ_LOADS(2)
 283
 284         mov edx,ecx                        ; Save ecx
 285
 286         and ecx,DWORD 0fffffff8h            ; clear lower three bits
 287         jnz .L_OK
 288         jmp near .L2
 289
 290 .L_OK:
 291
 292         movq mm2,[esi+8]
 293
 294         movq mm0,[esi]
 295         movq mm3,mm2
 296
 297         pand_immq mm3, mmx32_rgb555_rb
 298         movq mm1,mm0
 299
 300         pand_immq mm1, mmx32_rgb555_rb
 301         pmaddwd mm3,mm7
 302
 303         CLEANUP_IMMQ_LOADS(2)
 304
 305         pmaddwd mm1,mm7
 306         pand mm2,mm6
 307
 308 .L1:
 309         movq mm4,[esi+24]
 310         pand mm0,mm6
 311
 312         movq mm5,[esi+16]
 313         por mm3,mm2
 314
 315         psrld mm3,6
 316         por mm1,mm0
 317
 318         movq mm0,mm4
 319         psrld mm1,6
 320
 321         pand_immq mm0, mmx32_rgb555_rb
 322         packssdw mm1,mm3
 323
 324         movq mm3,mm5
 325         pmaddwd mm0,mm7
 326
 327         pand_immq mm3, mmx32_rgb555_rb
 328         pand mm4,mm6
 329
 330         movq [edi],mm1
 331         pmaddwd mm3,mm7
 332
 333         add esi,BYTE 32
 334         por mm4,mm0
 335
 336         pand mm5,mm6
 337         psrld mm4,6
 338
 339         movq mm2,[esi+8]
 340         por mm5,mm3
 341
 342         movq mm0,[esi]
 343         psrld mm5,6
 344
 345         movq mm3,mm2
 346         movq mm1,mm0
 347
 348         pand_immq mm3, mmx32_rgb555_rb
 349         packssdw mm5,mm4
 350
 351         pand_immq mm1, mmx32_rgb555_rb
 352         pand mm2,mm6
 353
 354         CLEANUP_IMMQ_LOADS(4)
 355
 356         movq [edi+8],mm5
 357         pmaddwd mm3,mm7
 358
 359         pmaddwd mm1,mm7
 360         add edi,BYTE 16
 361
 362         sub ecx,BYTE 8
 363         jz .L2
 364         jmp .L1
 365
 366
 367 .L2:
 368         mov ecx,edx
 369
 370         and ecx,BYTE 7
 371         jz .L4
 372
 373 .L3:
 374         mov ebx,[esi]
 375         add esi,BYTE 4
 376
 377         mov eax,ebx
 378         mov edx,ebx
 379
 380         shr eax,3
 381         shr edx,6
 382
 383         and eax,BYTE 0000000000011111b
 384         and edx,     0000001111100000b
 385
 386         shr ebx,9
 387
 388         or eax,edx
 389
 390         and ebx,     0111110000000000b
 391
 392         or eax,ebx
 393
 394         mov [edi],ax
 395         add edi,BYTE 2
 396
 397         dec ecx
 398         jnz .L3
 399
 400 .L4:
 401         retn
 402
 403 %ifidn __OUTPUT_FORMAT__,elf
 404 section .note.GNU-stack noalloc noexec nowrite progbits
 405 %endif