SDL-1.2.14
[sdl_omap.git] / src / hermes / x86p_32.asm
CommitLineData
e14743d1 1;
2; x86 format converters for HERMES
3; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
4; This source code is licensed under the GNU LGPL
5;
6; Please refer to the file COPYING.LIB contained in the distribution for
7; licensing conditions
8;
9; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
10;
11
; Assembler preamble: target mode, shared macros, the list of converter
; entry points defined in this file, and the code section.
12BITS 32                       ; generate code for 32-bit mode
13
; NOTE(review): SDL_FUNC is not defined anywhere in this file; presumably it
; is a macro supplied by common.inc -- confirm there.
14%include "common.inc"
15
; One SDL_FUNC line per converter routine defined below.
; NOTE(review): presumably the macro makes each symbol visible to the linker
; (and may do more, depending on output format) -- verify in common.inc.
16SDL_FUNC _ConvertX86p32_32BGR888
17SDL_FUNC _ConvertX86p32_32RGBA888
18SDL_FUNC _ConvertX86p32_32BGRA888
19SDL_FUNC _ConvertX86p32_24RGB888
20SDL_FUNC _ConvertX86p32_24BGR888
21SDL_FUNC _ConvertX86p32_16RGB565
22SDL_FUNC _ConvertX86p32_16BGR565
23SDL_FUNC _ConvertX86p32_16RGB555
24SDL_FUNC _ConvertX86p32_16BGR555
25SDL_FUNC _ConvertX86p32_8RGB332
26
; Everything that follows is assembled into the .text section.
27SECTION .text
28
29;; _Convert_*
30;; Parameters:
31;; ESI = source
32;; EDI = dest
33;; ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though))
34;; Destroys:
35;; EAX, EBX, EDX
36
37
;; ---------------------------------------------------------------------
;; _ConvertX86p32_32BGR888
;;   Copies 32-bit pixels, exchanging byte 0 and byte 2 of every pixel
;;   while bytes 1 and 3 stay in place (bswap followed by ror 8).
;;   Going by the routine name this is the RGB888 -> BGR888 swizzle.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags.  EBP is used as the block counter on the
;;         unrolled path but is saved and restored.
;; ---------------------------------------------------------------------
_ConvertX86p32_32BGR888:

        ; Counts of 32 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 32
        ja      .unrolled

.short_loop:
        mov     edx, [esi]
        bswap   edx                     ; b0 b1 b2 b3 -> b3 b2 b1 b0
        ror     edx, 8                  ;             -> b2 b1 b0 b3
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.unrolled:
        push    ebp

        ; EBP = number of 4-pixel groups
        mov     ebp, ecx
        shr     ebp, 2

        ; keep the full count for the tail (ECX is reused as a data register)
        push    ecx

.quad_loop:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        bswap   eax

        bswap   ebx

        ror     eax, 8
        mov     ecx, [esi+8]

        ror     ebx, 8
        mov     edx, [esi+12]

        bswap   ecx

        bswap   edx

        ror     ecx, 8
        mov     [edi+0], eax

        ror     edx, 8
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .quad_loop

        ; 0..3 pixels are left over after the groups of four
        pop     ecx
        and     ecx, BYTE 11b
        jz      .done

.tail_loop:
        mov     edx, [esi]
        bswap   edx
        ror     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        retn
115
116
117
118
;; ---------------------------------------------------------------------
;; _ConvertX86p32_32RGBA888
;;   Copies 32-bit pixels, rotating every pixel left by 8 bits: the top
;;   byte of the dword moves to the bottom and the other three bytes move
;;   up one position.  Going by the routine name this turns ARGB-style
;;   pixels into RGBA-style pixels.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags.  EBP is used as the block counter on the
;;         unrolled path but is saved and restored.
;; ---------------------------------------------------------------------
_ConvertX86p32_32RGBA888:

        ; Counts of 32 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 32
        ja      .unrolled

.short_loop:
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.unrolled:
        push    ebp

        ; EBP = number of 4-pixel groups
        mov     ebp, ecx
        shr     ebp, 2

        ; keep the full count for the tail (ECX is reused as a data register)
        push    ecx

.quad_loop:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        rol     eax, 8
        mov     ecx, [esi+8]

        rol     ebx, 8
        mov     edx, [esi+12]

        rol     ecx, 8
        mov     [edi+0], eax

        rol     edx, 8
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .quad_loop

        ; 0..3 pixels are left over after the groups of four
        pop     ecx
        and     ecx, BYTE 11b
        jz      .done

.tail_loop:
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        retn
186
187
188
189
;; ---------------------------------------------------------------------
;; _ConvertX86p32_32BGRA888
;;   Copies 32-bit pixels, reversing the byte order of every pixel
;;   (a single bswap per pixel).  Going by the routine name this turns
;;   ARGB-style pixels into BGRA-style pixels.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags.  EBP is used as the block counter on the
;;         unrolled path but is saved and restored.
;; ---------------------------------------------------------------------
_ConvertX86p32_32BGRA888:

        ; Counts of 32 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 32
        ja      .unrolled

.short_loop:
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.unrolled:
        push    ebp

        ; EBP = number of 4-pixel groups
        mov     ebp, ecx
        shr     ebp, 2

        ; keep the full count for the tail (ECX is reused as a data register)
        push    ecx

.quad_loop:
        mov     eax, [esi]
        mov     ebx, [esi+4]

        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        bswap   eax

        bswap   ebx

        bswap   ecx

        bswap   edx

        mov     [edi+0], eax
        mov     [edi+4], ebx

        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16

        dec     ebp
        jnz     .quad_loop

        ; 0..3 pixels are left over after the groups of four
        pop     ecx
        and     ecx, BYTE 11b
        jz      .done

.tail_loop:
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        retn
259
260
261
262
;; ---------------------------------------------------------------------
;; _ConvertX86p32_24RGB888 -- 32 bit RGB 888 to 24 bit RGB 888
;;   Copies bytes 0, 1 and 2 of every 4-byte source pixel to 3 consecutive
;;   destination bytes; source byte 3 is dropped.
;;
;;   For more than 32 pixels the routine first converts single pixels
;;   until EDI is 4-byte aligned, then packs 4 source pixels (16 bytes)
;;   into 3 destination dwords (12 bytes) per iteration, and finally
;;   converts the 0..3 remaining pixels one at a time.
;;
;; Register notation in the comments lists bytes most-significant first;
;; Bn/Gn/Rn/An are bytes 0/1/2/3 of source pixel n within the group.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags.  EBP is saved and restored.
;; ---------------------------------------------------------------------
_ConvertX86p32_24RGB888:

        ; Counts of 32 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 32
        ja      .align_head

.short_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.align_head:
        ; Emit single pixels until the destination is dword aligned.
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .unrolled
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_head

.unrolled:
        push    ebp
        mov     ebp, ecx                ; EBP = number of 4-pixel groups
        shr     ebp, 2

        ; keep the remaining count for the tail (ECX is reused below)
        push    ecx

.quad_loop:
        mov     eax, [esi]              ; pixel 0          eax = [A0][R0][G0][B0]
        mov     ebx, [esi+4]            ; pixel 1          ebx = [A1][R1][G1][B1]

        shl     eax, 8                  ;                  eax = [R0][G0][B0][..]
        mov     ecx, [esi+12]           ; pixel 3          ecx = [A3][R3][G3][B3]

        shl     ebx, 8                  ;                  ebx = [R1][G1][B1][..]
        mov     al, [esi+4]             ;                  eax = [R0][G0][B0][B1]

        ror     eax, 8                  ; output dword 0   eax = [B1][R0][G0][B0]
        mov     bh, [esi+8+1]           ;                  ebx = [R1][G1][G2][..]

        mov     [edi], eax
        add     edi, BYTE 3*4           ; EDI moves early; later stores compensate

        shl     ecx, 8                  ;                  ecx = [R3][G3][B3][..]
        mov     bl, [esi+8+0]           ;                  ebx = [R1][G1][G2][B2]

        rol     ebx, 16                 ; output dword 1   ebx = [G2][B2][R1][G1]
        mov     cl, [esi+8+2]           ; output dword 2   ecx = [R3][G3][B3][R2]

        mov     [edi+4-3*4], ebx
        add     esi, BYTE 4*4

        mov     [edi+8-3*4], ecx
        dec     ebp

        jnz     .quad_loop

        ; 0..3 pixels are left over after the groups of four
        pop     ecx
        and     ecx, BYTE 11b
        jz      .done

.tail_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        retn
356
357
358
359
;; ---------------------------------------------------------------------
;; _ConvertX86p32_24BGR888 -- 32 bit RGB 888 to 24 bit BGR 888
;;   Writes bytes 2, 1, 0 (in that order) of every 4-byte source pixel to
;;   3 consecutive destination bytes; source byte 3 is dropped.
;;
;;   For more than 32 pixels the routine first converts single pixels
;;   until EDI is 4-byte aligned, then packs 4 source pixels (16 bytes)
;;   into 3 destination dwords (12 bytes) per iteration, and finally
;;   converts the 0..3 remaining pixels one at a time.
;;
;; Register notation in the comments lists bytes most-significant first;
;; Bn/Gn/Rn/An are bytes 0/1/2/3 of source pixel n within the group.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags.  EBP is saved and restored.
;; ---------------------------------------------------------------------
_ConvertX86p32_24BGR888:

        ; Counts of 32 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 32
        ja      .align_head


.short_loop:
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.align_head:
        ; Emit single pixels until the destination is dword aligned.
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .unrolled
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_head

.unrolled:
        push    ebp
        mov     ebp, ecx                ; EBP = number of 4-pixel groups
        shr     ebp, 2

        ; keep the remaining count for the tail (ECX is reused below)
        push    ecx

.quad_loop:
        mov     eax, [esi]              ; pixel 0          eax = [A0][R0][G0][B0]
        mov     ebx, [esi+4]            ; pixel 1          ebx = [A1][R1][G1][B1]

        bswap   eax                     ;                  eax = [B0][G0][R0][A0]

        bswap   ebx                     ;                  ebx = [B1][G1][R1][A1]

        mov     al, [esi+4+2]           ;                  eax = [B0][G0][R0][R1]
        mov     bh, [esi+4+4+1]         ;                  ebx = [B1][G1][G2][A1]

        ror     eax, 8                  ; output dword 0   eax = [R1][B0][G0][R0]
        mov     bl, [esi+4+4+2]         ;                  ebx = [B1][G1][G2][R2]

        ror     ebx, 16                 ; output dword 1   ebx = [G2][R2][B1][G1]
        mov     [edi], eax

        mov     [edi+4], ebx
        mov     ecx, [esi+12]           ; pixel 3          ecx = [A3][R3][G3][B3]

        bswap   ecx                     ;                  ecx = [B3][G3][R3][A3]

        mov     cl, [esi+8]             ; output dword 2   ecx = [B3][G3][R3][B2]
        add     esi, BYTE 4*4

        mov     [edi+8], ecx
        add     edi, BYTE 3*4

        dec     ebp
        jnz     .quad_loop

        ; 0..3 pixels are left over after the groups of four
        pop     ecx
        and     ecx, BYTE 11b
        jz      .done

.tail_loop:
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        retn
457
458
459
460
;; ---------------------------------------------------------------------
;; _ConvertX86p32_16RGB565 -- 32 bit RGB 888 to 16 bit RGB 565
;;   Each output word is
;;       (red >> 3) << 11  |  (green >> 2) << 5  |  (blue >> 3)
;;   with blue / green / red taken from source bytes 0 / 1 / 2.
;;
;;   For more than 16 pixels: one pixel is converted on its own if EDI is
;;   not 4-byte aligned, then pixels are converted two at a time with one
;;   dword store per pair, then a final odd pixel is converted if present.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels; ECX is modified
;; Clobb:  EAX, EBX, EDX, flags
;; ---------------------------------------------------------------------
_ConvertX86p32_16RGB565:
        ; Counts of 16 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 16
        ja      .align_head

.short_loop:
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3                   ; red   -> 5 bits
        and     al, 11111100b           ; green -> top 6 bits kept
        shl     eax, 3                  ; AX = red at 15..11, green at 10..5
        shr     bl, 3                   ; blue  -> 5 bits
        add     al, bl                  ; low 5 bits of AL are clear, so this merges
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop

.short_done:
        retn


.align_head:
        ; One single pixel first if the destination is not dword aligned.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .pairs

        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.pairs:
        ; remember the count so the odd-pixel test can be made afterwards
        push    ecx

        ; ECX = number of pixel pairs
        shr     ecx, 1

        ; Point ESI / EDI at the end of the paired region and run ECX from
        ; -pairs up to 0, so [esi+ecx*8] / [edi+ecx*4] walk forwards.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        neg     ecx
        jmp     SHORT .pair_body

.store_prev:
        ; Store the pair finished in the previous pass; ECX has already been
        ; incremented, hence the -4.
        mov     [edi+ecx*4-4], eax
.pair_body:
        mov     eax, [esi+ecx*8]        ; first pixel of the pair

        shr     ah, 2                   ; green0 -> 6 bits
        mov     ebx, [esi+ecx*8+4]      ; second pixel of the pair

        shr     eax, 3                  ; blue0 at 4..0, green0 at 10..5
        mov     edx, [esi+ecx*8+4]      ; second pixel again (for its red)

        shr     bh, 2                   ; green1 -> 6 bits
        mov     dl, [esi+ecx*8+2]       ; DL = red0

        shl     ebx, 13                 ; blue1 at 20..16, green1 at 26..21
        and     eax, 000007FFh

        shl     edx, 8                  ; red0 at 15..8, red1 at 31..24
        and     ebx, 07FF0000h

        and     edx, 0F800F800h         ; keep the top 5 bits of each red
        add     eax, ebx

        add     eax, edx
        inc     ecx

        jnz     .store_prev

        ; store the last pair (ECX is 0 here)
        mov     [edi+ecx*4-4], eax

        ; one odd pixel left?
        pop     ecx
        test    cl, 1
        jz      .done

        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        retn
572
573
574
575
;; ---------------------------------------------------------------------
;; _ConvertX86p32_16BGR565 -- 32 bit RGB 888 to 16 bit BGR 565
;;   Each output word is
;;       (blue >> 3) << 11  |  (green >> 2) << 5  |  (red >> 3)
;;   with blue / green / red taken from source bytes 0 / 1 / 2.
;;
;;   For more than 16 pixels: one pixel is converted on its own if EDI is
;;   not 4-byte aligned, then pixels are converted two at a time with one
;;   dword store per pair, then a final odd pixel is converted if present.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels; ECX is modified
;; Clobb:  EAX, EBX, EDX, flags
;; ---------------------------------------------------------------------
_ConvertX86p32_16BGR565:

        ; Counts of 16 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 16
        ja      .align_head

.short_loop:
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3                   ; blue  -> 5 bits
        and     al, 11111100b           ; green -> top 6 bits kept
        shl     eax, 3                  ; AX = blue at 15..11, green at 10..5
        shr     bl, 3                   ; red   -> 5 bits
        add     al, bl                  ; low 5 bits of AL are clear, so this merges
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.align_head:
        ; One single pixel first if the destination is not dword aligned.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .pairs
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.pairs:
        ; remember the count so the odd-pixel test can be made afterwards
        push    ecx

        ; ECX = number of pixel pairs
        shr     ecx, 1

        ; Point ESI / EDI at the end of the paired region and run ECX from
        ; -pairs up to 0, so [esi+ecx*8] / [edi+ecx*4] walk forwards.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        neg     ecx
        jmp     SHORT .pair_body

.store_prev:
        ; Store the pair finished in the previous pass; ECX has already been
        ; incremented, hence the -4.
        mov     [edi+ecx*4-4], eax
.pair_body:
        mov     edx, [esi+ecx*8+4]      ; second pixel (its red ends up at 23..16)

        mov     bh, [esi+ecx*8+4]       ; blue1
        mov     ah, [esi+ecx*8]         ; blue0

        shr     bh, 3                   ; blue1 -> 5 bits
        mov     al, [esi+ecx*8+1]       ; green0

        shr     ah, 3                   ; blue0 -> 5 bits
        mov     bl, [esi+ecx*8+5]       ; green1

        shl     eax, 3                  ; blue0 at 15..11, green0 at 10..3
        mov     dl, [esi+ecx*8+2]       ; DL = red0

        shl     ebx, 19                 ; blue1 at 31..27, green1 at 26..19
        and     eax, 0000FFE0h          ; keep blue0 and the top 6 bits of green0

        shr     edx, 3                  ; red0 at 4..0, red1 at 20..16
        and     ebx, 0FFE00000h         ; keep blue1 and the top 6 bits of green1

        and     edx, 001F001Fh
        add     eax, ebx

        add     eax, edx
        inc     ecx

        jnz     .store_prev

        ; store the last pair (ECX is 0 here)
        mov     [edi+ecx*4-4], eax

        ; one odd pixel left?
        pop     ecx
        and     ecx, BYTE 1
        jz      .done
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111100b
        shl     eax, 3
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        retn
686
687
688
689
;; ---------------------------------------------------------------------
;; _ConvertX86p32_16RGB555 -- 32 bit RGB 888 to 16 bit RGB 555
;;   Each output word is
;;       (red >> 3) << 10  |  (green >> 3) << 5  |  (blue >> 3)
;;   with blue / green / red taken from source bytes 0 / 1 / 2; bit 15 of
;;   the result is 0.
;;
;;   For more than 16 pixels: one pixel is converted on its own if EDI is
;;   not 4-byte aligned, then pixels are converted two at a time with one
;;   dword store per pair, then a final odd pixel is converted if present.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels; ECX is modified
;; Clobb:  EAX, EBX, EDX, flags
;; ---------------------------------------------------------------------
_ConvertX86p32_16RGB555:

        ; Counts of 16 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 16
        ja      .align_head

.short_loop:
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3                   ; red   -> 5 bits
        and     al, 11111000b           ; green -> top 5 bits kept
        shl     eax, 2                  ; AX = red at 14..10, green at 9..5
        shr     bl, 3                   ; blue  -> 5 bits
        add     al, bl                  ; low 5 bits of AL are clear, so this merges
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.align_head:
        ; One single pixel first if the destination is not dword aligned.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .pairs
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.pairs:
        ; remember the count so the odd-pixel test can be made afterwards
        push    ecx

        ; ECX = number of pixel pairs
        shr     ecx, 1

        ; Point ESI / EDI at the end of the paired region and run ECX from
        ; -pairs up to 0, so [esi+ecx*8] / [edi+ecx*4] walk forwards.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        neg     ecx
        jmp     SHORT .pair_body

.store_prev:
        ; Store the pair finished in the previous pass; ECX has already been
        ; incremented, hence the -4.
        mov     [edi+ecx*4-4], eax
.pair_body:
        mov     eax, [esi+ecx*8]        ; first pixel of the pair

        shr     ah, 3                   ; green0 -> 5 bits
        mov     ebx, [esi+ecx*8+4]      ; second pixel of the pair

        shr     eax, 3                  ; blue0 at 4..0, green0 at 9..5
        mov     edx, [esi+ecx*8+4]      ; second pixel again (for its red)

        shr     bh, 3                   ; green1 -> 5 bits
        mov     dl, [esi+ecx*8+2]       ; DL = red0

        shl     ebx, 13                 ; blue1 at 20..16, green1 at 25..21
        and     eax, 000007FFh

        shl     edx, 7                  ; red0 at 14..7, red1 at 30..23
        and     ebx, 07FF0000h

        and     edx, 07C007C00h         ; keep the top 5 bits of each red
        add     eax, ebx

        add     eax, edx
        inc     ecx

        jnz     .store_prev

        ; store the last pair (ECX is 0 here)
        mov     [edi+ecx*4-4], eax

        ; one odd pixel left?
        pop     ecx
        and     ecx, BYTE 1
        jz      .done
        mov     bl, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        retn
797
798
799
800
;; ---------------------------------------------------------------------
;; _ConvertX86p32_16BGR555 -- 32 bit RGB 888 to 16 bit BGR 555
;;   Each output word is
;;       (blue >> 3) << 10  |  (green >> 3) << 5  |  (red >> 3)
;;   with blue / green / red taken from source bytes 0 / 1 / 2; bit 15 of
;;   the result is 0.
;;
;;   For more than 16 pixels: one pixel is converted on its own if EDI is
;;   not 4-byte aligned, then pixels are converted two at a time with one
;;   dword store per pair, then a final odd pixel is converted if present.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Out:    ESI / EDI advanced past the converted pixels; ECX is modified
;; Clobb:  EAX, EBX, EDX, flags
;; ---------------------------------------------------------------------
_ConvertX86p32_16BGR555:

        ; Counts of 16 or fewer are handled one pixel at a time.
        cmp     ecx, BYTE 16
        ja      .align_head


.short_loop:
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3                   ; blue  -> 5 bits
        and     al, 11111000b           ; green -> top 5 bits kept
        shl     eax, 2                  ; AX = blue at 14..10, green at 9..5
        shr     bl, 3                   ; red   -> 5 bits
        add     al, bl                  ; low 5 bits of AL are clear, so this merges
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
.short_done:
        retn

.align_head:
        ; One single pixel first if the destination is not dword aligned.
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .pairs
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.pairs:
        ; remember the count so the odd-pixel test can be made afterwards
        push    ecx

        ; ECX = number of pixel pairs
        shr     ecx, 1

        ; Point ESI / EDI at the end of the paired region and run ECX from
        ; -pairs up to 0, so [esi+ecx*8] / [edi+ecx*4] walk forwards.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]

        neg     ecx
        jmp     SHORT .pair_body

.store_prev:
        ; Store the pair finished in the previous pass; ECX has already been
        ; incremented, hence the -4.
        mov     [edi+ecx*4-4], eax
.pair_body:
        mov     edx, [esi+ecx*8+4]      ; second pixel (its red ends up at 23..16)

        mov     bh, [esi+ecx*8+4]       ; blue1
        mov     ah, [esi+ecx*8]         ; blue0

        shr     bh, 3                   ; blue1 -> 5 bits
        mov     al, [esi+ecx*8+1]       ; green0

        shr     ah, 3                   ; blue0 -> 5 bits
        mov     bl, [esi+ecx*8+5]       ; green1

        shl     eax, 2                  ; blue0 at 14..10, green0 at 9..2
        mov     dl, [esi+ecx*8+2]       ; DL = red0

        shl     ebx, 18                 ; blue1 at 30..26, green1 at 25..18
        and     eax, 00007FE0h          ; keep blue0 and the top 5 bits of green0

        shr     edx, 3                  ; red0 at 4..0, red1 at 20..16
        and     ebx, 07FE00000h         ; keep blue1 and the top 5 bits of green1

        and     edx, 001F001Fh
        add     eax, ebx

        add     eax, edx
        inc     ecx

        jnz     .store_prev

        ; store the last pair (ECX is 0 here)
        mov     [edi+ecx*4-4], eax

        ; one odd pixel left?
        pop     ecx
        and     ecx, BYTE 1
        jz      .done
        mov     ah, [esi+0]             ; blue
        mov     al, [esi+1]             ; green
        mov     bl, [esi+2]             ; red
        shr     ah, 3
        and     al, 11111000b
        shl     eax, 2
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        retn
912
913
914
915
916
;; ---------------------------------------------------------------------
;; _ConvertX86p32_8RGB332 -- 32 bit RGB to 8 bit RGB (rrrgggbb)
;;   Each output byte is
;;       (byte2 & 0E0h)  |  ((byte1 & 0E0h) >> 3)  |  (byte0 >> 6)
;;   of the corresponding 4-byte source pixel, i.e. the top 3 bits of
;;   byte 2, the top 3 bits of byte 1 and the top 2 bits of byte 0.
;;
;;   Four pixels are converted per iteration and written with one dword
;;   store; afterwards up to three trailing pixels are written one byte
;;   at a time.  A count of 0 falls straight through both loops.
;;
;; In:     ESI = source, EDI = dest, ECX = pixel count
;; Out:    ESI / EDI advanced past the converted pixels, ECX = 0
;; Clobb:  EAX, EBX, EDX, flags
;; ---------------------------------------------------------------------
_ConvertX86p32_8RGB332:


.entry:
        push    ecx                     ; keep the full count for the tail

        shr     ecx, 2                  ; ECX = number of 4-pixel groups
        jnz     .quad_loop

        jmp     .quads_done             ; (original note: short jump out of range)

.quad_loop:
        ; ---- pixels 0 and 1: results end up in AL and AH ----
        mov     eax, [esi]              ; pixel 0
        mov     edx, [esi+4]            ; pixel 1

        shr     dl, 6                   ; px1 byte0 -> 2 bits
        mov     ebx, eax

        shr     al, 6                   ; px0 byte0 -> 2 bits
        and     ah, 0e0h                ; px0 byte1 -> top 3 bits

        shr     ebx, 16                 ; BL = px0 byte2
        and     dh, 0e0h                ; px1 byte1 -> top 3 bits

        shr     ah, 3                   ; px0 byte1 bits moved to 4..2
        and     bl, 0e0h                ; px0 byte2 -> top 3 bits

        shr     dh, 3                   ; px1 byte1 bits moved to 4..2

        or      al, bl

        mov     ebx, edx
        or      al, ah                  ; AL = converted pixel 0

        shr     ebx, 16                 ; BL = px1 byte2
        or      dl, dh

        and     bl, 0e0h

        or      dl, bl                  ; DL = converted pixel 1

        mov     ah, dl                  ; AX = pixel 1 : pixel 0



        ; ---- pixels 2 and 3: built in AL / AH after AX is rotated away ----
        mov     ebx, [esi+8]            ; pixel 2

        mov     edx, ebx
        and     bh, 0e0h                ; px2 byte1 -> top 3 bits

        shr     bl, 6                   ; px2 byte0 -> 2 bits
        and     edx, 0e00000h           ; px2 byte2 -> top 3 bits (still at 23..21)

        shr     edx, 16                 ; DL = px2 byte2 top 3 bits

        shr     bh, 3

        ror     eax, 16                 ; park pixels 0/1 in the upper half of EAX
        or      bl, dl

        mov     edx, [esi+12]           ; pixel 3
        or      bl, bh                  ; BL = converted pixel 2

        mov     al, bl

        mov     ebx, edx
        and     dh, 0e0h                ; px3 byte1 -> top 3 bits

        shr     dl, 6                   ; px3 byte0 -> 2 bits
        and     ebx, 0e00000h           ; px3 byte2 -> top 3 bits (still at 23..21)

        shr     dh, 3
        mov     ah, dl

        shr     ebx, 16                 ; BL = px3 byte2 top 3 bits
        or      ah, dh

        or      ah, bl                  ; AH = converted pixel 3

        rol     eax, 16                 ; EAX = px3 : px2 : px1 : px0
        add     esi, BYTE 16

        mov     [edi], eax
        add     edi, BYTE 4

        dec     ecx
        jz      .quads_done             ; (original note: loop top out of range for short jump)

        jmp     .quad_loop
.quads_done:

        pop     ecx
        and     ecx, BYTE 3             ; 0..3 trailing pixels

        jz      .done                   ; nothing left

.tail_loop:
        mov     eax, [esi]              ; single pixel conversion

        mov     ebx, eax

        shr     al, 6
        and     ah, 0e0h

        shr     ebx, 16

        shr     ah, 3
        and     bl, 0e0h

        or      al, ah
        or      al, bl

        mov     [edi], al

        inc     edi
        add     esi, BYTE 4

        dec     ecx
        jnz     .tail_loop

.done:
        retn
1042
; Only when assembling to the "elf" output format: emit an empty
; .note.GNU-stack section flagged noexec. GNU toolchains read this marker as
; "this object does not need an executable stack".
1043%ifidn __OUTPUT_FORMAT__,elf
1044section .note.GNU-stack noalloc noexec nowrite progbits
1045%endif