@ vim:filetype=armasm
+@ Assembly optimized routines for gpfce - FCE Ultra port
+@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
+
@ test
.global flushcache @ beginning_addr, end_addr, flags
add r0, r4, r12,lsl #1 @ r0 = dst_end
add r1, r5, r12 @ r1 = src_end
+ mov r2, r11
+
soft_scale_loop:
sub r1, r1, #64 @ skip borders
- mov r2, #256/8
+ orr r2, r2, #(256/8-1)<<24
soft_scale_loop_line:
ldr r12, [r1, #-8]!
ldrh r4, [r3, r4]
and r5, lr, r12,lsr #7
ldrh r5, [r3, r5]
- and r4, r4, r9, lsl #2
- orr r4, r4, r4, lsl #14 @ r4[31:16] = 1/4 pix_s 0
- and r5, r5, r9, lsl #2
- sub r6, r5, r5, lsr #2 @ r6 = 3/4 pix_s 1
+ and r11,r4, r9, lsl #2
+ orr r4, r4, r11,lsl #14 @ r4[31:16] = 1/4 pix_s 0
+ and r11,r5, r9, lsl #2
+ sub r6, r5, r11,lsr #2 @ r6 = 3/4 pix_s 1
add r4, r4, r6, lsl #16 @ pix_d 0, 1
and r6, lr, r12,lsr #15
ldrh r6, [r3, r6]
and r12,lr, r12,lsr #23
ldrh r12,[r3, r12]
- and r6, r6, r9, lsl #2
- add r5, r5, r6
+ bic r11,r6, #0x0820
+ bic r5, r5, #0x0820
+ add r5, r5, r11
mov r5, r5, lsr #1
- sub r6, r6, r6, lsr #2 @ r6 = 3/4 pix_s 2
+ and r11,r6, r9, lsl #2
+ sub r6, r6, r11,lsr #2 @ r6 = 3/4 pix_s 2
orr r5, r5, r6, lsl #16
and r6, lr, r7, lsl #1
ldrh r6, [r3, r6]
- and r12,r12,r9, lsl #2
- add r5, r5, r12,lsl #14 @ pix_d 2, 3
- and r6, r6, r9, lsl #2
+ and r11,r12,r9, lsl #2
+ add r5, r5, r11,lsl #14 @ pix_d 2, 3
orr r6, r12,r6, lsl #16 @ pix_d 4, 5
and r12,lr, r7, lsr #7
ldrh r12,[r3, r12]
and r10,lr, r7, lsr #15
ldrh r10,[r3, r10]
- and r12,r12,r9, lsl #2
- sub r8, r12,r12,lsr #2 @ r8 = 3/4 pix_s 1
- add r8, r8, r6, lsr #18
+ and r11,r12,r9, lsl #2
+ sub r8, r12,r11,lsr #2 @ r8 = 3/4 pix_s 1
+ and r11,r6, r9, lsl #18
+ add r8, r8, r11,lsr #18
+ mov r8, r8, lsl #16
and r7, lr, r7, lsr #23
ldrh r7, [r3, r7]
- and r10,r10,r9, lsl #2
- orr r8, r8, r10,lsl #15
- add r8, r8, r12,lsl #15 @ pix_d 6, 7
- sub r10,r10,r10,lsr #2 @ r10= 3/4 pix_s 2
- and r7, r7, r9, lsl #2
- add r10,r10,r7, lsr #2 @ += 1/4 pix_s 3
+ bic r11,r10,#0x0820
+ bic r12,r12,#0x0820
+ add r12,r12,r11
+ add r8, r8, r12,lsr #1 @ pix_d 6, 7
+ mov r8, r8, ror #16
+ and r11,r10,r9, lsl #2
+ sub r10,r10,r11,lsr #2 @ r10= 3/4 pix_s 2
+ and r11,r7, r9, lsl #2
+ add r10,r10,r11,lsr #2 @ += 1/4 pix_s 3
orr r10,r10,r7, lsl #16 @ pix_d 8, 9
- subs r2, r2, #1
+ subs r2, r2, #1<<24
stmdb r0!, {r4,r5,r6,r8,r10}
- bne soft_scale_loop_line
+ bpl soft_scale_loop_line
- subs r11,r11,#1
+ add r2, r2, #1<<24
+ subs r2, r2, #1
bne soft_scale_loop
ldmfd sp!,{r4-r11,lr}
bx lr
-/* buggy and slow, probably because function call overhead
-@ renderer helper, based on bitbank's method
-.global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal
-
-draw8pix:
- stmfd sp!, {r4,r5}
-
- ldrb r3, [r1] @ get bit 0 pixels
- mov r12,#1
- orr r12,r12,r12,lsl #8
- orr r12,r12,r12,lsl #16
- ldrb r1, [r1, #8] @ get bit 1 pixels
- orr r3, r3, r3, lsl #9 @ shift them over 1 byte + 1 bit
- orr r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
- and r4, r12,r3, lsr #7 @ mask off the upper nibble pixels we want
- and r5, r12,r3, lsr #3 @ mask off the lower nibble pixels we want
- ldr r2, [r2]
-
- orr r1, r1, r1, lsl #9 @ process the bit 1 pixels
- orr r1, r1, r1, lsl #18
- and r3, r12,r1, lsr #7 @ mask off the upper nibble pixels we want
- and r1, r12,r1, lsr #3 @ mask off the lower nibble
- orr r4, r4, r3, lsl #1
- orr r5, r5, r1, lsl #5
-
- @ can this be avoided?
- mov r4, r4, lsl #3 @ *8
- mov r3, r2, ror r4
- strb r3, [r0], #1
- mov r4, r4, lsr #8
- mov r3, r2, ror r4
- strb r3, [r0], #1
- mov r4, r4, lsr #8
- mov r3, r2, ror r4
- strb r3, [r0], #1
- mov r4, r4, lsr #8
- mov r3, r2, ror r4
- strb r3, [r0], #1
-
- mov r5, r5, lsl #3 @ *8
- mov r3, r2, ror r5
- strb r3, [r0], #1
- mov r5, r5, lsr #8
- mov r3, r2, ror r5
- strb r3, [r0], #1
- mov r5, r5, lsr #8
- mov r3, r2, ror r5
- strb r3, [r0], #1
- mov r5, r5, lsr #8
- mov r3, r2, ror r5
- strb r3, [r0], #1
-
- ldmfd sp!, {r4,r5}
- bx lr
-*/
+@ void convert2RGB555(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);
+@
+@ Expands 8-bit table indices into 16-bit values by lookup in pal.
+@ In:  r0 = dst   (written with stmia, 16 bytes per loop iteration)
+@      r1 = src   (read with ldmia, 8 index bytes per loop iteration)
+@      r2 = pal   (halfword table, indexed by source byte)
+@      r3 = count (number of pixels, handled in groups of 8)
+@ Out: nothing returned. r0, r1, r3, r12 and the flags are modified;
+@      r4-r8 and lr are saved on entry and restored on exit.
+@ Only count/8 whole groups are converted; the count%8 remainder is not
+@ processed. When count < 8 the routine returns without touching dst.
+@ NOTE(review): ldmia/stmia are used on src/dst, so both are presumably
+@ expected to be word aligned - confirm against the callers.
+
+.global convert2RGB555
+
+convert2RGB555:
+ movs r3, r3, lsr #3 @ r3 = number of 8-pixel groups
+ bxeq lr @ no whole group: leave, otherwise the counter below would wrap
+
+ stmfd sp!,{r4-r8,lr}
+ mov lr, #0xff
+ mov lr, lr, lsl #1 @ lr = 0x1fe: mask selecting (index byte * 2)
+
+convert2RGB555_loop:
+ ldmia r1!,{r4,r5} @ fetch 8 index bytes, advance src
+
+ and r6, lr, r4, lsl #1 @ table offset from r4 bits 7:0
+ ldrh r6, [r2, r6]
+ and r7, lr, r4, lsr #7 @ table offset from r4 bits 15:8
+ ldrh r7, [r2, r7]
+ and r8, lr, r4, lsr #15 @ table offset from r4 bits 23:16
+ ldrh r8, [r2, r8]
+ and r4, lr, r4, lsr #23 @ table offset from r4 bits 31:24
+ ldrh r4, [r2, r4]
+
+ orr r6, r6, r7, lsl #16 @ r6 = output pair 0
+ and r12,lr, r5, lsl #1 @ table offset from r5 bits 7:0
+ ldrh r12, [r2, r12]
+ orr r7, r8, r4, lsl #16 @ r7 = output pair 1
+ and r8, lr, r5, lsr #7 @ table offset from r5 bits 15:8
+ ldrh r8, [r2, r8]
+ and r4, lr, r5, lsr #15 @ table offset from r5 bits 23:16
+ ldrh r4, [r2, r4]
+ and r5, lr, r5, lsr #23 @ table offset from r5 bits 31:24
+ ldrh r5, [r2, r5]
+ orr r8, r12,r8, lsl #16 @ r8 = output pair 2
+ orr r12,r4, r5, lsl #16 @ r12 = output pair 3
+
+ stmia r0!,{r6,r7,r8,r12} @ store 8 halfwords, advance dst
+ subs r3, r3, #1 @ r3 > 0 on entry to every iteration
+ bne convert2RGB555_loop
+
+ ldmfd sp!,{r4-r8,lr}
+ bx lr