@ vim:filetype=armasm

@ Assembly optimized routines for gpfce - FCE Ultra port
@ (c) Copyright 2007, Grazvydas "notaz" Ignotas

@ Provenance: post-patch content of drivers/gp2x/asmutils.s, recovered from the
@ fceu.git diff c87a3ab..9df1b2c (commit 7b356ee3dc5d7e54d9dc06c413f84380d1044441).
@ NOTE(review): that diff does not show one unchanged line of the old file
@ (old line 6, immediately before the flushcache label) - confirm against the
@ repository that nothing but whitespace is missing here.
@
@ Syntax: GNU as, ARM (32-bit) state. Arguments arrive in r0-r3; r4-r11 are
@ saved and restored by every routine below that uses them.

@ test
.global flushcache @ beginning_addr, end_addr, flags

@ flushcache(beginning_addr, end_addr, flags)
@ Issues software interrupt 0x9f0002 with r0-r2 passed through untouched.
@ NOTE(review): presumably the Linux (OABI) cacheflush syscall - confirm.
flushcache:
    swi     #0x9f0002
    bx      lr                      @ same return form as the other routines


.global block_or @ void *src, size_t n, int pat

@ void block_or(void *src, size_t n, int pat)
@ ORs the byte 'pat' (replicated into all four bytes of a word) into memory at
@ src, 16 bytes per iteration, for n/16 iterations. Any n%16 tail is ignored.
@ Returns immediately when n < 16 (previously the counter wrapped around).
@ In:    r0 = src (accessed with ldmia/stmia - presumably must be word aligned)
@        r1 = n, r2 = pat
@ Clobb: r0-r3, r12, flags
block_or:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ nothing to do, nothing pushed yet
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8      @ replicate pattern byte: 0xPP -> 0xPPPP
    orr     r2, r2, r2, lsl #16     @                      -> 0xPPPPPPPP
block_loop_or:
    ldmia   r0, {r3-r5,r12}
    subs    r1, r1, #1
    orr     r3, r3, r2
    orr     r4, r4, r2
    orr     r5, r5, r2
    orr     r12,r12,r2
    stmia   r0!, {r3-r5,r12}
    bne     block_loop_or
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_and @ void *src, size_t n, int andpat

@ void block_and(void *src, size_t n, int andpat)
@ ANDs the byte 'andpat' (replicated into all four bytes of a word) into memory
@ at src, 16 bytes per iteration, for n/16 iterations. Any n%16 tail is ignored.
@ Returns immediately when n < 16 (previously the counter wrapped around).
@ In:    r0 = src (accessed with ldmia/stmia - presumably must be word aligned)
@        r1 = n, r2 = andpat
@ Clobb: r0-r3, r12, flags
block_and:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ nothing to do, nothing pushed yet
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8      @ replicate pattern byte into a full word
    orr     r2, r2, r2, lsl #16
block_loop_and:
    ldmia   r0, {r3-r5,r12}
    subs    r1, r1, #1
    and     r3, r3, r2
    and     r4, r4, r2
    and     r5, r5, r2
    and     r12,r12,r2
    stmia   r0!, {r3-r5,r12}
    bne     block_loop_and
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_andor @ void *src, size_t n, int andpat, int orpat

@ void block_andor(void *src, size_t n, int andpat, int orpat)
@ For n/16 iterations of 16 bytes each: mem = (mem & andpat) | orpat, with both
@ pattern bytes replicated into all four bytes of a word. n%16 tail is ignored.
@ Returns immediately when n < 16 (previously the counter wrapped around).
@ In:    r0 = src (accessed with ldmia/stmia - presumably must be word aligned)
@        r1 = n, r2 = andpat, r3 = orpat
@ Clobb: r0-r3, r12, flags
block_andor:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ nothing to do, nothing pushed yet
    stmfd   sp!, {r4-r6}
    orr     r2, r2, r2, lsl #8      @ replicate AND pattern byte
    orr     r2, r2, r2, lsl #16
    orr     r3, r3, r3, lsl #8      @ replicate OR pattern byte
    orr     r3, r3, r3, lsl #16
block_loop_andor:
    ldmia   r0, {r4-r6,r12}
    subs    r1, r1, #1
    and     r4, r4, r2
    orr     r4, r4, r3
    and     r5, r5, r2
    orr     r5, r5, r3
    and     r6, r6, r2
    orr     r6, r6, r3
    and     r12,r12,r2
    orr     r12,r12,r3
    stmia   r0!, {r4-r6,r12}
    bne     block_loop_andor
    ldmfd   sp!, {r4-r6}
    bx      lr


.global spend_cycles @ c

@ spend_cycles(c)
@ Busy-wait: counts (c >> 2) - 2 down past zero; the loop body always runs at
@ least once.
@ In:    r0 = c
@ Clobb: r0, flags
spend_cycles:
    mov     r0, r0, lsr #2          @ 4 cycles/iteration
    sub     r0, r0, #2              @ entry/exit/init
.sc_loop:
    subs    r0, r0, #1
    bpl     .sc_loop

    bx      lr


.global memset32 @ int *dest, int c, int count

@ void memset32(int *dest, int c, int count)
@ Stores 'count' 32-bit words of value c starting at dest: four words per loop
@ iteration, then a 1-word and/or 2-word tail. count == 0 stores nothing.
@ NOTE(review): a negative count is not rejected - callers must pass count >= 0.
@ In:    r0 = dest, r1 = c, r2 = count (in words)
@ Clobb: r0, r2, r3, r12, flags (lr is saved and restored)
memset32:
    stmfd   sp!, {lr}

    mov     r3, r1                  @ r3 also feeds the 2-word tail store
    subs    r2, r2, #4
    bmi     mst32_fin               @ fewer than 4 words: tail only

    mov     r12,r1
    mov     lr, r1

mst32_loop:
    subs    r2, r2, #4
    stmia   r0!, {r1,r3,r12,lr}
    bpl     mst32_loop

mst32_fin:
    @ r2 is negative here; its low two bits still equal count & 3
    tst     r2, #1
    strne   r1, [r0], #4

    tst     r2, #2
    stmneia r0!, {r1,r3}

    ldmfd   sp!, {lr}
    bx      lr


@ warning: this code relies on palette being strictly RGB555, i.e. bit5=0
.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines

@ void soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@ In-place scaler working backwards from the end of the region: source bytes
@ are read from dst + line_offs*320, looked up in pal (16-bit entries) and
@ written as 16-bit pixels to dst + line_offs*320*2. Per line, 64 source bytes
@ are skipped and 256 source pixels (32 groups of 8) become 320 output pixels
@ (8 -> 10, blending neighbours in 1/4, 1/2 and 3/4 steps).
@ Returns immediately when lines <= 0 (previously the line counter wrapped).
@ The per-line group counter lives in r2[31:24], so lines must be < 1<<24.
@ In:    r0 = dst, r1 = pal, r2 = line_offs, r3 = lines
@ Clobb: r0-r3, r12, flags (r4-r11 and lr are saved and restored)
soft_scale:
    cmp     r3, #0
    bxle    lr                      @ no lines: nothing pushed yet, just return

    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1          @ lr = 0x1fe, mask for (palette index * 2)
    mov     r9, #0x3900             @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7

    mov     r11,r3                  @ r11= line counter
    mov     r3, r1                  @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2
    add     r4, r0, r2, lsl #1      @ r4 = dst_start
    add     r5, r0, r2              @ r5 = src_start
    mul     r12,r11,r12
    add     r0, r4, r12,lsl #1      @ r0 = dst_end
    add     r1, r5, r12             @ r1 = src_end

    mov     r2, r11

soft_scale_loop:
    sub     r1, r1, #64             @ skip borders
    orr     r2, r2, #(256/8-1)<<24  @ r2[31:24] = 8-pixel groups left in this line

soft_scale_loop_line:
    ldr     r12, [r1, #-8]!         @ source pixels 0-3 of this group
    ldr     r7, [r1, #4]            @ source pixels 4-7 of this group

    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]
    and     r11,r4, r9, lsl #2
    orr     r4, r4, r11,lsl #14     @ r4[31:16] = 1/4 pix_s 0
    and     r11,r5, r9, lsl #2
    sub     r6, r5, r11,lsr #2      @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16     @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]

    mov     r11,r6, ror #11
    adds    r5, r11,r5, ror #11
    mov     r5, r5, ror #22
    bic     r5, r5, #0xff000000
    bic     r5, r5, #0x0420         @ set the green bits as they should be
    orrcs   r5, r5, #0x0400

    and     r11,r6, r9, lsl #2
    sub     r6, r6, r11,lsr #2      @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]
    and     r11,r12,r9, lsl #2
    add     r5, r5, r11,lsl #14     @ pix_d 2, 3
    orr     r6, r12,r6, lsl #16     @ pix_d 4, 5

    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]
    and     r11,r12,r9, lsl #2
    sub     r8, r12,r11,lsr #2      @ r8 = 3/4 pix_s 1
    and     r11,r6, r9, lsl #18
    add     r8, r8, r11,lsr #18
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]

    mov     r11,r10,ror #11
    adds    r12,r11,r12,ror #11
    mov     r12,r12,ror #22
    bic     r12,r12,#0x0420
    orrcs   r12,r12,#0x0400
    orr     r8, r8, r12,lsl #16     @ pix_d 6, 7

    and     r11,r10,r9, lsl #2
    sub     r10,r10,r11,lsr #2      @ r10= 3/4 pix_s 2
    and     r11,r7, r9, lsl #2
    add     r10,r10,r11,lsr #2      @ += 1/4 pix_s 3
    orr     r10,r10,r7, lsl #16     @ pix_d 8, 9

    subs    r2, r2, #1<<24

    stmdb   r0!, {r4,r5,r6,r8,r10}
    bpl     soft_scale_loop_line

    add     r2, r2, #1<<24          @ undo the final decrement of r2[31:24]
    subs    r2, r2, #1              @ one line done
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


@ void convert2RGB555(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);

.global convert2RGB555

@ Converts 8-bit palette indices at src into 16-bit pixels at dst by looking
@ each index up in pal (16-bit entries). Eight pixels per iteration, count/8
@ iterations; any count%8 tail is ignored.
@ Returns immediately when count < 8 (previously the counter wrapped around).
@ In:    r0 = dst, r1 = src (both accessed with ldmia/stmia - presumably must
@        be word aligned), r2 = pal, r3 = count (in pixels)
@ Clobb: r0, r1, r3, r12, flags (r4-r8 and lr are saved and restored)
convert2RGB555:
    movs    r3, r3, lsr #3          @ r3 = number of 8-pixel groups
    bxeq    lr                      @ nothing to do, nothing pushed yet

    stmfd   sp!,{r4-r8,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1          @ lr = 0x1fe, mask for (palette index * 2)

convert2RGB555_loop:
    ldmia   r1!,{r4,r5}             @ 8 source pixels

    and     r6, lr, r4, lsl #1
    ldrh    r6, [r2, r6]
    and     r7, lr, r4, lsr #7
    ldrh    r7, [r2, r7]
    and     r8, lr, r4, lsr #15
    ldrh    r8, [r2, r8]
    and     r4, lr, r4, lsr #23
    ldrh    r4, [r2, r4]

    orr     r6, r6, r7, lsl #16     @ pixels 0, 1
    and     r12,lr, r5, lsl #1
    ldrh    r12, [r2, r12]
    orr     r7, r8, r4, lsl #16     @ pixels 2, 3
    and     r8, lr, r5, lsr #7
    ldrh    r8, [r2, r8]
    and     r4, lr, r5, lsr #15
    ldrh    r4, [r2, r4]
    and     r5, lr, r5, lsr #23
    ldrh    r5, [r2, r5]
    orr     r8, r12,r8, lsl #16     @ pixels 4, 5
    orr     r12,r4, r5, lsl #16     @ pixels 6, 7

    stmia   r0!,{r6,r7,r8,r12}
    subs    r3, r3, #1
    bne     convert2RGB555_loop

    ldmfd   sp!,{r4-r8,lr}
    bx      lr