@ vim:filetype=armasm

@ Assembly optimized routines for gpfce - FCE Ultra port
@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
@
@ Provenance: drivers/gp2x/asmutils.s from fceu.git, routines added by
@ commit 7b356ee3dc5d7e54d9dc06c413f84380d1044441 (notaz.gp2x.de gitweb).
@ Only the routines that are fully visible in that diff are reproduced here.
@ flushcache, block_or, block_andor and spend_cycles appear in the diff only
@ as partial context and are intentionally left out.
@
@ Syntax: GNU as, 32-bit ARM (pre-UAL mnemonics such as stmneia).
@ All routines are leaf functions returning with "bx lr"; r0-r3 and r12 are
@ used as scratch, every other register that is touched is saved/restored.


@ ---------------------------------------------------------------------------
@ void block_and(void *src, size_t n, int andpat)
@
@ ANDs a memory block in place with a byte pattern, 16 bytes per iteration.
@
@ In:    r0 = src     (accessed with ldmia/stmia; assumes word alignment
@                      - TODO confirm against callers)
@        r1 = n       byte count; only whole 16-byte chunks are processed,
@                      a trailing (n & 15) bytes are left untouched
@        r2 = andpat  pattern; it is ORed with itself shifted by 8 and 16, so
@                      a value in 0..255 is replicated into all four bytes
@ Clobb: r0-r3, r12, flags
@
@ Fix: n < 16 used to yield a chunk count of 0, which the subs/bne loop
@ treated as 2^32 iterations. Such calls now return without touching memory.
@ ---------------------------------------------------------------------------
.global block_and @ void *src, size_t n, int andpat

block_and:
    movs    r1, r1, lsr #4              @ r1 = number of 16-byte chunks
    bxeq    lr                          @ nothing to do for n < 16
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16         @ r2 = pattern replicated per byte
block_loop_and:
    ldmia   r0, {r3-r5,r12}             @ load 4 words, no writeback
    subs    r1, r1, #1                  @ flags consumed by the bne below
    and     r3, r3, r2
    and     r4, r4, r2
    and     r5, r5, r2
    and     r12,r12,r2
    stmia   r0!, {r3-r5,r12}            @ store back and advance 16 bytes
    bne     block_loop_and
    ldmfd   sp!, {r4-r5}
    bx      lr


@ ---------------------------------------------------------------------------
@ void memset32(int *dest, int c, int count)
@
@ Stores the 32-bit value c into count consecutive words starting at dest.
@
@ In:    r0 = dest   (word stores; assumes word alignment - TODO confirm)
@        r1 = c      value to store
@        r2 = count  number of 32-bit words (not bytes); 0 stores nothing
@ Clobb: r0, r2, r3, r12, flags (lr is used as a data register, hence saved)
@
@ The main loop stores 4 words per pass. The counter is biased by -4, so when
@ the loop exits r2 is negative but its two low bits still equal count & 3;
@ the tail uses them to store the remaining 1 and/or 2 words.
@ ---------------------------------------------------------------------------
.global memset32 @ int *dest, int c, int count

memset32:
    stmfd   sp!, {lr}

    mov     r3, r1
    subs    r2, r2, #4
    bmi     mst32_fin                   @ fewer than 4 words: tail only

    mov     r12,r1
    mov     lr, r1

mst32_loop:
    subs    r2, r2, #4
    stmia   r0!, {r1,r3,r12,lr}         @ 4 words per pass
    bpl     mst32_loop

mst32_fin:
    tst     r2, #1                      @ bit 0 of count: one more word
    strne   r1, [r0], #4

    tst     r2, #2                      @ bit 1 of count: two more words
    stmneia r0!, {r1,r3}

    ldmfd   sp!, {lr}
    bx      lr


@ ---------------------------------------------------------------------------
@ void soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@
@ Horizontally stretches 8-bit paletted lines to 16-bit pixels: every 8
@ source pixels are turned into 10 destination pixels (32 groups per line,
@ i.e. 256 source pixels -> 320 destination pixels), blending neighbouring
@ palette colours in 1/4 : 3/4 and 1/2 : 1/2 ratios.
@
@ warning: this code relies on palette being strictly RGB555, i.e. bit5=0
@
@ In:    r0 = dst        buffer base. Both pointers are derived from it:
@                          dst_start = dst + line_offs*320*2  (16-bit pixels)
@                          src_start = dst + line_offs*320    (8-bit pixels)
@                        so the 8-bit source is read from the same buffer
@                        that receives the 16-bit output.
@        r1 = pal        table of 16-bit colours indexed by source byte
@        r2 = line_offs  first line, in units of 320-pixel lines
@        r3 = lines      number of lines to process
@ Clobb: r0-r3, r12, flags (r4-r11 and lr are saved and restored)
@
@ Processing runs backwards from the end of the last line (pre-decrement
@ loads, stmdb stores); presumably this is what makes the expanding in-place
@ conversion safe - NOTE(review): confirm with the caller's buffer layout.
@ Source words are fetched with ldr, so the source address is assumed to be
@ word aligned and the byte order little-endian - TODO confirm.
@
@ Fix: lines <= 0 used to wrap the line counter and run away through memory;
@ the routine now returns immediately in that case.
@ ---------------------------------------------------------------------------
.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines

soft_scale:
    cmp     r3, #0
    bxle    lr                          @ no lines requested

    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1              @ lr = 0x1fe: mask for (index * 2)
    mov     r9, #0x3900                 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7             @ r9 = 0x39e7; (r9 << 2) masks the bits kept when taking 1/4

    mov     r11,r3                      @ r11= line counter
    mov     r3, r1                      @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2                  @ r2 = line_offs * 320
    add     r4, r0, r2, lsl #1          @ r4 = dst_start
    add     r5, r0, r2                  @ r5 = src_start
    mul     r12,r11,r12                 @ r12 = lines * 320
    add     r0, r4, r12,lsl #1          @ r0 = dst_end
    add     r1, r5, r12                 @ r1 = src_end

    mov     r2, r11                     @ r2[23:0] = lines left

soft_scale_loop:
    sub     r1, r1, #64                 @ skip borders
    orr     r2, r2, #(256/8-1)<<24      @ r2[31:24] = 8-pixel groups left in this line - 1

soft_scale_loop_line:
    ldr     r12, [r1, #-8]!             @ source pixels 0..3 of this group
    ldr     r7,  [r1, #4]               @ source pixels 4..7 of this group

    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]
    and     r11,r4, r9, lsl #2
    orr     r4, r4, r11,lsl #14         @ r4[31:16] = 1/4 pix_s 0
    and     r11,r5, r9, lsl #2
    sub     r6, r5, r11,lsr #2          @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16         @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]

    mov     r11,r6, ror #11
    adds    r5, r11,r5, ror #11         @ sum of pix_s 1 and pix_s 2; carry is used by orrcs below
    mov     r5, r5, ror #22
    bic     r5, r5, #0xff000000
    bic     r5, r5, #0x0420             @ set the green bits as they should be
    orrcs   r5, r5, #0x0400

    and     r11,r6, r9, lsl #2
    sub     r6, r6, r11,lsr #2          @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]
    and     r11,r12,r9, lsl #2
    add     r5, r5, r11,lsl #14         @ pix_d 2, 3
    orr     r6, r12,r6, lsl #16         @ pix_d 4, 5

    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]
    and     r11,r12,r9, lsl #2
    sub     r8, r12,r11,lsr #2          @ r8 = 3/4 pix_s 1 (pixel 1 of the second source word)
    and     r11,r6, r9, lsl #18
    add     r8, r8, r11,lsr #18
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]

    mov     r11,r10,ror #11
    adds    r12,r11,r12,ror #11
    mov     r12,r12,ror #22
    bic     r12,r12,#0x0420
    orrcs   r12,r12,#0x0400
    orr     r8, r8, r12,lsl #16         @ pix_d 6, 7

    and     r11,r10,r9, lsl #2
    sub     r10,r10,r11,lsr #2          @ r10= 3/4 pix_s 2
    and     r11,r7, r9, lsl #2
    add     r10,r10,r11,lsr #2          @ += 1/4 pix_s 3
    orr     r10,r10,r7, lsl #16         @ pix_d 8, 9

    subs    r2, r2, #1<<24              @ one group done; goes negative after the last group

    stmdb   r0!, {r4,r5,r6,r8,r10}      @ write 10 destination pixels (flags untouched)
    bpl     soft_scale_loop_line

    add     r2, r2, #1<<24              @ undo the final group decrement
    subs    r2, r2, #1                  @ next line
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


@ ---------------------------------------------------------------------------
@ void convert2RGB555(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);
@
@ Expands 8-bit palette indices to 16-bit pixels by table lookup, 8 pixels
@ per iteration. Palette entries are copied unmodified.
@
@ In:    r0 = dst    receives count & ~7 halfwords (stored with stmia)
@        r1 = src    palette indices, fetched two words at a time with ldmia
@                    (assumes word alignment and little-endian byte order
@                    - TODO confirm against callers)
@        r2 = pal    table of 16-bit colours indexed by source byte
@        r3 = count  pixel count; only whole groups of 8 are converted, a
@                    trailing (count & 7) pixels are ignored
@ Clobb: r0, r1, r3, r12, flags (r4-r8 and lr are saved and restored)
@
@ Fix: count < 8 used to yield a group count of 0, which the subs/bne loop
@ treated as 2^32 iterations. Such calls now return without touching memory.
@ ---------------------------------------------------------------------------
.global convert2RGB555

convert2RGB555:
    movs    r3, r3, lsr #3              @ r3 = number of 8-pixel groups
    bxeq    lr                          @ nothing to do for count < 8

    stmfd   sp!,{r4-r8,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1              @ lr = 0x1fe: mask for (index * 2)

convert2RGB555_loop:
    ldmia   r1!,{r4,r5}                 @ 8 source pixels

    and     r6, lr, r4, lsl #1
    ldrh    r6, [r2, r6]                @ pixel 0
    and     r7, lr, r4, lsr #7
    ldrh    r7, [r2, r7]                @ pixel 1
    and     r8, lr, r4, lsr #15
    ldrh    r8, [r2, r8]                @ pixel 2
    and     r4, lr, r4, lsr #23
    ldrh    r4, [r2, r4]                @ pixel 3

    orr     r6, r6, r7, lsl #16         @ r6 = pixels 0, 1
    and     r12,lr, r5, lsl #1
    ldrh    r12, [r2, r12]              @ pixel 4
    orr     r7, r8, r4, lsl #16         @ r7 = pixels 2, 3
    and     r8, lr, r5, lsr #7
    ldrh    r8, [r2, r8]                @ pixel 5
    and     r4, lr, r5, lsr #15
    ldrh    r4, [r2, r4]                @ pixel 6
    and     r5, lr, r5, lsr #23
    ldrh    r5, [r2, r5]                @ pixel 7
    orr     r8, r12,r8, lsl #16         @ r8 = pixels 4, 5
    orr     r12,r4, r5, lsl #16         @ r12 = pixels 6, 7

    stmia   r0!,{r6,r7,r8,r12}
    subs    r3, r3, #1
    bne     convert2RGB555_loop

    ldmfd   sp!,{r4-r8,lr}
    bx      lr