X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=drivers%2Fgp2x%2Fasmutils.s;h=a91cfaf2b817d7a4308d715547a48ad4ee21439a;hb=21afaa365c97896da6ccbdd6540e20f2d2bb4990;hp=917bb0892f9771a004222b3bc5cff090338f288a;hpb=c0bf6f9f02a2b6afb961a7e9195e2168d7e9cecf;p=fceu.git diff --git a/drivers/gp2x/asmutils.s b/drivers/gp2x/asmutils.s index 917bb08..a91cfaf 100644 --- a/drivers/gp2x/asmutils.s +++ b/drivers/gp2x/asmutils.s @@ -8,6 +8,196 @@ flushcache: mov pc, lr +.global block_or @ void *src, size_t n, int pat + +block_or: + stmfd sp!, {r4-r5} + orr r2, r2, r2, lsl #8 + orr r2, r2, r2, lsl #16 + mov r1, r1, lsr #4 +block_loop_or: + ldmia r0, {r3-r5,r12} + subs r1, r1, #1 + orr r3, r3, r2 + orr r4, r4, r2 + orr r5, r5, r2 + orr r12,r12,r2 + stmia r0!, {r3-r5,r12} + bne block_loop_or + ldmfd sp!, {r4-r5} + bx lr + + +.global block_and @ void *src, size_t n, int andpat + +block_and: + stmfd sp!, {r4-r5} + orr r2, r2, r2, lsl #8 + orr r2, r2, r2, lsl #16 + mov r1, r1, lsr #4 +block_loop_and: + ldmia r0, {r3-r5,r12} + subs r1, r1, #1 + and r3, r3, r2 + and r4, r4, r2 + and r5, r5, r2 + and r12,r12,r2 + stmia r0!, {r3-r5,r12} + bne block_loop_and + ldmfd sp!, {r4-r5} + bx lr + + +.global block_andor @ void *src, size_t n, int andpat, int orpat + +block_andor: + stmfd sp!, {r4-r6} + orr r2, r2, r2, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #8 + orr r3, r3, r3, lsl #16 + mov r1, r1, lsr #4 +block_loop_andor: + ldmia r0, {r4-r6,r12} + subs r1, r1, #1 + and r4, r4, r2 + orr r4, r4, r3 + and r5, r5, r2 + orr r5, r5, r3 + and r6, r6, r2 + orr r6, r6, r3 + and r12,r12,r2 + orr r12,r12,r3 + stmia r0!, {r4-r6,r12} + bne block_loop_andor + ldmfd sp!, {r4-r6} + bx lr + + +.global spend_cycles @ c + +spend_cycles: + mov r0, r0, lsr #2 @ 4 cycles/iteration + sub r0, r0, #2 @ entry/exit/init +.sc_loop: + subs r0, r0, #1 + bpl .sc_loop + + bx lr + + +.global memset32 @ int *dest, int c, int count + +memset32: + stmfd sp!, {lr} + + mov r3, r1 + subs r2, r2, #4 + bmi mst32_fin + + mov r12,r1 + mov lr, r1 + +mst32_loop: + subs r2, r2, #4 + stmia r0!, {r1,r3,r12,lr} + bpl mst32_loop + +mst32_fin: + tst r2, #1 + strne r1, [r0], #4 + + tst r2, #2 + stmneia r0!, {r1,r3} + + ldmfd sp!, {lr} + bx lr + + + +.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines + +soft_scale: + stmfd sp!,{r4-r11,lr} + mov lr, #0xff + mov lr, lr, lsl #1 + mov r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007 + orr r9, r9, #0x00e7 + + mov r11,r3 @ r11= line counter + mov r3, r1 @ r3 = pal base + + mov r12,#320 + mul r2, r12,r2 + add r4, r0, r2, lsl #1 @ r4 = dst_start + add r5, r0, r2 @ r5 = src_start + mul r12,r11,r12 + add r0, r4, r12,lsl #1 @ r0 = dst_end + add r1, r5, r12 @ r1 = src_end + +soft_scale_loop: + sub r1, r1, #64 @ skip borders + mov r2, #256/8 + +soft_scale_loop_line: + ldr r12, [r1, #-8]! + ldr r7, [r1, #4] + + and r4, lr, r12,lsl #1 + ldrh r4, [r3, r4] + and r5, lr, r12,lsr #7 + ldrh r5, [r3, r5] + and r4, r4, r9, lsl #2 + orr r4, r4, r4, lsl #14 @ r4[31:16] = 1/4 pix_s 0 + and r5, r5, r9, lsl #2 + sub r6, r5, r5, lsr #2 @ r6 = 3/4 pix_s 1 + add r4, r4, r6, lsl #16 @ pix_d 0, 1 + and r6, lr, r12,lsr #15 + ldrh r6, [r3, r6] + and r12,lr, r12,lsr #23 + ldrh r12,[r3, r12] + and r6, r6, r9, lsl #2 + add r5, r5, r6 + mov r5, r5, lsr #1 + sub r6, r6, r6, lsr #2 @ r6 = 3/4 pix_s 2 + orr r5, r5, r6, lsl #16 + + and r6, lr, r7, lsl #1 + ldrh r6, [r3, r6] + and r12,r12,r9, lsl #2 + add r5, r5, r12,lsl #14 @ pix_d 2, 3 + and r6, r6, r9, lsl #2 + orr r6, r12,r6, lsl #16 @ pix_d 4, 5 + + and r12,lr, r7, lsr #7 + ldrh r12,[r3, r12] + and r10,lr, r7, lsr #15 + ldrh r10,[r3, r10] + and r12,r12,r9, lsl #2 + sub r8, r12,r12,lsr #2 @ r8 = 3/4 pix_s 1 + add r8, r8, r6, lsr #18 + and r7, lr, r7, lsr #23 + ldrh r7, [r3, r7] + and r10,r10,r9, lsl #2 + orr r8, r8, r10,lsl #15 + add r8, r8, r12,lsl #15 @ pix_d 6, 7 + sub r10,r10,r10,lsr #2 @ r10= 3/4 pix_s 2 + and r7, r7, r9, lsl #2 + add r10,r10,r7, lsr #2 @ += 1/4 pix_s 3 + orr r10,r10,r7, lsl #16 @ pix_d 8, 9 + + subs r2, r2, #1 + + stmdb r0!, {r4,r5,r6,r8,r10} + bne soft_scale_loop_line + + subs r11,r11,#1 + bne soft_scale_loop + + ldmfd sp!,{r4-r11,lr} + bx lr + + /* buggy and slow, probably because function call overhead @ renderer helper, based on bitbank's method .global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal