@ vim:filetype=armasm\r
\r
-@ assembly "optimized" version of some funtions from draw.c\r
+@ ARM assembly versions of some functions from draw.c\r
@ this is highly specialized, be careful if changing related C code!\r
\r
-@ (c) Copyright 2007, Grazvydas "notaz" Ignotas\r
+@ (c) Copyright 2007-2008, Grazvydas "notaz" Ignotas\r
@ All Rights Reserved\r
\r
.include "port_config.s"\r
.extern rendstatus\r
.extern DrawLineDest\r
.extern DrawStripInterlace\r
+.extern HighCacheS_ptr\r
\r
\r
@ helper\r
.endif\r
ldreqb r4, [r1,#\offs]\r
orrne r4, r3, r4\r
- strneb r4, [r1,#\offs]\r
- tsteq r4, #0x80\r
andeq r4, r4, #0x3f\r
- streqb r4, [r1,#\offs]\r
+ strb r4, [r1,#\offs]\r
.endm\r
\r
@ TileNormShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits\r
.else\r
ands r4, r12, r2\r
.endif\r
- beq 3f\r
+ beq 0f\r
cmp r4, #0xe\r
- beq 2f\r
- bgt 1f\r
- orr r4, r3, r4\r
- strb r4, [r1,#\ofs]\r
- b 3f\r
-1:\r
- ldrb r4, [r1,#\ofs] @ 2ci\r
- orr r4, r4, #0xc0\r
- strb r4, [r1,#\ofs]\r
- b 3f\r
-2:\r
- ldrb r4, [r1,#\ofs] @ 2ci\r
- bic r4, r4, #0xc0\r
- orr r4, r4, #0x80\r
+ ldrgeb r4, [r1,#\ofs]\r
+ orrlt r4, r3, r4 @ normal\r
+\r
+ biceq r4, r4, #0xc0 @ hilight\r
+ orreq r4, r4, #0x80\r
+ orrgt r4, r4, #0xc0 @ shadow\r
+\r
strb r4, [r1,#\ofs]\r
-3:\r
+0:\r
.endm\r
\r
@ TileFlipSh (r1=pdest, r2=pixels8, r3=pal) r4,r7: scratch, r0=sx, r12: register with helper pattern 0xf\r
TileDoShGenPixel 16, 7 @ #0x000f0000\r
.endm\r
\r
+@ TileDoShGenPixel_noop shift, ofs: draw one pixel, ignoring the sh/hi operator colors\r
+@ (r1=pdest, r2=pixels8, r3=pal) r4,r7: scratch, r12: nibble mask (helper pattern 0xf in the other tile macros)\r
+@ with a 0xf mask: nibble values 1-0xd are ORed with r3 and stored to [r1,#\ofs];\r
+@ values 0, 0xe and 0xf leave the destination byte untouched\r
+.macro TileDoShGenPixel_noop shift ofs\r
+.if \shift\r
+ and r4, r12, r2, lsr #\shift\r
+.else\r
+ and r4, r12, r2\r
+.endif\r
+ sub r7, r4, #1 @ r7=pixel-1, so pixel 0 wraps around to 0xffffffff\r
+ cmp r7, #0xd @ unsigned compare: cc only when r7 is 0-0xc\r
+ orrcc r4, r3, r4 @ r7 is 0-0xc here (pixel was 1-0xd): combine with pal\r
+ strccb r4, [r1,#\ofs]\r
+.endm\r
+\r
+@ TileFlipSh_noop: 8 pixels through TileDoShGenPixel_noop (same register usage as that macro)\r
+@ dest offsets 0-7 take the nibbles at bits 16,20,24,28,0,4,8,12 of r2,\r
+@ which is the reverse of the order used by TileNormSh_noop\r
+.macro TileFlipSh_noop\r
+ TileDoShGenPixel_noop 16, 0 @ #0x000f0000\r
+ TileDoShGenPixel_noop 20, 1 @ #0x00f00000\r
+ TileDoShGenPixel_noop 24, 2 @ #0x0f000000\r
+ TileDoShGenPixel_noop 28, 3 @ #0xf0000000\r
+ TileDoShGenPixel_noop 0, 4 @ #0x0000000f\r
+ TileDoShGenPixel_noop 4, 5 @ #0x000000f0\r
+ TileDoShGenPixel_noop 8, 6 @ #0x00000f00\r
+ TileDoShGenPixel_noop 12, 7 @ #0x0000f000\r
+.endm\r
+\r
+@ TileNormSh_noop: 8 pixels through TileDoShGenPixel_noop (same register usage as that macro)\r
+@ dest offsets 0-7 take the nibbles at bits 12,8,4,0,28,24,20,16 of r2\r
+.macro TileNormSh_noop\r
+ TileDoShGenPixel_noop 12, 0 @ #0x0000f000\r
+ TileDoShGenPixel_noop 8, 1 @ #0x00000f00\r
+ TileDoShGenPixel_noop 4, 2 @ #0x000000f0\r
+ TileDoShGenPixel_noop 0, 3 @ #0x0000000f\r
+ TileDoShGenPixel_noop 28, 4 @ #0xf0000000\r
+ TileDoShGenPixel_noop 24, 5 @ #0x0f000000\r
+ TileDoShGenPixel_noop 20, 6 @ #0x00f00000\r
+ TileDoShGenPixel_noop 16, 7 @ #0x000f0000\r
+.endm\r
+\r
+@ TileDoShGenPixel_onlyop_lp shift, ofs: apply only the sh/hi operator pixels (0xe/0xf)\r
+@ (r1=pdest, r2=pixels8) r4,r7: scratch, r12: nibble mask (helper pattern 0xf in the other tile macros)\r
+@ nothing is done for nibble 0 or when the dest byte at [r1,#\ofs] has bit 0x40 clear;\r
+@ nibble 0xe rewrites the dest top two bits to 10, nibble 0xf sets them to 11,\r
+@ any other nibble leaves the dest byte as it was\r
+@ NOTE(review): "_lp" presumably means low priority (used from .dsfc_shadow_lowpri) - confirm\r
+.macro TileDoShGenPixel_onlyop_lp shift ofs\r
+.if \shift\r
+ ands r7, r12, r2, lsr #\shift\r
+.else\r
+ ands r7, r12, r2\r
+.endif\r
+ ldrneb r4, [r1,#\ofs] @ fetch dest only for a nonzero pixel\r
+ tstne r4, #0x40\r
+ beq 0f @ pixel is 0 or dest bit 0x40 is clear: skip\r
+\r
+ cmp r7, #0xe @ flags from here select hilight (eq) / shadow (gt) / no store (lt)\r
+ biceq r4, r4, #0xc0 @ hilight\r
+ orreq r4, r4, #0x80\r
+ orrgt r4, r4, #0xc0 @ shadow\r
+ strgeb r4, [r1,#\ofs] @ written back only for 0xe and 0xf\r
+0:\r
+.endm\r
+\r
+@ TileFlipSh_onlyop_lp: 8 pixels through TileDoShGenPixel_onlyop_lp (same register usage as that macro)\r
+@ dest offsets 0-7 take the nibbles at bits 16,20,24,28,0,4,8,12 of r2,\r
+@ which is the reverse of the order used by TileNormSh_onlyop_lp\r
+.macro TileFlipSh_onlyop_lp\r
+ TileDoShGenPixel_onlyop_lp 16, 0 @ #0x000f0000\r
+ TileDoShGenPixel_onlyop_lp 20, 1 @ #0x00f00000\r
+ TileDoShGenPixel_onlyop_lp 24, 2 @ #0x0f000000\r
+ TileDoShGenPixel_onlyop_lp 28, 3 @ #0xf0000000\r
+ TileDoShGenPixel_onlyop_lp 0, 4 @ #0x0000000f\r
+ TileDoShGenPixel_onlyop_lp 4, 5 @ #0x000000f0\r
+ TileDoShGenPixel_onlyop_lp 8, 6 @ #0x00000f00\r
+ TileDoShGenPixel_onlyop_lp 12, 7 @ #0x0000f000\r
+.endm\r
+\r
+@ TileNormSh_onlyop_lp: 8 pixels through TileDoShGenPixel_onlyop_lp (same register usage as that macro)\r
+@ dest offsets 0-7 take the nibbles at bits 12,8,4,0,28,24,20,16 of r2\r
+.macro TileNormSh_onlyop_lp\r
+ TileDoShGenPixel_onlyop_lp 12, 0 @ #0x0000f000\r
+ TileDoShGenPixel_onlyop_lp 8, 1 @ #0x00000f00\r
+ TileDoShGenPixel_onlyop_lp 4, 2 @ #0x000000f0\r
+ TileDoShGenPixel_onlyop_lp 0, 3 @ #0x0000000f\r
+ TileDoShGenPixel_onlyop_lp 28, 4 @ #0xf0000000\r
+ TileDoShGenPixel_onlyop_lp 24, 5 @ #0x0f000000\r
+ TileDoShGenPixel_onlyop_lp 20, 6 @ #0x00f00000\r
+ TileDoShGenPixel_onlyop_lp 16, 7 @ #0x000f0000\r
+.endm\r
+\r
\r
@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r
\r
b .dtfc_loop\r
\r
.dtfc_shadow_blank:\r
- ldrb r4, [r1] @ 1ci\r
- ldrb r12,[r1,#1]\r
- tst r4, #0x80\r
- andeq r4, r4,#0x3f\r
- streqb r4, [r1]\r
- tst r12,#0x80\r
- ldrb r4, [r1,#2]\r
- andeq r12,r12,#0x3f\r
- streqb r12,[r1,#1]\r
- tst r4, #0x80\r
- ldrb r12,[r1,#3]\r
- andeq r4, r4,#0x3f\r
- streqb r4, [r1,#2]\r
- tst r12,#0x80\r
- ldrb r4, [r1,#4]\r
- andeq r12,r12,#0x3f\r
- streqb r12,[r1,#3]\r
- tst r4, #0x80\r
- ldrb r12,[r1,#5]\r
- andeq r4, r4,#0x3f\r
- streqb r4, [r1,#4]\r
- tst r12,#0x80\r
- ldrb r4, [r1,#6]\r
- andeq r12,r12,#0x3f\r
- streqb r12,[r1,#5]\r
- tst r4, #0x80\r
- ldrb r12,[r1,#7]\r
- andeq r4, r4,#0x3f\r
- streqb r4, [r1,#6]\r
- tst r12,#0x80\r
- andeq r12,r12,#0x3f\r
- streqb r12,[r1,#7]\r
- mov r12, #0xf\r
+@ mask the 8 dest bytes at r1 with 0x3f using halfword accesses; r4,r6: scratch, r1 is advanced\r
+ tst r1, #1 @ is pdest halfword-aligned? (these flags are used up to the final stores)\r
+ ldrneb r4, [r1] @ odd address: the first byte is handled on its own\r
+ mov r6, #0x3f\r
+ and r4, r4, #0x3f\r
+ strneb r4, [r1], #1 @ odd case: r1 is halfword-aligned from here on\r
+ ldrh r4, [r1]\r
+ orr r6, r6, r6, lsl #8 @ r6=0x3f3f\r
+ and r4, r4, r6\r
+ strh r4, [r1], #2\r
+ ldrh r4, [r1]\r
+ and r4, r4, r6\r
+ strh r4, [r1], #2\r
+ ldrh r4, [r1]\r
+ and r4, r4, r6\r
+ strh r4, [r1], #2\r
+ ldrh r4, [r1]\r
+ and r4, r4, r6\r
+ streqh r4, [r1] @ aligned case: the last 2 of the 8 bytes\r
+ strneb r4, [r1] @ odd case: only 1 byte is left, store just the low byte\r
b .dtfc_loop\r
\r
.dtfc_cut_tile:\r
str r2, [r1]\r
\r
add r1, r11,#8\r
- mov r3, #320/4\r
- mov r7, #0x80\r
- orr r7, r7, r7, lsl #8\r
- orr r7, r7, r7, lsl #16\r
+ mov r3, #320/4/4\r
mov r6, #0x3f\r
orr r6, r6, r6, lsl #8\r
orr r6, r6, r6, lsl #16\r
.dtfc_loop_shprep:\r
+ ldmia r1, {r2,r4,r5,r7}\r
subs r3, r3, #1\r
- bmi .dtfc_loop @ done\r
- ldr r2, [r1]\r
- tst r2, r7\r
- andeq r2, r2, r6\r
- streq r2, [r1], #4\r
- beq .dtfc_loop_shprep\r
- tst r2, #0x80000000\r
- biceq r2, r2, #0xc0000000\r
- tst r2, #0x00800000\r
- biceq r2, r2, #0x00c00000\r
- tst r2, #0x00008000\r
- biceq r2, r2, #0x0000c000\r
- tst r2, #0x00000080\r
- biceq r2, r2, #0x000000c0\r
- str r2, [r1], #4\r
- b .dtfc_loop_shprep\r
+ and r2, r2, r6\r
+ and r4, r4, r6\r
+ and r5, r5, r6\r
+ and r7, r7, r6\r
+ stmia r1!,{r2,r4,r5,r7}\r
+ bne .dtfc_loop_shprep\r
+\r
+ mvn r5, #0 @ r5=prevcode=-1\r
+ b .dtfc_loop\r
\r
.pool\r
\r
b .dsfc_inloop\r
\r
.dsfc_shadow:\r
+ tst r9, #0x80000000\r
+ beq .dsfc_shadow_lowpri\r
+\r
cmp r2, r2, ror #4\r
beq .dsfc_singlec_sh\r
\r
TileSingleSh\r
b .dsfc_inloop\r
\r
+@ entered from .dsfc_shadow when bit 31 of r9 is clear:\r
+@ only the operator pixels of the tile line are applied (the *_onlyop_lp macros)\r
+.dsfc_shadow_lowpri:\r
+ tst r9, #0x10000 @ bit 16 of r9 selects the flipped pixel order\r
+ bne .dsfc_TileFlip_sh_lp\r
+\r
+.dsfc_TileNorm_sh_lp:\r
+ TileNormSh_onlyop_lp\r
+ b .dsfc_inloop\r
+\r
+.dsfc_TileFlip_sh_lp:\r
+ TileFlipSh_onlyop_lp\r
+ b .dsfc_inloop\r
+\r
.pool\r
\r
@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r
@ + 0 : hhhhvvvv ab--hhvv yyyyyyyy yyyyyyyy // a: offscreen h, b: offs. v, h: horiz. size\r
@ + 4 : xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8\r
\r
-.global DrawSprite @ unsigned int *sprite, int **hc, int sh, int acc_sprites\r
+.global DrawSprite @ unsigned int *sprite, int sh, int acc_sprites\r
\r
DrawSprite:\r
stmfd sp!, {r4-r9,r11,lr}\r
\r
- orr r8, r3, r2, lsl #4\r
+ orr r8, r2, r1, lsl #4\r
ldr r3, [r0] @ sprite[0]\r
ldr r7, =Scanline\r
mov r6, r3, lsr #28\r
subne r4, r4, #1\r
subne r7, r4, r7 @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y\r
\r
- mov r8, r9, lsl #21\r
- mov r8, r8, lsr #21\r
- add r8, r8, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down\r
- \r
+ add r8, r9, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down\r
tst r9, #0x0800\r
mlane r8, r5, r6, r8 @ if (code&0x0800) { tile+=delta*(width-1);\r
rsbne r5, r5, #0 @ delta=-delta; } // r5=delta now\r
\r
- mov r8, r8, lsl #4\r
+ mov r8, r8, lsl #21\r
+ mov r8, r8, lsr #17\r
and r7, r7, #7\r
add r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address\r
\r
@ decide whether this sprite has to go through the sprite cache:\r
@ high priority, accurate sprites (as), or sh mode with palette bits 0x6000 both set\r
tst r9, #0x8000\r
- bne .dspr_cache @ if(code&0x8000) // high priority - cache it\r
+ tsteq r9, #(1<<27)\r
+ bne .dspr_cache @ if(code&0x8000) || as\r
+ tst r9, #0x4000 @ palette bits are in code (r9); r6 is width-1 here and never has them\r
+ tstne r9, #0x2000\r
+ tstne r9, #(1<<31) @ sh flag\r
+ bne .dspr_cache @ (sh && pal == 0x30)\r
\r
.dspr_continue:\r
@ cache some stuff to avoid mem access\r
TileFlip r12\r
b .dspr_loop\r
\r
+.dspr_singlec_sh:\r
+ cmp r2, #0xe0000000\r
+ bcs .dspr_loop @ operator tileline, ignore\r
+\r
.dspr_SingleColor:\r
and r4, r2, #0xf\r
orr r4, r3, r4\r
\r
@ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
.dspr_TileNorm_sh:\r
- TileNormSh\r
+ TileNormSh_noop\r
b .dspr_loop\r
\r
.dspr_TileFlip_sh:\r
- TileFlipSh\r
- b .dspr_loop\r
-\r
-.dspr_singlec_sh:\r
- cmp r2, #0xe0000000\r
- bcc .dspr_SingleColor @ normal tileline\r
- tst r2, #0x10000000\r
- bne .dspr_sh_sh\r
- TileSingleHi\r
- b .dspr_loop\r
-\r
-.dspr_sh_sh:\r
- TileSingleSh\r
+ TileFlipSh_noop\r
b .dspr_loop\r
\r
\r
.dspr_cache:\r
- @ *(*hc)++ = (tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|((code>>9)&0x30)|((sprite[0]>>24)&0xf);\r
+ @ *HighCacheS_ptr++ = ((code&0x8000)<<16)|(tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|pal|((sprite[0]>>16)&0xf);\r
+ ldr r1, =HighCacheS_ptr\r
mov r4, r8, lsl #16 @ tile\r
tst r9, #0x0800\r
orrne r4, r4, #0x10000 @ code&0x0800\r
and r0, r9, #0x6000\r
orr r4, r4, r0, lsr #9 @ (code>>9)&0x30\r
mov r3, r3, lsl #12\r
- ldr r0, [r1]\r
orr r4, r4, r3, lsr #28 @ bits 19-16 of r3: (sprite[0]>>16)&0xf\r
\r
+ ldr r0, [r1] @ r0=current cache write pointer\r
+ tst r9, #0x8000 @ these flags are also used by teqne/ldmnefd below\r
+ orrne r4, r4, #0x80000000 @ prio\r
+\r
str r4, [r0], #4\r
str r0, [r1]\r
\r
- tst r9, #(1<<27)\r
- ldmeqfd sp!, {r4-r9,r11,lr}\r
- bne .dspr_continue @ draw anyway if accurate sprites enabled\r
- bxeq lr\r
+ and r0, r9, #(1<<27) @ as\r
+ teqne r0, #(1<<27) @ (code&0x8000) && !as\r
+ ldmnefd sp!, {r4-r9,r11,pc} @ high priority without as: cached only, return\r
+ b .dspr_continue @ draw anyway if accurate sprites enabled\r
\r
@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r
\r
\r
ldr r6, =rendstatus\r
ldr lr, =(Pico+0x10000) @ lr=Pico.vram\r
- ldrb r6, [r6]\r
+ ldr r6, [r6]\r
\r
@ fetch the first code now\r
ldrh r7, [lr, r12]\r
\r
@ the first-cell priority shortcut is only valid while bit 1 of rendstatus is clear;\r
@ with the bit set the check must be skipped, which needs a real branch:\r
@ a predicated teq cannot express "bit clear AND prio differs" for the ldmnefd below\r
ands r6, r6, #2 @ we care about bit 1 only\r
orr r6, r6, r2\r
bne .dw_no_sameprio @ bit 1 set: do not apply the same-priority shortcut\r
\r
- cmp r2, r7, lsr #15\r
- ldmnefd sp!, {r4-r11,pc} @ assume that whole window uses same priority\r
+ teq r2, r7, lsr #15 @ do prio bits differ?\r
+ ldmnefd sp!, {r4-r11,pc} @ yes, assume that whole window uses same priority\r
\r
.dw_no_sameprio:\r
orr r6, r6, r3, lsl #8 @ shadow mode\r
\r
sub r8, r1, r0\r
mov r8, r8, lsl #1 @ cells\r
mvn r9, #0 @ r9=prevcode=-1\r
.endif\r
- add r1, r11, r0, lsl #4 @ r1=pdest\r
+ add r1, r11, r0, lsl #4 @ r1=pdest\r
mov r0, #0xf\r
b .dwloop_enter\r
\r
- @ r4,r5 & r7 are scratch in this loop\r
+ @ r4,r5 are scratch in this loop\r
.dwloop:\r
add r1, r1, #8\r
.dwloop_nor1:\r
orreq r3, r3, #0x40\r
beq .dw_shadow_done\r
ldr r4, [r1]\r
- tst r4, #0x00000080\r
- biceq r4, r4, #0x000000c0\r
- tst r4, #0x00008000\r
- biceq r4, r4, #0x0000c000\r
- tst r4, #0x00800000\r
- biceq r4, r4, #0x00c00000\r
- tst r4, #0x80000000\r
- biceq r4, r4, #0xc0000000\r
+ mov r5, #0x3f\r
+ orr r5, r5, r5, lsl #8\r
+ orr r5, r5, r5, lsl #16\r
+ and r4, r4, r5\r
str r4, [r1]\r
ldr r4, [r1,#4]\r
- tst r4, #0x00000080\r
- biceq r4, r4, #0x000000c0\r
- tst r4, #0x00008000\r
- biceq r4, r4, #0x0000c000\r
- tst r4, #0x00800000\r
- biceq r4, r4, #0x00c00000\r
- tst r4, #0x80000000\r
- biceq r4, r4, #0xc0000000\r
+ and r4, r4, r5\r
str r4, [r1,#4]\r
b .dw_shadow_done\r
\r