X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=Pico%2FDraw.s;h=f31025362dfd6b5e721d6e2ec2c035959acb48f6;hb=d95259bdaaf911218656d8a74b096ff7306034f6;hp=e31ba0baa6911ad14782bf7823d97d4b6051c6af;hpb=83c093a48ab58670ea82d0ec81658daa9f9b950a;p=picodrive.git

diff --git a/Pico/Draw.s b/Pico/Draw.s
index e31ba0ba..f3102536 100644
--- a/Pico/Draw.s
+++ b/Pico/Draw.s
@@ -1,11 +1,12 @@
 @ vim:filetype=armasm
 
-@ assembly "optimized" version of some funtions from draw.c
+@ ARM assembly versions of some functions from draw.c
 @ this is highly specialized, be careful if changing related C code!
 
-@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
+@ (c) Copyright 2007-2008, Grazvydas "notaz" Ignotas
 @ All Rights Reserved
 
+.include "port_config.s"
 
 .extern Pico
 .extern PicoOpt
@@ -15,6 +16,7 @@
 .extern rendstatus
 .extern DrawLineDest
 .extern DrawStripInterlace
+.extern HighCacheS_ptr
 
 
 @ helper
@@ -63,10 +65,8 @@
 .endif
     ldreqb  r4, [r1,#\offs]
     orrne   r4, r3, r4
-    strneb  r4, [r1,#\offs]
-    tsteq   r4, #0x80
     andeq   r4, r4, #0x3f
-    streqb  r4, [r1,#\offs]
+    strb    r4, [r1,#\offs]
 .endm
 
 @ TileNormShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits
@@ -154,24 +154,17 @@
 .else
     ands    r4, r12, r2
 .endif
-    beq     3f
+    beq     0f
     cmp     r4, #0xe
-    beq     2f
-    bgt     1f
-    orr     r4, r3, r4
-    strb    r4, [r1,#\ofs]
-    b       3f
-1:
-    ldrb    r4, [r1,#\ofs]        @ 2ci
-    orr     r4, r4, #0xc0
-    strb    r4, [r1,#\ofs]
-    b       3f
-2:
-    ldrb    r4, [r1,#\ofs]        @ 2ci
-    bic     r4, r4, #0xc0
-    orr     r4, r4, #0x80
+    ldrgeb  r4, [r1,#\ofs]
+    orrlt   r4, r3, r4            @ normal
+
+    biceq   r4, r4, #0xc0         @ hilight
+    orreq   r4, r4, #0x80
+    orrgt   r4, r4, #0xc0         @ shadow
+
     strb    r4, [r1,#\ofs]
-3:
+0:
 .endm
 
 @ TileFlipSh (r1=pdest, r2=pixels8, r3=pal) r4,r7: scratch, r0=sx, r12: register with helper pattern 0xf
@@ -198,6 +191,80 @@
     TileDoShGenPixel 16,  7 @ #0x000f0000
 .endm
 
+.macro TileDoShGenPixel_noop shift ofs  @ draw one pixel, colors 0xe/0xf are ignored, clobbers r4, r7 and flags
+.if \shift
+    and     r4, r12, r2, lsr #\shift    @ r4 = 4-bit pixel picked out of r2 (r12 holds the nibble mask)
+.else
+    and     r4, r12, r2                 @ r4 = lowest 4-bit pixel of r2 (r12 holds the nibble mask)
+.endif
+    sub     r7, r4, #1           @ pixel 0 wraps around to 0xffffffff
+    cmp     r7, #0xd             @ unsigned compare, carry is clear only for pixel 1..0xd
+    orrcc   r4, r3, r4           @ 0-0xc (was 1-0xd)
+    strccb  r4, [r1,#\ofs]       @ pixel 0, 0xe and 0xf leave the destination byte untouched
+.endm
+
+.macro TileFlipSh_noop          @ 8 pixels via TileDoShGenPixel_noop, nibbles 16,20,24,28,0,4,8,12 go to offsets 0-7
+    TileDoShGenPixel_noop 16,  0 @ #0x000f0000
+    TileDoShGenPixel_noop 20,  1 @ #0x00f00000
+    TileDoShGenPixel_noop 24,  2 @ #0x0f000000
+    TileDoShGenPixel_noop 28,  3 @ #0xf0000000
+    TileDoShGenPixel_noop  0,  4 @ #0x0000000f
+    TileDoShGenPixel_noop  4,  5 @ #0x000000f0
+    TileDoShGenPixel_noop  8,  6 @ #0x00000f00
+    TileDoShGenPixel_noop 12,  7 @ #0x0000f000
+.endm
+
+.macro TileNormSh_noop          @ 8 pixels via TileDoShGenPixel_noop, nibbles 12,8,4,0,28,24,20,16 go to offsets 0-7
+    TileDoShGenPixel_noop 12,  0 @ #0x0000f000
+    TileDoShGenPixel_noop  8,  1 @ #0x00000f00
+    TileDoShGenPixel_noop  4,  2 @ #0x000000f0
+    TileDoShGenPixel_noop  0,  3 @ #0x0000000f
+    TileDoShGenPixel_noop 28,  4 @ #0xf0000000
+    TileDoShGenPixel_noop 24,  5 @ #0x0f000000
+    TileDoShGenPixel_noop 20,  6 @ #0x00f00000
+    TileDoShGenPixel_noop 16,  7 @ #0x000f0000
+.endm
+
+.macro TileDoShGenPixel_onlyop_lp shift ofs  @ one pixel, only colors 0xe/0xf act, and only on dest bytes with bit 6 set, clobbers r4, r7 and flags
+.if \shift
+    ands    r7, r12, r2, lsr #\shift  @ r7 = 4-bit pixel picked out of r2, Z set when it is 0
+.else
+    ands    r7, r12, r2               @ r7 = lowest 4-bit pixel of r2, Z set when it is 0
+.endif
+    ldrneb  r4, [r1,#\ofs]            @ nonzero pixel: r4 = current destination byte
+    tstne   r4, #0x40                 @ ... and check bit 6 of that byte
+    beq     0f                        @ skip when the pixel is 0 or destination bit 6 is clear
+
+    cmp     r7, #0xe                  @ below 0xe nothing is written (all later ops are eq/gt/ge)
+    biceq   r4, r4, #0xc0         @ hilight
+    orreq   r4, r4, #0x80             @ pixel 0xe: top two bits of the byte become 10
+    orrgt   r4, r4, #0xc0         @ shadow
+    strgeb  r4, [r1,#\ofs]            @ write back only for pixel 0xe and above
+0:
+.endm
+
+.macro TileFlipSh_onlyop_lp     @ 8 pixels via TileDoShGenPixel_onlyop_lp, nibbles 16,20,24,28,0,4,8,12 go to offsets 0-7
+    TileDoShGenPixel_onlyop_lp 16,  0 @ #0x000f0000
+    TileDoShGenPixel_onlyop_lp 20,  1 @ #0x00f00000
+    TileDoShGenPixel_onlyop_lp 24,  2 @ #0x0f000000
+    TileDoShGenPixel_onlyop_lp 28,  3 @ #0xf0000000
+    TileDoShGenPixel_onlyop_lp  0,  4 @ #0x0000000f
+    TileDoShGenPixel_onlyop_lp  4,  5 @ #0x000000f0
+    TileDoShGenPixel_onlyop_lp  8,  6 @ #0x00000f00
+    TileDoShGenPixel_onlyop_lp 12,  7 @ #0x0000f000
+.endm
+
+.macro TileNormSh_onlyop_lp     @ 8 pixels via TileDoShGenPixel_onlyop_lp, nibbles 12,8,4,0,28,24,20,16 go to offsets 0-7
+    TileDoShGenPixel_onlyop_lp 12,  0 @ #0x0000f000
+    TileDoShGenPixel_onlyop_lp  8,  1 @ #0x00000f00
+    TileDoShGenPixel_onlyop_lp  4,  2 @ #0x000000f0
+    TileDoShGenPixel_onlyop_lp  0,  3 @ #0x0000000f
+    TileDoShGenPixel_onlyop_lp 28,  4 @ #0xf0000000
+    TileDoShGenPixel_onlyop_lp 24,  5 @ #0x0f000000
+    TileDoShGenPixel_onlyop_lp 20,  6 @ #0x00f00000
+    TileDoShGenPixel_onlyop_lp 16,  7 @ #0x000f0000
+.endm
+
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
@@ -317,11 +384,18 @@ DrawLayer:
     add     r8, r8, r9, lsr #8   @ tilex+=cellskip
     add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;
     sub     r10,r10,r9, lsl #16  @ cells-=cellskip
-    mvn     r9, #0               @ r9=prevcode=-1
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r0, #0xf
+    ldr     r11,[r11]
+.else
     ldr     r11,=HighCol
     mov     r0, #0xf
+.endif
+
+    mvn     r9, #0               @ r9=prevcode=-1
     add     r1, r11, r7         @ r1=pdest
 
 
@@ -436,8 +510,8 @@ DrawLayer:
 .DrawStrip_vsscroll:
     rsb     r8, r3, #0
     mov     r8, r8, lsr #3        @ r8=tilex=(-ts->hscroll)>>3
-    bic     r8, r8, #0xff000000
-    orr     r8, r8, r5, lsl #25   @ r8=(xmask[31:25]|had_output[24]|tilex[23:0])
+    bic     r8, r8, #0x3fc00000
+    orr     r8, r8, r5, lsl #25   @ r8=(xmask[31:25]|had_output[24]|tilex[21:0])
 
     ldr     r4, =Scanline
     orr     r5, r1, r10, lsl #24
@@ -463,13 +537,19 @@ DrawLayer:
     add     r8, r8, r9, lsr #8   @ tilex+=cellskip
     add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;
     add     r10,r10,r9, lsl #16  @ cell+=cellskip
-    mvn     r9, #0               @ r9=prevcode=-1
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
     ldr     r11,=HighCol
     mov     r0, #0xf
-    add     r1, r11, r7         @ r1=pdest
+    ldr     r11,[r11]
+.else
+    ldr     r11,=HighCol
+    mov     r0, #0xf
+.endif
 
+    mvn     r9, #0               @ r9=prevcode=-1
+    add     r1, r11, r7          @ r1=pdest
 
     @ r4 & r7 are scratch in this loop
 .dsloop_vs_subr1:
@@ -490,9 +570,9 @@ DrawLayer:
     ldrh    r7, [r7]              @ r7=vscroll
 
     bic     r10,r10,#0xff         @ clear old ty
-    and     r4, r5, #0xff0000
-    add     r4, r4, r7, lsl #16
-    and     r4, r4, r5, lsl #16   @ r4=line<<16
+    and     r4, r5, #0xff0000     @ scanline
+    add     r4, r4, r7, lsl #16   @ ... += vscroll
+    and     r4, r4, r5, lsl #16   @ ... &= ymask
     and     r7, r4, #0x70000
     orr     r10,r10,r7, lsr #15   @ new ty
 
@@ -515,7 +595,7 @@ DrawLayer:
     beq     .DrawStrip_vs_samecode @ we know stuff about this tile already
 
     mov     r9, r7          @ remember code
-    orr     r8, r8, #1<<24  @ seen non hi-prio tile
+    orr     r8, r8, #(1<<24)@ seen non hi-prio tile
 
     movs    r2, r9, lsl #20 @ if (code&0x1000)
     mov     r2, r2, lsl #1
@@ -589,7 +669,7 @@ DrawLayer:
     b       .dsloop_vs
 
 .dsloop_vs_exit:
-    tst     r8, #1<<24 @ seen non hi-prio tile
+    tst     r8, #(1<<24) @ seen non hi-prio tile
     ldreq   r1, =rendstatus
     mov     r0, #0
     ldreq   r2, [r1]
@@ -639,10 +719,18 @@ DrawLayer:
 BackFill:
     stmfd   sp!, {r4-r9,lr}
 
+.if OVERRIDE_HIGHCOL
+    ldr     lr, =HighCol
+    mov     r0, r0, lsl #26
+    ldr     lr, [lr]
+    mov     r0, r0, lsr #26
+    add     lr, lr, #8
+.else
     ldr     lr, =(HighCol+8)
-
     mov     r0, r0, lsl #26
     mov     r0, r0, lsr #26
+.endif
+
     orr     r0, r0, r1, lsl #6
     orr     r0, r0, r0, lsl #8
     orr     r0, r0, r0, lsl #16
@@ -680,9 +768,15 @@ DrawTilesFromCache:
     stmfd   sp!, {r4-r8,r11,lr}
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
     ldr     r11,=HighCol
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
     mov     r12,#0xf
+    ldr     r11,[r11]
+.else
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+.endif
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
 
     mvn     r5, #0         @ r5=prevcode=-1
     ands    r8, r1, #1
@@ -770,39 +864,25 @@ DrawTilesFromCache:
     b       .dtfc_loop
 
 .dtfc_shadow_blank:
-    ldrb    r4, [r1]        @ 1ci
-    ldrb    r12,[r1,#1]
-    tst     r4, #0x80
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#2]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#1]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#3]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#2]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#4]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#3]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#5]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#4]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#6]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#5]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#7]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#6]
-    tst     r12,#0x80
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#7]
-    mov     r12, #0xf
+    tst     r1, #1
+    ldrneb  r4, [r1]
+    mov     r6, #0x3f
+    and     r4, r4, #0x3f
+    strneb  r4, [r1], #1
+    ldrh    r4, [r1]
+    orr     r6, r6, r6, lsl #8
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    streqh  r4, [r1]
+    strneb  r4, [r1]
     b       .dtfc_loop
 
 .dtfc_cut_tile:
@@ -837,48 +917,45 @@ DrawTilesFromCache:
     str     r2, [r1]
 
     add     r1, r11,#8
-    mov     r3, #320/4
-    mov     r7, #0x80
-    orr     r7, r7, r7, lsl #8
-    orr     r7, r7, r7, lsl #16
+    mov     r3, #320/4/4
     mov     r6, #0x3f
     orr     r6, r6, r6, lsl #8
     orr     r6, r6, r6, lsl #16
 .dtfc_loop_shprep:
+    ldmia   r1, {r2,r4,r5,r7}
     subs    r3, r3, #1
-    bmi     .dtfc_loop      @ done
-    ldr     r2, [r1]
-    tst     r2, r7
-    andeq   r2, r2, r6
-    streq   r2, [r1], #4
-    beq     .dtfc_loop_shprep
-    tst     r2,     #0x80000000
-    biceq   r2, r2, #0xc0000000
-    tst     r2,     #0x00800000
-    biceq   r2, r2, #0x00c00000
-    tst     r2,     #0x00008000
-    biceq   r2, r2, #0x0000c000
-    tst     r2,     #0x00000080
-    biceq   r2, r2, #0x000000c0
-    str     r2, [r1], #4
-    b       .dtfc_loop_shprep
+    and     r2, r2, r6
+    and     r4, r4, r6
+    and     r5, r5, r6
+    and     r7, r7, r6
+    stmia   r1!,{r2,r4,r5,r7}
+    bne     .dtfc_loop_shprep
+
+    mvn     r5, #0         @ r5=prevcode=-1
+    b       .dtfc_loop
 
 .pool
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 
-.global DrawSpritesFromCache @ int *hc, int sh
+.global DrawSpritesFromCache @ int *hc, int maxwidth, int prio, int sh
 
 DrawSpritesFromCache:
     stmfd   sp!, {r4-r11,lr}
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+    ldr     r11,[r11]
+.else
     ldr     r11,=HighCol
+    mov     r12,#0xf
+.endif
     ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
-    mov     r6, r1, lsl #31
+    mov     r6, r3, lsl #31
     orr     r6, r6, #1<<30
-    mov     r12,#0xf
 
     mov     r10, r0
 
@@ -964,6 +1041,9 @@ DrawSpritesFromCache:
     b       .dsfc_inloop
 
 .dsfc_shadow:
+    tst     r9, #0x80000000
+    beq     .dsfc_shadow_lowpri
+
     cmp     r2, r2, ror #4
     beq     .dsfc_singlec_sh
 
@@ -991,6 +1071,18 @@ DrawSpritesFromCache:
     TileSingleSh
     b       .dsfc_inloop
 
+.dsfc_shadow_lowpri:
+    tst     r9, #0x10000
+    bne     .dsfc_TileFlip_sh_lp
+
+.dsfc_TileNorm_sh_lp:
+    TileNormSh_onlyop_lp
+    b       .dsfc_inloop
+
+.dsfc_TileFlip_sh_lp:
+    TileFlipSh_onlyop_lp
+    b       .dsfc_inloop
+
 .pool
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ -998,11 +1090,12 @@ DrawSpritesFromCache:
 @ + 0  :    hhhhvvvv ab--hhvv yyyyyyyy yyyyyyyy // a: offscreen h, b: offs. v, h: horiz. size
 @ + 4  :    xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8
 
-.global DrawSprite @ unsigned int *sprite, int **hc, int sh
+.global DrawSprite @ unsigned int *sprite, int sh, int acc_sprites
 
 DrawSprite:
     stmfd   sp!, {r4-r9,r11,lr}
 
+    orr     r8, r2, r1, lsl #4
     ldr     r3, [r0]        @ sprite[0]
     ldr     r7, =Scanline
     mov     r6, r3, lsr #28
@@ -1016,44 +1109,53 @@ DrawSprite:
     ldr     r9, [r0, #4]
     sub     r7, r7, r4, asr #16 @ r7=row=Scanline-sy
 
-    tst     r2, r2
     mov     r2, r9, asr #16 @ r2=sx
-    bic     r9, r9, #0xfe000000
-    orrne   r9, r9, #1<<31  @ r9=code|(sh<<31)
+    mov     r9, r9, lsl #16
+    mov     r9, r9, lsr #16
+    orr     r9, r9, r8, lsl #27 @ r9=code|sh[31]|as[27]
 
     tst     r9, #0x1000
     movne   r4, r5, lsl #3
     subne   r4, r4, #1
     subne   r7, r4, r7      @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y
 
-    mov     r8, r9, lsl #21
-    mov     r8, r8, lsr #21
-    add     r8, r8, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down
-    
+    add     r8, r9, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down
     tst     r9, #0x0800
     mlane   r8, r5, r6, r8  @ if (code&0x0800) { tile+=delta*(width-1);
     rsbne   r5, r5, #0      @ delta=-delta; } // r5=delta now
 
-    mov     r8, r8, lsl #4
+    mov     r8, r8, lsl #21
+    mov     r8, r8, lsr #17
     and     r7, r7, #7
     add     r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address
 
     tst     r9, #0x8000
-    bne     .dspr_cache       @ if(code&0x8000) // high priority - cache it
-
+    tsteq   r9, #(1<<27)      @ not high priority: look at the acc_sprites flag instead
+    bne     .dspr_cache       @ if(code&0x8000) || as
+    tst     r9, #0x4000       @ palette select is code bits 0x6000 and the code lives in r9
+    tstne   r9, #0x2000       @ (r6 holds width-1 here, it never carries these bits)
+    tstne   r9, #(1<<31)      @ both palette bits set: also require shadow/hilight mode
+    bne     .dspr_cache       @ (sh && pal == 0x30)
+
+.dspr_continue:
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
     ldr     r11,=HighCol
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
     mov     r12,#0xf
+    ldr     r11,[r11]
+.else
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+.endif
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
 
     mov     r5, r5, lsl #4     @ delta<<=4; // Delta of address
     and     r4, r9, #0x6000
     orr     r9, r9, r4, lsl #16
-    orr     r9, r9, #0x10000000 @ r9=scc1 ???? ... <code> (s=shadow/hilight, cc=pal)
+    orrs    r9, r9, #0x10000000 @ r9=scc1 a??? ... <code> (s=shadow/hilight, cc=pal, a=acc_spr)
 
-    tst     r9, #1<<31
     mov     r3, r4, lsr #9     @ r3=pal=((code>>9)&0x30);
-    orrne   r3, r3, #0x40      @ shadow by default
+    orrmi   r3, r3, #0x40      @ shadow by default
 
     add     r6, r6, #1         @ inc now
     adds    r0, r2, #0         @ mov sx to r0 and set ZV flags
@@ -1097,6 +1199,10 @@ DrawSprite:
     TileFlip r12
     b       .dspr_loop
 
+.dspr_singlec_sh:
+    cmp     r2, #0xe0000000
+    bcs     .dspr_loop          @ operator tileline, ignore
+
 .dspr_SingleColor:
     and     r4, r2, #0xf
     orr     r4, r3, r4
@@ -1119,44 +1225,38 @@ DrawSprite:
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
 .dspr_TileNorm_sh:
-    TileNormSh
+    TileNormSh_noop
     b       .dspr_loop
 
 .dspr_TileFlip_sh:
-    TileFlipSh
-    b       .dspr_loop
-
-.dspr_singlec_sh:
-    cmp     r2, #0xe0000000
-    bcc     .dspr_SingleColor   @ normal tileline
-    tst     r2, #0x10000000
-    bne     .dspr_sh_sh
-    TileSingleHi
-    b       .dspr_loop
-
-.dspr_sh_sh:
-    TileSingleSh
+    TileFlipSh_noop
     b       .dspr_loop
 
 
 .dspr_cache:
-    @ *(*hc)++ = (tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|((code>>9)&0x30)|((sprite[0]>>24)&0xf);
+    @ *HighCacheS_ptr++ = ((code&0x8000)<<16)|(tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|pal|((sprite[0]>>16)&0xf);
+    ldr     r1, =HighCacheS_ptr
     mov     r4, r8, lsl #16     @ tile
     tst     r9, #0x0800
     orrne   r4, r4, #0x10000    @ code&0x0800
-    mov     r2, r2, lsl #22
-    orr     r4, r4, r2, lsr #16 @ (sx<<6)&0x0000ffc0
-    and     r2, r9, #0x6000
-    orr     r4, r4, r2, lsr #9  @ (code>>9)&0x30
+    mov     r0, r2, lsl #22
+    orr     r4, r4, r0, lsr #16 @ (sx<<6)&0x0000ffc0
+    and     r0, r9, #0x6000
+    orr     r4, r4, r0, lsr #9  @ (code>>9)&0x30
     mov     r3, r3, lsl #12
-    ldr     r2, [r1]
     orr     r4, r4, r3, lsr #28 @ (sprite[0]>>24)&0xf
 
-    str     r4, [r2], #4
-    str     r2, [r1]
+    ldr     r0, [r1]
+    tst     r9, #0x8000
+    orrne   r4, r4, #0x80000000 @ prio
 
-    ldmfd   sp!, {r4-r9,r11,lr}
-    bx      lr
+    str     r4, [r0], #4
+    str     r0, [r1]
+
+    and     r0, r9, #(1<<27)    @ as
+    teqne   r0,     #(1<<27)    @ (code&0x8000) && !as
+    ldmnefd sp!, {r4-r9,r11,pc}
+    b       .dspr_continue      @ draw anyway if accurate sprites enabled
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
@@ -1186,33 +1286,39 @@ DrawWindow:
 
     ldr     r6, =rendstatus
     ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
-    ldrb    r6, [r6]
+    ldr     r6, [r6]
 
     @ fetch the first code now
     ldrh    r7, [lr, r12]
 
     ands    r6, r6, #2            @ we care about bit 1 only
     orr     r6, r6, r2
-    bne     .dw_no_sameprio
 
-    cmp     r2, r7, lsr #15
-    ldmnefd sp!, {r4-r11,pc}      @ assume that whole window uses same priority
+    eoreq   r8, r2, r7, lsr #15   @ bit 1 clear: r8 = prio ^ prio bit of first tile (r8 is free, it is set again below)
+    cmpeq   r8, #1                @ do prio bits differ? (assumes prio in r2 is 0 or 1, skipped when bit 1 is set)
+    ldmeqfd sp!, {r4-r11,pc}      @ yes, assume that whole window uses same priority
 
-.dw_no_sameprio:
     orr     r6, r6, r3, lsl #8    @ shadow mode
 
     sub     r8, r1, r0
-    mov     r8, r8, lsl #1        @ cells
-
-    mvn     r9, #0                @ r9=prevcode=-1
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r8, r8, lsl #1        @ cells
+    ldr     r11,[r11]
+    mvn     r9, #0                @ r9=prevcode=-1
+    add     r11,r11,#8
+.else
     ldr     r11,=(HighCol+8)
-    add     r1, r11, r0, lsl #4 @ r1=pdest
+    mov     r8, r8, lsl #1        @ cells
+    mvn     r9, #0                @ r9=prevcode=-1
+.endif
+    add     r1, r11, r0, lsl #4   @ r1=pdest
     mov     r0, #0xf
     b       .dwloop_enter
 
-    @ r4,r5 & r7 are scratch in this loop
+    @ r4,r5 are scratch in this loop
 .dwloop:
     add     r1, r1, #8
 .dwloop_nor1:
@@ -1278,24 +1383,13 @@ DrawWindow:
     orreq   r3, r3, #0x40
     beq     .dw_shadow_done
     ldr     r4, [r1]
-    tst     r4, #0x00000080
-    biceq   r4, r4, #0x000000c0
-    tst     r4, #0x00008000
-    biceq   r4, r4, #0x0000c000
-    tst     r4, #0x00800000
-    biceq   r4, r4, #0x00c00000
-    tst     r4, #0x80000000
-    biceq   r4, r4, #0xc0000000
+    mov     r5, #0x3f
+    orr     r5, r5, r5, lsl #8
+    orr     r5, r5, r5, lsl #16
+    and     r4, r4, r5
     str     r4, [r1]
     ldr     r4, [r1,#4]
-    tst     r4, #0x00000080
-    biceq   r4, r4, #0x000000c0
-    tst     r4, #0x00008000
-    biceq   r4, r4, #0x0000c000
-    tst     r4, #0x00800000
-    biceq   r4, r4, #0x00c00000
-    tst     r4, #0x80000000
-    biceq   r4, r4, #0xc0000000
+    and     r4, r4, r5
     str     r4, [r1,#4]
     b       .dw_shadow_done
 
@@ -1405,15 +1499,29 @@ FinalizeLineBGR444:
     bne     .fl_loopcpBGR444_hi
 
     sub     r3, r4, #0x40*3*2
+    mov     r6, #1
 
 
 .fl_noshBGR444:
-    ldr     r1, =(HighCol+8)
+    ldr     r12,=rendstatus
+    eors    r6, r6, #1          @ sh is 0
+    ldr     r12,[r12]
     mov     lr, #0xff
+    tstne   r12,#(1<<2)         @ and PDRAW_ACC_SPRITES
+
+.if OVERRIDE_HIGHCOL
+    ldr     r1, =HighCol
+    movne   lr, #0x3f
+    ldr     r1, [r1]
     mov     lr, lr, lsl #1
+    add     r1, r1, #8
+.else
+    ldr     r1, =(HighCol+8)
+    movne   lr, #0x3f
+    mov     lr, lr, lsl #1
+.endif
 
 .fl_loopBGR444:
-
     ldr     r12, [r1], #4
     subs    r2, r2, #1
 
@@ -1423,11 +1531,10 @@ FinalizeLineBGR444:
     ldrh    r5, [r3, r5]
     and     r6, lr, r12, lsr #15
     ldrh    r6, [r3, r6]
+    and     r12,lr, r12, lsr #23
+    ldrh    r12,[r3, r12]              @ 1c.i.
     orr     r4, r4, r5, lsl #16
-
-    and     r5, lr, r12, lsr #23
-    ldrh    r5, [r3, r5]              @ 2c.i.
-    orr     r5, r6, r5, lsl #16
+    orr     r5, r6, r12,lsl #16
 
     stmia   r0!, {r4,r5}
     bne     .fl_loopBGR444
@@ -1480,14 +1587,16 @@ FinalizeLineBGR444:
     orr     \reg, \reg, r3           @ add blue back
 .endm
 
+.global vidConvCpyRGB565
+
 vidConvCpyRGB565: @ void *to, void *from, int pixels
     stmfd   sp!, {r4-r9,lr}
 
-    mov     r12, r2, lsr #3 @ repeats
+    mov     r12, r2, lsr #3  @ repeats
     mov     lr, #0x001c0000
     orr     lr, lr,  #0x01c  @ lr == pattern 0x001c001c
     mov     r8, #0x00030000
-    orr     r8, r8,  #0x003  @ lr == pattern 0x001c001c
+    orr     r8, r8,  #0x003
 
 .loopRGB565:
     ldmia   r1!, {r4-r7}
@@ -1560,14 +1669,29 @@ FinalizeLineRGB555:
     bne     .fl_loopcpRGB555_hi
 
     sub     r3, r3, #0x40*2
+    mov     r6, #1
 
 .fl_noshRGB555:
+    ldr     r12,=rendstatus
+    eors    r6, r6, #1          @ sh is 0
+    ldr     r12,[r12]
+    mov     lr, #0xff
+    tstne   r12,#(1<<2)         @ and PDRAW_ACC_SPRITES
+    movne   lr, #0x3f
+
+.if OVERRIDE_HIGHCOL
+    ldr     r1, =HighCol
+    ldr     r0, =DrawLineDest
+    ldr     r1, [r1]
+    ldr     r0, [r0]
+    add     r1, r1, #8
+.else
     ldr     r0, =DrawLineDest
     ldr     r1, =(HighCol+8)
     ldr     r0, [r0]
+.endif
 
     ldrb    r12, [r8, #12]
-    mov     lr, #0xff
     mov     lr, lr, lsl #1
 
     tst     r12, #1
@@ -1582,8 +1706,14 @@ FinalizeLineRGB555:
     addeq   r0, r0, #32*2
 
 .fl_no32colRGB555:
-.fl_loopRGB555:
 
+.if UNALIGNED_DRAWLINEDEST
+    @ this is basically for Gizmondo, which has unaligned odd lines in the framebuffer
+    tst     r0, #2
+    bne     .fl_RGB555u
+.endif
+
+.fl_loopRGB555:
     ldr     r12, [r1], #4
     ldr     r7,  [r1], #4
 
@@ -1605,12 +1735,12 @@ FinalizeLineRGB555:
     ldrh    r6, [r3, r6]
     and     r12,lr, r7, lsr #15
     ldrh    r12,[r3, r12]
+    and     r7, lr, r7, lsr #23
+    ldrh    r7, [r3, r7]
     orr     r8, r8, r6, lsl #16
 
-    and     r6, lr, r7, lsr #23
-    ldrh    r6, [r3, r6]             @ 1 cycle interlock here (r6)
     subs    r2, r2, #1
-    orr     r12,r12, r6, lsl #16
+    orr     r12,r12, r7, lsl #16
 
     stmia   r0!, {r4,r5,r8,r12}
     bne     .fl_loopRGB555
@@ -1624,6 +1754,11 @@ FinalizeLineRGB555:
     mov     r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
     orr     r9, r9, #0x00e7
 
+.if UNALIGNED_DRAWLINEDEST
+    tst     r0, #2
+    bne     .fl_32scale_RGB555u
+.endif
+
 .fl_loop32scale_RGB555:
     ldr     r12, [r1], #4
     ldr     r7,  [r1], #4
@@ -1680,6 +1815,121 @@ FinalizeLineRGB555:
     ldmfd   sp!, {r4-r8,lr}
     bx      lr
 
+.if UNALIGNED_DRAWLINEDEST
+    @ unaligned versions of loops
+    @ warning: starts drawing 2bytes before dst
+
+.fl_RGB555u:
+    sub     r0, r0, #2              @ initial adjustment
+    mov     r8, #0
+
+.fl_loopRGB555u:
+    ldr     r12, [r1], #4
+    ldr     r7,  [r1], #4
+
+    and     r6, lr, r12,lsl #1
+    ldrh    r6, [r3, r6]
+    and     r5, lr, r12,lsr #7
+    ldrh    r5, [r3, r5]
+    orr     r4, r8, r6, lsl #16
+
+    and     r6, lr, r12,lsr #15
+    ldrh    r6, [r3, r6]
+    and     r8, lr, r12,lsr #23
+    ldrh    r8, [r3, r8]
+    orr     r5, r5, r6, lsl #16
+
+    and     r6, lr, r7, lsl #1
+    ldrh    r6, [r3, r6]
+    and     r12,lr, r7, lsr #7
+    ldrh    r12,[r3, r12]
+    orr     r6, r8, r6, lsl #16
+
+    and     r8, lr, r7, lsr #15
+    ldrh    r8, [r3, r8]
+    and     r7, lr, r7, lsr #23
+
+    subs    r2, r2, #1
+    orr     r12,r12,r8, lsl #16
+    ldrh    r8, [r3, r7]
+
+    stmia   r0!, {r4,r5,r6,r12}
+    bne     .fl_loopRGB555u
+
+    strh    r8, [r0], #2
+
+    ldmfd   sp!, {r4-r8,lr}
+    bx      lr
+
+
+.fl_32scale_RGB555u:
+    sub     r0, r0, #2              @ initial adjustment
+    mov     r4, #0
+
+    @ r9  f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
+.fl_loop32scale_RGB555u:
+    ldr     r12, [r1], #4
+    ldr     r7,  [r1], #4
+
+    and     r6, lr, r12,lsl #1
+    ldrh    r6, [r3, r6]
+    and     r5, lr, r12,lsr #7
+    ldrh    r5, [r3, r5]
+    and     r6, r6, r9, lsl #2
+    orr     r4, r4, r6, lsl #16       @ r4 = pix_d -1, 0
+
+    and     r5, r5, r9, lsl #2
+    sub     r8, r5, r5, lsr #2        @ r8 = 3/4 pix_s 1
+    add     r6, r8, r6, lsr #2        @ r6 = (1/4 pix_s 0) + (3/4 pix_s 1)
+    orr     r5, r6, r5, lsl #15
+
+    and     r6, lr, r12,lsr #15
+    ldrh    r6, [r3, r6]
+    and     r12,lr, r12,lsr #23
+    ldrh    r12,[r3, r12]
+    and     r6, r6, r9, lsl #2
+    add     r5, r5, r6, lsl #15       @ r5 = pix_d 1, 2
+
+    and     r8, lr, r7, lsl #1
+    ldrh    r8, [r3, r8]
+    and     r10,lr, r7, lsr #7
+    ldrh    r10,[r3, r10]
+    and     r12,r12,r9, lsl #2
+    sub     r6, r6, r6, lsr #2        @ r6 = 3/4 pix_s 2
+    add     r6, r6, r12,lsr #2
+    orr     r6, r6, r12,lsl #16       @ r6 = pix_d 3, 4
+
+    and     r8, r8, r9, lsl #2
+    and     r10,r10,r9, lsl #2
+    sub     r12,r10,r10,lsr #2        @ r12 = 3/4 pix_s 5
+    orr     r8, r8, r8, lsl #14
+    add     r8, r8, r12,lsl #16       @ r8 = pix_d 5, 6
+    and     r12,lr, r7, lsr #15
+    ldrh    r12,[r3, r12]
+    and     r7, lr, r7, lsr #23
+    ldrh    r7, [r3, r7]
+    and     r12,r12,r9, lsl #2
+    add     r10,r10,r12
+    mov     r10,r10,    lsr #1
+    sub     r12,r12,r12,lsr #2        @ r12 = 3/4 pix_s 6
+    orr     r10,r10,r12,lsl #16
+    and     r7, r7, r9, lsl #2
+    add     r10,r10,r7, lsl #14       @ r10 = pix_d 7, 8
+
+    subs    r2, r2, #1
+
+    stmia   r0!, {r4,r5,r6,r8,r10}
+    mov     r4, r7
+    bne     .fl_loop32scale_RGB555u
+
+    strh    r4, [r0], #2
+
+    ldmfd   sp!, {r9,r10}
+    ldmfd   sp!, {r4-r8,lr}
+    bx      lr
+
+.endif @ UNALIGNED_DRAWLINEDEST
+
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@