X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=Pico%2FDraw.s;h=f31025362dfd6b5e721d6e2ec2c035959acb48f6;hb=07abbab17a9baab5eeabe30767b0336326049994;hp=7643c3db01b8b4d30f8926d8c9f03a2ec32ba01e;hpb=6cadc2da0070781cf2d8fcff84265d3ca1f423b9;p=picodrive.git

diff --git a/Pico/Draw.s b/Pico/Draw.s
index 7643c3db..f3102536 100644
--- a/Pico/Draw.s
+++ b/Pico/Draw.s
@@ -1,11 +1,12 @@
 @ vim:filetype=armasm
 
-@ assembly "optimized" version of some funtions from draw.c
+@ ARM assembly versions of some functions from draw.c
 @ this is highly specialized, be careful if changing related C code!
 
-@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
+@ (c) Copyright 2007-2008, Grazvydas "notaz" Ignotas
 @ All Rights Reserved
 
+.include "port_config.s"
 
 .extern Pico
 .extern PicoOpt
@@ -14,8 +15,8 @@
 .extern HighSprZ
 .extern rendstatus
 .extern DrawLineDest
-.extern DrawStripVSRam
 .extern DrawStripInterlace
+.extern HighCacheS_ptr
 
 
 @ helper
@@ -64,13 +65,11 @@
 .endif
     ldreqb  r4, [r1,#\offs]
     orrne   r4, r3, r4
-    strneb  r4, [r1,#\offs]
-    tsteq   r4, #0x80
     andeq   r4, r4, #0x3f
-    streqb  r4, [r1,#\offs]
+    strb    r4, [r1,#\offs]
 .endm
 
-@ TileNorm (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits
+@ TileNormShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits
 .macro TileNormShHP
     TilePixelShHP 12, 0         @ #0x0000f000
     TilePixelShHP  8, 1         @ #0x00000f00
@@ -82,7 +81,7 @@
     TilePixelShHP 16, 7         @ #0x000f0000
 .endm
 
-@ TileFlip (r1=pdest, r2=pixels8, r3=pal) r4: scratch, pat: register with helper pattern 0xf
+@ TileFlipShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, pat: register with helper pattern 0xf
 .macro TileFlipShHP
     TilePixelShHP 16, 0         @ #0x000f0000
     TilePixelShHP 20, 1         @ #0x00f00000
@@ -155,24 +154,17 @@
 .else
     ands    r4, r12, r2
 .endif
-    beq     3f
+    beq     0f
     cmp     r4, #0xe
-    beq     2f
-    bgt     1f
-    orr     r4, r3, r4
-    strb    r4, [r1,#\ofs]
-    b       3f
-1:
-    ldrb    r4, [r1,#\ofs]        @ 2ci
-    orr     r4, r4, #0xc0
-    strb    r4, [r1,#\ofs]
-    b       3f
-2:
-    ldrb    r4, [r1,#\ofs]        @ 2ci
-    bic     r4, r4, #0xc0
-    orr     r4, r4, #0x80
+    ldrgeb  r4, [r1,#\ofs]
+    orrlt   r4, r3, r4            @ normal
+
+    biceq   r4, r4, #0xc0         @ hilight
+    orreq   r4, r4, #0x80
+    orrgt   r4, r4, #0xc0         @ shadow
+
     strb    r4, [r1,#\ofs]
-3:
+0:
 .endm
 
 @ TileFlipSh (r1=pdest, r2=pixels8, r3=pal) r4,r7: scratch, r0=sx, r12: register with helper pattern 0xf
@@ -199,6 +191,80 @@
     TileDoShGenPixel 16,  7 @ #0x000f0000
 .endm
 
+.macro TileDoShGenPixel_noop shift ofs
+.if \shift
+    and     r4, r12, r2, lsr #\shift   @ r4 = pixel nibble at bit \shift of pixels8 (r12 is expected to hold 0xf)
+.else
+    and     r4, r12, r2                @ r4 = lowest pixel nibble
+.endif
+    sub     r7, r4, #1                 @ r7 = pixel-1; a transparent pixel (0) wraps to 0xffffffff
+    cmp     r7, #0xd                   @ unsigned compare: carry clear only for pixel values 1-0xd
+    orrcc   r4, r3, r4           @ 0-0xc (was 1-0xd): r4 = pal|pixel
+    strccb  r4, [r1,#\ofs]             @ write pdest[\ofs]; pixels 0, 0xe and 0xf leave the destination untouched
+.endm
+
+.macro TileFlipSh_noop          @ 8 pixels, flipped order: pdest[0-3] from bits 16-31, pdest[4-7] from bits 0-15
+    TileDoShGenPixel_noop 16,  0 @ #0x000f0000
+    TileDoShGenPixel_noop 20,  1 @ #0x00f00000
+    TileDoShGenPixel_noop 24,  2 @ #0x0f000000
+    TileDoShGenPixel_noop 28,  3 @ #0xf0000000
+    TileDoShGenPixel_noop  0,  4 @ #0x0000000f
+    TileDoShGenPixel_noop  4,  5 @ #0x000000f0
+    TileDoShGenPixel_noop  8,  6 @ #0x00000f00
+    TileDoShGenPixel_noop 12,  7 @ #0x0000f000
+.endm
+
+.macro TileNormSh_noop          @ 8 pixels, normal order: pdest[0-3] from bits 12 down to 0, pdest[4-7] from bits 28 down to 16
+    TileDoShGenPixel_noop 12,  0 @ #0x0000f000
+    TileDoShGenPixel_noop  8,  1 @ #0x00000f00
+    TileDoShGenPixel_noop  4,  2 @ #0x000000f0
+    TileDoShGenPixel_noop  0,  3 @ #0x0000000f
+    TileDoShGenPixel_noop 28,  4 @ #0xf0000000
+    TileDoShGenPixel_noop 24,  5 @ #0x0f000000
+    TileDoShGenPixel_noop 20,  6 @ #0x00f00000
+    TileDoShGenPixel_noop 16,  7 @ #0x000f0000
+.endm
+
+.macro TileDoShGenPixel_onlyop_lp shift ofs
+.if \shift
+    ands    r7, r12, r2, lsr #\shift   @ r7 = pixel nibble (r12 is expected to hold 0xf); Z set if transparent
+.else
+    ands    r7, r12, r2                @ r7 = lowest pixel nibble; Z set if transparent
+.endif
+    ldrneb  r4, [r1,#\ofs]             @ non-transparent: fetch the byte already at pdest[\ofs]
+    tstne   r4, #0x40                  @ ... and test its 0x40 bit
+    beq     0f                         @ skip when the pixel is 0 or the destination has bit 0x40 clear
+
+    cmp     r7, #0xe                   @ eq: pixel 0xe, gt: pixel 0xf, lt: ordinary color 1-0xd
+    biceq   r4, r4, #0xc0         @ hilight
+    orreq   r4, r4, #0x80              @ pixel 0xe: top two bits become 10
+    orrgt   r4, r4, #0xc0         @ shadow
+    strgeb  r4, [r1,#\ofs]             @ write back only for 0xe/0xf; colors 1-0xd are not drawn by this macro
+0:
+.endm
+
+.macro TileFlipSh_onlyop_lp     @ 8 pixels, flipped order: pdest[0-3] from bits 16-31, pdest[4-7] from bits 0-15
+    TileDoShGenPixel_onlyop_lp 16,  0 @ #0x000f0000
+    TileDoShGenPixel_onlyop_lp 20,  1 @ #0x00f00000
+    TileDoShGenPixel_onlyop_lp 24,  2 @ #0x0f000000
+    TileDoShGenPixel_onlyop_lp 28,  3 @ #0xf0000000
+    TileDoShGenPixel_onlyop_lp  0,  4 @ #0x0000000f
+    TileDoShGenPixel_onlyop_lp  4,  5 @ #0x000000f0
+    TileDoShGenPixel_onlyop_lp  8,  6 @ #0x00000f00
+    TileDoShGenPixel_onlyop_lp 12,  7 @ #0x0000f000
+.endm
+
+.macro TileNormSh_onlyop_lp     @ 8 pixels, normal order: pdest[0-3] from bits 12 down to 0, pdest[4-7] from bits 28 down to 16
+    TileDoShGenPixel_onlyop_lp 12,  0 @ #0x0000f000
+    TileDoShGenPixel_onlyop_lp  8,  1 @ #0x00000f00
+    TileDoShGenPixel_onlyop_lp  4,  2 @ #0x000000f0
+    TileDoShGenPixel_onlyop_lp  0,  3 @ #0x0000000f
+    TileDoShGenPixel_onlyop_lp 28,  4 @ #0xf0000000
+    TileDoShGenPixel_onlyop_lp 24,  5 @ #0x0f000000
+    TileDoShGenPixel_onlyop_lp 20,  6 @ #0x00f00000
+    TileDoShGenPixel_onlyop_lp 16,  7 @ #0x000f0000
+.endm
+
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
@@ -212,9 +278,9 @@
 @   int cells;   // 0x14
 @ };
 
-@ int DrawLayer(int plane, int *hcache, int maxcells, int sh)
+@ void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells);
 
-.global DrawLayer @ int plane, int *hcache, int maxcells, int sh
+.global DrawLayer
 
 DrawLayer:
     stmfd   sp!, {r4-r11,lr}
@@ -222,10 +288,11 @@ DrawLayer:
     ldr     r11, =(Pico+0x22228)  @ Pico.video
     mov     r8, #1
 
-    ldrb    r7, [r11, #16]        @ ??hh??ww
+    ldrb    r7, [r11, #16]        @ ??vv??hh
 
     mov     r6, r1                @ hcache
-    orr     r9, r2, r3, lsl #31   @ r9=maxcells|(sh<<31)
+    orr     r9, r3, r0, lsl #30   @ plane_sh bit 0 (plane) -> bit 30, bit 1 (sh) -> bit 31
+    orr     r9, r9, r2, lsl #8    @ r9=sh[31]|plane[30]|cellskip[15:8]|maxcells[7:0]  (tmp)
 
     mov     r1, r7, lsl #4
     orr     r1, r1, #0x00ff
@@ -245,7 +312,7 @@ DrawLayer:
     sub     r5, r5, #1            @ r5=xmask
 
     @ Find name table:
-    tst     r0,  r0
+    ands    r0,  r0, #1
     ldreqb  r12, [r11, #2]
     ldrneb  r12, [r11, #4]
 
@@ -293,7 +360,6 @@ DrawLayer:
     add     r12, r12, r4, lsl r10  @ nametab+=(ts.line>>3)<<shift[width];
 
     @ ldmia   r0, {r1,r2,r3,r5,r6,r9} @ r2=line, r3=ts->hscroll, r5=ts->xmask, r6=ts->hc, r9=ts->cells
-@    mov     r12,r1,  lsl #1 @ r12=(ts->nametab<<1) (halfword compliant)
 
     and     r10,r2,  #7
     mov     r10,r10, lsl #1 @ r10=ty=(ts->line&7)<<1;
@@ -308,17 +374,28 @@ DrawLayer:
 
     tst     r9, #1<<31
     mov     r3, #0
-    orrne   r10,r10, #1<<23 @ r10=(cells<<24|sh<<23|hi_not_empty<<22|ty)
+    orrne   r10,r10, #1<<23 @ r10=(cells<<24|sh<<23|hi_not_empty<<22|had_output<<21|ty)
     movne   r3, #0x40       @ default to shadowed pal on sh mode
 
-    mvn     r9, #0          @ r9=prevcode=-1
-
     cmp     r7, #8
     addne   r10,r10, #0x01000000 @ we will loop cells+1 times if there is scroll
 
+    and     r9, r9, #0xff00
+    add     r8, r8, r9, lsr #8   @ tilex+=cellskip
+    add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;
+    sub     r10,r10,r9, lsl #16  @ cells-=cellskip
+
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r0, #0xf
+    ldr     r11,[r11]
+.else
     ldr     r11,=HighCol
     mov     r0, #0xf
+.endif
+
+    mvn     r9, #0               @ r9=prevcode=-1
     add     r1, r11, r7         @ r1=pdest
 
 
@@ -344,6 +421,7 @@ DrawLayer:
     beq     .DrawStrip_samecode @ we know stuff about this tile already
 
     mov     r9, r7          @ remember code
+    orr     r10, r10, #1<<21 @ seen non hi-prio tile
 
     movs    r2, r9, lsl #20 @ if (code&0x1000)
     mov     r2, r2, lsl #1
@@ -365,16 +443,17 @@ DrawLayer:
     beq     .DrawStrip_SingleColor @ tileline singlecolor 
 
     tst     r9, #0x0800
-    beq     .DrawStrip_TileNorm
+    bne     .DrawStrip_TileFlip
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern
-    TileFlip r0
-    b       .dsloop
-
 .DrawStrip_TileNorm:
     TileNorm r0
     b       .dsloop
 
+.DrawStrip_TileFlip:
+    TileFlip r0
+    b       .dsloop
+
 .DrawStrip_SingleColor:
     and     r4, r2, #0xf
     orr     r4, r3, r4
@@ -388,6 +467,20 @@ DrawLayer:
     strneb  r4, [r1], #1       @ have a remaining unaligned pixel?
     b       .dsloop_subr1
 
+.DrawStrip_hiprio_maybempt:
+    cmp     r7, r9
+    beq     .dsloop         @ must've been empty, otherwise we wouldn't get here
+    movs    r2, r7, lsl #20 @ if (code&0x1000)
+    mov     r2, r2, lsl #1
+    add     r2, r2, r10, lsl #17
+    mov     r2, r2, lsr #17
+    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
+    mov     r9, r7          @ remember code
+    tst     r2, r2
+    beq     .dsloop
+    orr     r10, r10, #1<<22
+
 .DrawStrip_hiprio:
     tst     r10, #0x00c00000
     beq     .DrawStrip_hiprio_maybempt
@@ -400,9 +493,169 @@ DrawLayer:
     mov     r0, #0xf
     b       .dsloop
 
-.DrawStrip_hiprio_maybempt:
+.dsloop_exit:
+    tst     r10, #1<<21 @ seen non hi-prio tile? (bit 21 is set in the strip loop when a new low-prio code is handled)
+    ldreq   r1, =rendstatus       @ eq: no low-prio tile was seen, so rendstatus gets flagged below
+    mov     r0, #0                @ r0 = 0, used as the list terminator
+    ldreq   r2, [r1]
+    str     r0, [r6]    @ terminate the cache list
+    orreq   r2, r2, #0x40 @ had a layer with all hi-prio tiles
+    streq   r2, [r1]
+
+    ldmfd   sp!, {r4-r11,lr}      @ restore the registers saved by the stmfd at DrawLayer entry
+    bx      lr
+
+@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+.DrawStrip_vsscroll:
+    rsb     r8, r3, #0
+    mov     r8, r8, lsr #3        @ r8=tilex=(-ts->hscroll)>>3
+    bic     r8, r8, #0x3fc00000
+    orr     r8, r8, r5, lsl #25   @ r8=(xmask[31:25]|had_output[24]|tilex[21:0])
+
+    ldr     r4, =Scanline
+    orr     r5, r1, r10, lsl #24
+    ldr     r4, [r4]
+    sub     r1, r3, #1
+    orr     r5, r5, r4, lsl #16   @ r5=(shift_width[31:24]|scanline[23:16]|ymask[15:0])
+    and     r1, r1, #7
+    add     r7, r1, #1            @ r7=dx=((ts->hscroll-1)&7)+1
+
+    mov     r10,r9, lsl #16
+    tst     r0, #1
+    orrne   r10,r10, #0x8000
+    tst     r9, #1<<31
+    mov     r3, #0
+    orr     r10,r10, #0xff000000 @ will be adjusted on entering loop
+    orrne   r10,r10, #1<<23 @ r10=(cell[31:24]|sh[23]|hi_not_empty[22]|cells_max[21:16]|plane[15]|ty[14:0])
+    movne   r3, #0x40       @ default to shadowed pal on sh mode
+
+    cmp     r7, #8
+    subne   r10,r10, #0x01000000 @ have hscroll, start with negative cell
+
+    and     r9, r9, #0xff00
+    add     r8, r8, r9, lsr #8   @ tilex+=cellskip
+    add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;
+    add     r10,r10,r9, lsl #16  @ cell+=cellskip
+
+    @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r0, #0xf
+    ldr     r11,[r11]
+.else
+    ldr     r11,=HighCol
+    mov     r0, #0xf
+.endif
+
+    mvn     r9, #0               @ r9=prevcode=-1
+    add     r1, r11, r7          @ r1=pdest
+
+    @ r4 & r7 are scratch in this loop
+.dsloop_vs_subr1:
+    sub     r1, r1, #8
+.dsloop_vs: @ 40-41 times
+    add     r10,r10, #0x01000000
+    and     r4, r10, #0x003f0000
+    cmp     r4, r10, asr #8
+    ble     .dsloop_vs_exit
+
+    @ calc offset and read tileline code to r7, also calc ty
+    add     r7, lr, #0x012000
+    add     r7, r7, #0x000180     @ r7=Pico.vsram (Pico+0x22180)
+    add     r7, r7, r10,asr #23   @ vsram + ((cell&~1)<<1)
+    bic     r7, r7, #3
+    tst     r10,#0x8000           @ plane1?
+    addne   r7, r7, #2
+    ldrh    r7, [r7]              @ r7=vscroll
+
+    bic     r10,r10,#0xff         @ clear old ty
+    and     r4, r5, #0xff0000     @ scanline
+    add     r4, r4, r7, lsl #16   @ ... += vscroll
+    and     r4, r4, r5, lsl #16   @ ... &= ymask
+    and     r7, r4, #0x70000
+    orr     r10,r10,r7, lsr #15   @ new ty
+
+    mov     r4, r4, lsr #19
+    mov     r7, r5, lsr #24
+    mov     r4, r4, lsl r7        @ nametabadd
+
+    and     r7, r8, r8, lsr #25
+    add     r7, lr, r7, lsl #1    @ Pico.vram+((tilex&ts->xmask) as halfwords)
+    add     r7, r7, r4, lsl #1
+    ldrh    r7, [r7, r12]         @ r7=code (int, but from unsigned, no sign extend)
+
+    add     r1, r1, #8
+    add     r8, r8, #1
+
+    tst     r7, #0x8000
+    bne     .DrawStrip_vs_hiprio
+
     cmp     r7, r9
-    beq     .dsloop         @ must've been empty, otherwise we wouldn't get here
+    beq     .DrawStrip_vs_samecode @ we know stuff about this tile already
+
+    mov     r9, r7          @ remember code
+    orr     r8, r8, #(1<<24)@ seen non hi-prio tile
+
+    movs    r2, r9, lsl #20 @ if (code&0x1000)
+    mov     r2, r2, lsl #1
+    add     r2, r2, r10, lsl #17
+    mov     r2, r2, lsr #17
+    eorcs   r2, r2, #0x0e   @ if (code&0x1000) addr^=0xe;
+
+    ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
+
+    bic     r7, r3, #0x3f
+    and     r3, r9, #0x6000
+    add     r3, r7, r3, lsr #9 @ r3=pal=((code&0x6000)>>9);
+
+.DrawStrip_vs_samecode:
+    tst     r2, r2
+    beq     .dsloop_vs              @ tileline blank
+
+    cmp     r2, r2, ror #4
+    beq     .DrawStrip_vs_SingleColor @ tileline singlecolor 
+
+    tst     r9, #0x0800
+    bne     .DrawStrip_vs_TileFlip
+
+    @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern
+.DrawStrip_vs_TileNorm:
+    TileNorm r0
+    b       .dsloop_vs
+
+.DrawStrip_vs_TileFlip:
+    TileFlip r0
+    b       .dsloop_vs
+
+.DrawStrip_vs_SingleColor:
+    and     r4, r2, #0xf
+    orr     r4, r3, r4
+    orr     r4, r4, r4, lsl #8
+    tst     r1, #1             @ not aligned?
+    strneb  r4, [r1], #1
+    streqh  r4, [r1], #2
+    strh    r4, [r1], #2
+    strh    r4, [r1], #2
+    strh    r4, [r1], #2
+    strneb  r4, [r1], #1       @ have a remaining unaligned pixel?
+    b       .dsloop_vs_subr1
+
+.DrawStrip_vs_hiprio:
+    tst     r10, #0x00c00000
+    beq     .DrawStrip_vs_hiprio_maybempt
+    sub     r0, r1, r11
+    orr     r7, r7, r0,  lsl #16
+    orr     r7, r7, r10, lsl #25 @ (ty<<25)
+    tst     r7, #0x1000
+    eorne   r7, r7, #7<<26  @ if(code&0x1000) cval^=7<<26;
+    str     r7, [r6], #4    @ cache hi priority tile
+    mov     r0, #0xf
+    b       .dsloop_vs
+
+.DrawStrip_vs_hiprio_maybempt:
+    cmp     r7, r9
+    beq     .dsloop_vs         @ must've been empty, otherwise we wouldn't get here
     movs    r2, r7, lsl #20 @ if (code&0x1000)
     mov     r2, r2, lsl #1
     add     r2, r2, r10, lsl #17
@@ -412,41 +665,23 @@ DrawLayer:
     mov     r9, r7          @ remember code
     tst     r2, r2
     orrne   r10, r10, #1<<22
-    bne     .DrawStrip_hiprio
-    b       .dsloop
+    bne     .DrawStrip_vs_hiprio
+    b       .dsloop_vs
 
-.dsloop_exit:
+.dsloop_vs_exit:
+    tst     r8, #(1<<24) @ seen non hi-prio tile
+    ldreq   r1, =rendstatus
     mov     r0, #0
+    ldreq   r2, [r1]
     str     r0, [r6]    @ terminate the cache list
+    orreq   r2, r2, #0x40 @ had a layer with all hi-prio tiles
+    streq   r2, [r1]
 
     ldmfd   sp!, {r4-r11,lr}
     bx      lr
 
 
-.DrawStrip_vsscroll:
-    @ shit, we have 2-cell column based vscroll
-    @ let the c code handle this (for now)
-
-    @   int nametab; // 0x00
-    @   int line;    // 0x04
-    @   int hscroll; // 0x08
-    @   int xmask;   // 0x0C
-    @   int *hc;     // 0x10 (pointer to cache buffer)
-    @   int cells;   // 0x14
-
-    sub     sp, sp, #6*4
-    orr     r2, r1, r10, lsl #24 @ ts.line=ymask|(shift[width]<<24); // save some stuff instead of line
-    mov     r1, r0               @ plane
-    mov     r0, r12, lsr #1      @ halfwords
-    and     r9, r9, #0xff
-    stmia   sp, {r0,r2,r3,r5,r6,r9}
-
-    mov     r0, sp
-    bl      DrawStripVSRam @ struct TileStrip *ts, int plane
-
-    add     sp, sp, #6*4
-    ldmfd   sp!, {r4-r11,lr}
-    bx      lr
+@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 @ interlace mode 2? Sonic 2?
 .DrawStrip_interlace:
@@ -484,10 +719,18 @@ DrawLayer:
 BackFill:
     stmfd   sp!, {r4-r9,lr}
 
+.if OVERRIDE_HIGHCOL
+    ldr     lr, =HighCol
+    mov     r0, r0, lsl #26
+    ldr     lr, [lr]
+    mov     r0, r0, lsr #26
+    add     lr, lr, #8
+.else
     ldr     lr, =(HighCol+8)
-
     mov     r0, r0, lsl #26
     mov     r0, r0, lsr #26
+.endif
+
     orr     r0, r0, r1, lsl #6
     orr     r0, r0, r0, lsl #8
     orr     r0, r0, r0, lsl #16
@@ -519,26 +762,34 @@ BackFill:
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 
-.global DrawTilesFromCache @ int *hc, int sh
+.global DrawTilesFromCache @ int *hc, int sh, int rlim
 
 DrawTilesFromCache:
     stmfd   sp!, {r4-r8,r11,lr}
 
-    mvn     r5, #0         @ r5=prevcode=-1
-    mov     r8, r1
-
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
     ldr     r11,=HighCol
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
     mov     r12,#0xf
+    ldr     r11,[r11]
+.else
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+.endif
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
+
+    mvn     r5, #0         @ r5=prevcode=-1
+    ands    r8, r1, #1
+    orr     r8, r8, r2, lsl #1
+    bne     .dtfc_check_rendflags
 
     @ scratch: r4, r7
 .dtfc_loop:
     ldr     r6, [r0], #4    @ read code
     movs    r1, r6, lsr #16 @ r1=dx;
     ldmeqfd sp!, {r4-r8,r11,pc} @ dx is never zero, this must be a terminator, return
-    bic     r1, r1, #0xfe00
-    add     r1, r11, r1     @ r1=pdest
+    bic     r4, r1, #0xfe00
+    add     r1, r11, r4     @ r1=pdest
 
     mov     r7, r6, lsl #16
     cmp     r5, r7, lsr #16
@@ -556,7 +807,10 @@ DrawTilesFromCache:
     ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
 
 .dtfc_samecode:
-    tst     r8, r8
+    rsbs    r4, r4, r8, lsr #1
+    bmi     .dtfc_cut_tile
+
+    tst     r8, #1
     bne     .dtfc_shadow
 
     tst     r2, r2
@@ -566,16 +820,17 @@ DrawTilesFromCache:
     beq     .dtfc_SingleColor @ tileline singlecolor 
 
     tst     r5, #0x0800
-    beq     .dtfc_TileNorm
+    bne     .dtfc_TileFlip
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlip r12
-    b       .dtfc_loop
-
 .dtfc_TileNorm:
     TileNorm r12
     b       .dtfc_loop
 
+.dtfc_TileFlip:
+    TileFlip r12
+    b       .dtfc_loop
+
 .dtfc_SingleColor:
     and     r4, r2, #0xf
     orr     r4, r3, r4
@@ -597,50 +852,86 @@ DrawTilesFromCache:
     beq     .dtfc_SingleColor @ tileline singlecolor 
 
     tst     r5, #0x0800
-    beq     .dtfc_TileNormShHP
+    bne     .dtfc_TileFlipShHP
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlipShHP
-    b       .dtfc_loop
-
 .dtfc_TileNormShHP:
     TileNormShHP
     b       .dtfc_loop
 
+.dtfc_TileFlipShHP:
+    TileFlipShHP
+    b       .dtfc_loop
+
 .dtfc_shadow_blank:
-    ldrb    r4, [r1]        @ 1ci
-    ldrb    r12,[r1,#1]
-    tst     r4, #0x80
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#2]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#1]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#3]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#2]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#4]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#3]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#5]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#4]
-    tst     r12,#0x80
-    ldrb    r4, [r1,#6]
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#5]
-    tst     r4, #0x80
-    ldrb    r12,[r1,#7]
-    andeq   r4, r4,#0x3f
-    streqb  r4, [r1,#6]
-    tst     r12,#0x80
-    andeq   r12,r12,#0x3f
-    streqb  r12,[r1,#7]
-    mov     r12, #0xf
+    tst     r1, #1                 @ odd pdest? nothing below sets flags, so eq/ne stay valid to the end
+    ldrneb  r4, [r1]               @ odd: the leading pixel is handled as a single byte
+    mov     r6, #0x3f
+    and     r4, r4, #0x3f          @ clear the top two bits (result only stored in the odd case)
+    strneb  r4, [r1], #1           @ odd: store and advance to a halfword boundary
+    ldrh    r4, [r1]
+    orr     r6, r6, r6, lsl #8     @ r6 = 0x3f3f, mask for two pixels at once
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    strh    r4, [r1], #2
+    ldrh    r4, [r1]
+    and     r4, r4, r6
+    streqh  r4, [r1]               @ even start: last two pixels as a halfword
+    strneb  r4, [r1]               @ odd start: only one pixel remains, store the low byte
+    b       .dtfc_loop
+
+.dtfc_cut_tile:
+    add     r4, r4, #7      @ 0-6 (r4 comes in negative from the rsbs at .dtfc_samecode)
+    mov     r4, r4, lsl #2  @ nibble count -> bit count
+    mov     r12,#0xf<<28
+    mov     r12,r12,asr r4  @ r12 = mask with the top (r4/4)+1 nibbles set (sign-extending shift)
+    mov     r2, r2, ror #16 @ swap halfwords before masking
+    tst     r5, #0x0800     @ flipped?
+    mvnne   r12,r12         @ flipped tile: keep the complementary nibbles instead
+    and     r2, r2, r12     @ drop the masked-out pixels (they become transparent 0s)
+    mov     r2, r2, ror #16 @ swap halfwords back
+    mov     r12,#0xf        @ restore the helper pattern used by the tile macros
+    tst     r8, #1          @ bit 0 of r8 is the sh flag
+    bne     .dtfc_shadow
+    tst     r2, r2
+    beq     .dtfc_loop      @ nothing visible left after the cut
+    tst     r5, #0x0800
+    beq     .dtfc_TileNorm
+    b       .dtfc_TileFlip
+
+@ check if we have detected layer covered with hi-prio tiles:
+.dtfc_check_rendflags:
+    ldr     r1, =rendstatus
+    ldr     r2, [r1]
+    tst     r2, #0xc0       @ 0x40: flagged at the DrawLayer exits; 0x80: set below once the line was prepared
+    beq     .dtfc_loop      @ neither set: continue in sh mode
+    bic     r8, r8, #1      @ sh/hi mode off
+    tst     r2, #0x80
+    bne     .dtfc_loop      @ already processed
+    orr     r2, r2, #0x80
+    str     r2, [r1]
+
+    add     r1, r11,#8      @ r11 holds the HighCol line buffer address; start 8 bytes in
+    mov     r3, #320/4/4    @ 20 iterations of 16 bytes = 320 bytes
+    mov     r6, #0x3f
+    orr     r6, r6, r6, lsl #8
+    orr     r6, r6, r6, lsl #16 @ r6 = 0x3f3f3f3f
+.dtfc_loop_shprep:
+    ldmia   r1, {r2,r4,r5,r7}
+    subs    r3, r3, #1      @ flags are consumed by the bne at the bottom (and/stmia leave them alone)
+    and     r2, r2, r6      @ clear the top two bits of every byte
+    and     r4, r4, r6
+    and     r5, r5, r6
+    and     r7, r7, r6
+    stmia   r1!,{r2,r4,r5,r7}
+    bne     .dtfc_loop_shprep
+
+    mvn     r5, #0         @ r5=prevcode=-1 (r5 was used as scratch by the loop above)
+    b       .dtfc_loop
 
 .pool
@@ -648,17 +939,23 @@ DrawTilesFromCache:
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 
-.global DrawSpritesFromCache @ int *hc, int sh
+.global DrawSpritesFromCache @ int *hc, int maxwidth, int prio, int sh
 
 DrawSpritesFromCache:
     stmfd   sp!, {r4-r11,lr}
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+    ldr     r11,[r11]
+.else
     ldr     r11,=HighCol
+    mov     r12,#0xf
+.endif
     ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
-    mov     r6, r1, lsl #31
+    mov     r6, r3, lsl #31
     orr     r6, r6, #1<<30
-    mov     r12,#0xf
 
     mov     r10, r0
 
@@ -719,16 +1016,17 @@ DrawSpritesFromCache:
     beq     .dsfc_SingleColor @ tileline singlecolor 
 
     tst     r9, #0x10000
-    beq     .dsfc_TileNorm
+    bne     .dsfc_TileFlip
 
     @ TileFlip (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlip r12
-    b       .dsfc_inloop
-
 .dsfc_TileNorm:
     TileNorm r12
     b       .dsfc_inloop
 
+.dsfc_TileFlip:
+    TileFlip r12
+    b       .dsfc_inloop
+
 .dsfc_SingleColor:
     tst     r0, #1              @ not aligned?
     and     r4, r2, #0xf
@@ -743,20 +1041,24 @@ DrawSpritesFromCache:
     b       .dsfc_inloop
 
 .dsfc_shadow:
+    tst     r9, #0x80000000
+    beq     .dsfc_shadow_lowpri
+
     cmp     r2, r2, ror #4
     beq     .dsfc_singlec_sh
 
     tst     r9, #0x10000
-    beq     .dsfc_TileNorm_sh
+    bne     .dsfc_TileFlip_sh
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlipSh
-    b       .dsfc_inloop
-
 .dsfc_TileNorm_sh:
     TileNormSh
     b       .dsfc_inloop
 
+.dsfc_TileFlip_sh:
+    TileFlipSh
+    b       .dsfc_inloop
+
 .dsfc_singlec_sh:
     cmp     r2, #0xe0000000
     bcc     .dsfc_SingleColor   @ normal singlecolor tileline (carry inverted in ARM)
@@ -769,6 +1071,18 @@ DrawSpritesFromCache:
     TileSingleSh
     b       .dsfc_inloop
 
+.dsfc_shadow_lowpri:
+    tst     r9, #0x10000          @ same flip test as the other .dsfc_ paths
+    bne     .dsfc_TileFlip_sh_lp
+
+.dsfc_TileNorm_sh_lp:
+    TileNormSh_onlyop_lp          @ only applies 0xe/0xf operator pixels; ordinary colors are not drawn
+    b       .dsfc_inloop
+
+.dsfc_TileFlip_sh_lp:
+    TileFlipSh_onlyop_lp          @ flipped-order variant of the above
+    b       .dsfc_inloop
+
 .pool
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ -776,11 +1090,12 @@ DrawSpritesFromCache:
 @ + 0  :    hhhhvvvv ab--hhvv yyyyyyyy yyyyyyyy // a: offscreen h, b: offs. v, h: horiz. size
 @ + 4  :    xxxxxxxx xxxxxxxx pccvhnnn nnnnnnnn // x: x coord + 8
 
-.global DrawSprite @ unsigned int *sprite, int **hc, int sh
+.global DrawSprite @ unsigned int *sprite, int sh, int acc_sprites
 
 DrawSprite:
     stmfd   sp!, {r4-r9,r11,lr}
 
+    orr     r8, r2, r1, lsl #4
     ldr     r3, [r0]        @ sprite[0]
     ldr     r7, =Scanline
     mov     r6, r3, lsr #28
@@ -794,44 +1109,53 @@ DrawSprite:
     ldr     r9, [r0, #4]
     sub     r7, r7, r4, asr #16 @ r7=row=Scanline-sy
 
-    tst     r2, r2
     mov     r2, r9, asr #16 @ r2=sx
-    bic     r9, r9, #0xfe000000
-    orrne   r9, r9, #1<<31  @ r9=code|(sh<<31)
+    mov     r9, r9, lsl #16
+    mov     r9, r9, lsr #16
+    orr     r9, r9, r8, lsl #27 @ r9=code|sh[31]|as[27]
 
     tst     r9, #0x1000
     movne   r4, r5, lsl #3
     subne   r4, r4, #1
     subne   r7, r4, r7      @ if (code&0x1000) row=(height<<3)-1-row; // Flip Y
 
-    mov     r8, r9, lsl #21
-    mov     r8, r8, lsr #21
-    add     r8, r8, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down
-    
+    add     r8, r9, r7, lsr #3 @ tile+=row>>3; // Tile number increases going down
     tst     r9, #0x0800
     mlane   r8, r5, r6, r8  @ if (code&0x0800) { tile+=delta*(width-1);
     rsbne   r5, r5, #0      @ delta=-delta; } // r5=delta now
 
-    mov     r8, r8, lsl #4
+    mov     r8, r8, lsl #21
+    mov     r8, r8, lsr #17
     and     r7, r7, #7
     add     r8, r8, r7, lsl #1 @ tile+=(row&7)<<1; // Tile address
 
     tst     r9, #0x8000
-    bne     .dspr_cache       @ if(code&0x8000) // high priority - cache it
-
+    tsteq   r9, #(1<<27)      @ low prio: still take the cache path when accurate sprites (as) is on
+    bne     .dspr_cache       @ if(code&0x8000) || as
+    tst     r9, #0x4000       @ palette bits are in code (r9); r6 is width-1 here and could never match
+    tstne   r9, #0x2000       @ both palette bits set <=> pal == 0x30
+    tstne   r9, #(1<<31)      @ ... and sh mode is enabled
+    bne     .dspr_cache       @ (sh && pal == 0x30)
+
+.dspr_continue:
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r12,#0xf
+    ldr     r11,[r11]
+.else
     ldr     r11,=HighCol
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
     mov     r12,#0xf
+.endif
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
 
     mov     r5, r5, lsl #4     @ delta<<=4; // Delta of address
     and     r4, r9, #0x6000
     orr     r9, r9, r4, lsl #16
-    orr     r9, r9, #0x10000000 @ r9=scc1 ???? ... <code> (s=shadow/hilight, cc=pal)
+    orrs    r9, r9, #0x10000000 @ r9=scc1 a??? ... <code> (s=shadow/hilight, cc=pal, a=acc_spr)
 
-    tst     r9, #1<<31
     mov     r3, r4, lsr #9     @ r3=pal=((code>>9)&0x30);
-    orrne   r3, r3, #0x40      @ shadow by default
+    orrmi   r3, r3, #0x40      @ shadow by default
 
     add     r6, r6, #1         @ inc now
     adds    r0, r2, #0         @ mov sx to r0 and set ZV flags
@@ -863,17 +1187,22 @@ DrawSprite:
     beq     .dspr_SingleColor @ tileline singlecolor 
 
     tst     r9, #0x0800
-    beq     .dspr_TileNorm
+    bne     .dspr_TileFlip
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlip r12
-    b       .dspr_loop
-
 @ scratch: r4, r7
 .dspr_TileNorm:
     TileNorm r12
     b       .dspr_loop
 
+.dspr_TileFlip:
+    TileFlip r12
+    b       .dspr_loop
+
+.dspr_singlec_sh:
+    cmp     r2, #0xe0000000
+    bcs     .dspr_loop          @ operator tileline, ignore
+
 .dspr_SingleColor:
     and     r4, r2, #0xf
     orr     r4, r3, r4
@@ -892,47 +1221,42 @@ DrawSprite:
     beq     .dspr_singlec_sh
 
     tst     r9, #0x0800
-    beq     .dspr_TileNorm_sh
+    bne     .dspr_TileFlip_sh
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern
-    TileFlipSh
-    b       .dspr_loop
-
 .dspr_TileNorm_sh:
-    TileNormSh
-    b       .dspr_loop
-
-.dspr_singlec_sh:
-    cmp     r2, #0xe0000000
-    bcc     .dspr_SingleColor   @ normal tileline
-    tst     r2, #0x10000000
-    bne     .dspr_sh_sh
-    TileSingleHi
+    TileNormSh_noop
     b       .dspr_loop
 
-.dspr_sh_sh:
-    TileSingleSh
+.dspr_TileFlip_sh:
+    TileFlipSh_noop
     b       .dspr_loop
 
 
 .dspr_cache:
-    @ *(*hc)++ = (tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|((code>>9)&0x30)|((sprite[0]>>24)&0xf);
+    @ *HighCacheS_ptr++ = ((code&0x8000)<<16)|(tile<<16)|((code&0x0800)<<5)|((sx<<6)&0x0000ffc0)|pal|((sprite[0]>>16)&0xf);
+    ldr     r1, =HighCacheS_ptr
     mov     r4, r8, lsl #16     @ tile
     tst     r9, #0x0800
     orrne   r4, r4, #0x10000    @ code&0x0800
-    mov     r2, r2, lsl #22
-    orr     r4, r4, r2, lsr #16 @ (sx<<6)&0x0000ffc0
-    and     r2, r9, #0x6000
-    orr     r4, r4, r2, lsr #9  @ (code>>9)&0x30
+    mov     r0, r2, lsl #22
+    orr     r4, r4, r0, lsr #16 @ (sx<<6)&0x0000ffc0
+    and     r0, r9, #0x6000
+    orr     r4, r4, r0, lsr #9  @ (code>>9)&0x30
     mov     r3, r3, lsl #12
-    ldr     r2, [r1]
     orr     r4, r4, r3, lsr #28 @ (sprite[0]>>24)&0xf
 
-    str     r4, [r2], #4
-    str     r2, [r1]
+    ldr     r0, [r1]
+    tst     r9, #0x8000
+    orrne   r4, r4, #0x80000000 @ prio
 
-    ldmfd   sp!, {r4-r9,r11,lr}
-    bx      lr
+    str     r4, [r0], #4
+    str     r0, [r1]
+
+    and     r0, r9, #(1<<27)    @ as
+    teqne   r0,     #(1<<27)    @ (code&0x8000) && !as
+    ldmnefd sp!, {r4-r9,r11,pc}
+    b       .dspr_continue      @ draw anyway if accurate sprites enabled
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
@@ -962,33 +1286,38 @@ DrawWindow:
 
     ldr     r6, =rendstatus
     ldr     lr, =(Pico+0x10000) @ lr=Pico.vram
-    ldrb    r6, [r6]
+    ldr     r6, [r6]
 
     @ fetch the first code now
     ldrh    r7, [lr, r12]
 
     ands    r6, r6, #2            @ we care about bit 1 only
     orr     r6, r6, r2
-    bne     .dw_no_sameprio
 
-    cmp     r2, r7, lsr #15
-    ldmnefd sp!, {r4-r11,pc}      @ assume that whole window uses same priority
+    teqne   r2, r7, lsr #15       @ do prio bits differ? (executed only when the ands above found rendstatus bit 1 set)
+    ldmnefd sp!, {r4-r11,pc}      @ yes, assume that whole window uses same priority; NOTE(review): the earlier version skipped this early return when bit 1 was set - confirm the polarity
 
-.dw_no_sameprio:
     orr     r6, r6, r3, lsl #8    @ shadow mode
 
     sub     r8, r1, r0
-    mov     r8, r8, lsl #1        @ cells
-
-    mvn     r9, #0                @ r9=prevcode=-1
 
     @ cache some stuff to avoid mem access
+.if OVERRIDE_HIGHCOL
+    ldr     r11,=HighCol
+    mov     r8, r8, lsl #1        @ cells
+    ldr     r11,[r11]
+    mvn     r9, #0                @ r9=prevcode=-1
+    add     r11,r11,#8
+.else
     ldr     r11,=(HighCol+8)
-    add     r1, r11, r0, lsl #4 @ r1=pdest
+    mov     r8, r8, lsl #1        @ cells
+    mvn     r9, #0                @ r9=prevcode=-1
+.endif
+    add     r1, r11, r0, lsl #4   @ r1=pdest
     mov     r0, #0xf
     b       .dwloop_enter
 
-    @ r4,r5 & r7 are scratch in this loop
+    @ r4,r5 are scratch in this loop
 .dwloop:
     add     r1, r1, #8
 .dwloop_nor1:
@@ -1029,16 +1358,17 @@ DrawWindow:
     beq     .dw_SingleColor @ tileline singlecolor 
 
     tst     r9, #0x0800
-    beq     .dw_TileNorm
+    bne     .dw_TileFlip
 
     @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern
-    TileFlip r0
-    b       .dwloop
-
 .dw_TileNorm:
     TileNorm r0
     b       .dwloop
 
+.dw_TileFlip:
+    TileFlip r0
+    b       .dwloop
+
 .dw_SingleColor:
     and     r4, r0, r2         @ #0x0000000f
     orr     r4, r3, r4
@@ -1053,24 +1383,13 @@ DrawWindow:
     orreq   r3, r3, #0x40
     beq     .dw_shadow_done
     ldr     r4, [r1]
-    tst     r4, #0x00000080
-    biceq   r4, r4, #0x000000c0
-    tst     r4, #0x00008000
-    biceq   r4, r4, #0x0000c000
-    tst     r4, #0x00800000
-    biceq   r4, r4, #0x00c00000
-    tst     r4, #0x80000000
-    biceq   r4, r4, #0xc0000000
+    mov     r5, #0x3f             @ build the mask 0x3f3f3f3f in r5 (three steps)
+    orr     r5, r5, r5, lsl #8    @ r5=0x00003f3f
+    orr     r5, r5, r5, lsl #16   @ r5=0x3f3f3f3f
+    and     r4, r4, r5            @ clear bits 6-7 of all four bytes (removed code did this only for bytes with bit 7 clear)
     str     r4, [r1]
     ldr     r4, [r1,#4]
-    tst     r4, #0x00000080
-    biceq   r4, r4, #0x000000c0
-    tst     r4, #0x00008000
-    biceq   r4, r4, #0x0000c000
-    tst     r4, #0x00800000
-    biceq   r4, r4, #0x00c00000
-    tst     r4, #0x80000000
-    biceq   r4, r4, #0xc0000000
+    and     r4, r4, r5            @ same mask applied to the second word
     str     r4, [r1,#4]
     b       .dw_shadow_done
 
@@ -1180,15 +1499,29 @@ FinalizeLineBGR444:
     bne     .fl_loopcpBGR444_hi
 
     sub     r3, r4, #0x40*3*2
+    mov     r6, #1              @ on this fall-through path the eors below then yields 0 (EQ)
 
 
 .fl_noshBGR444:
-    ldr     r1, =(HighCol+8)
+    ldr     r12,=rendstatus
+    eors    r6, r6, #1          @ sh is 0 (flags are consumed by tstne/movne below)
+    ldr     r12,[r12]           @ r12=rendstatus
     mov     lr, #0xff
+    tstne   r12,#(1<<2)         @ and PDRAW_ACC_SPRITES
+
+.if OVERRIDE_HIGHCOL
+    ldr     r1, =HighCol
+    movne   lr, #0x3f           @ only when both tests above left NE: 0x3f mask instead of 0xff
+    ldr     r1, [r1]            @ r1=value stored at HighCol
     mov     lr, lr, lsl #1
+    add     r1, r1, #8          @ same +8 offset as (HighCol+8) in the .else path
+.else
+    ldr     r1, =(HighCol+8)
+    movne   lr, #0x3f           @ only when both tests above left NE: 0x3f mask instead of 0xff
+    mov     lr, lr, lsl #1      @ mask is applied to byte<<1, i.e. halfword offsets for ldrh
+.endif
 
 .fl_loopBGR444:
-
     ldr     r12, [r1], #4
     subs    r2, r2, #1
 
@@ -1198,11 +1531,10 @@ FinalizeLineBGR444:
     ldrh    r5, [r3, r5]
     and     r6, lr, r12, lsr #15
     ldrh    r6, [r3, r6]
+    and     r12,lr, r12, lsr #23       @ top byte is the last use of r12 as source, so it can take the result
+    ldrh    r12,[r3, r12]              @ 1c.i.
     orr     r4, r4, r5, lsl #16
-
-    and     r5, lr, r12, lsr #23
-    ldrh    r5, [r3, r5]              @ 2c.i.
-    orr     r5, r6, r5, lsl #16
+    orr     r5, r6, r12,lsl #16        @ r5 = 3rd | 4th looked-up halfword
 
     stmia   r0!, {r4,r5}
     bne     .fl_loopBGR444
@@ -1255,14 +1587,16 @@ FinalizeLineBGR444:
     orr     \reg, \reg, r3           @ add blue back
 .endm
 
+.global vidConvCpyRGB565             @ make the symbol visible to the linker
+
 vidConvCpyRGB565: @ void *to, void *from, int pixels
     stmfd   sp!, {r4-r9,lr}
 
-    mov     r12, r2, lsr #3 @ repeats
+    mov     r12, r2, lsr #3  @ repeats (pixels / 8)
     mov     lr, #0x001c0000
     orr     lr, lr,  #0x01c  @ lr == pattern 0x001c001c
     mov     r8, #0x00030000
-    orr     r8, r8,  #0x003  @ lr == pattern 0x001c001c
+    orr     r8, r8,  #0x003  @ r8 == pattern 0x00030003
 
 .loopRGB565:
     ldmia   r1!, {r4-r7}
@@ -1335,14 +1669,29 @@ FinalizeLineRGB555:
     bne     .fl_loopcpRGB555_hi
 
     sub     r3, r3, #0x40*2
+    mov     r6, #1              @ on this fall-through path the eors below then yields 0 (EQ)
 
 .fl_noshRGB555:
+    ldr     r12,=rendstatus
+    eors    r6, r6, #1          @ sh is 0 (flags are consumed by tstne/movne below)
+    ldr     r12,[r12]           @ r12=rendstatus
+    mov     lr, #0xff           @ default byte mask (shifted left by 1 further down)
+    tstne   r12,#(1<<2)         @ and PDRAW_ACC_SPRITES
+    movne   lr, #0x3f           @ only when both tests above left NE: 0x3f mask instead of 0xff
+
+.if OVERRIDE_HIGHCOL
+    ldr     r1, =HighCol
+    ldr     r0, =DrawLineDest
+    ldr     r1, [r1]            @ r1=value stored at HighCol
+    ldr     r0, [r0]            @ r0=value stored at DrawLineDest
+    add     r1, r1, #8          @ same +8 offset as (HighCol+8) in the .else path
+.else
     ldr     r0, =DrawLineDest
     ldr     r1, =(HighCol+8)
     ldr     r0, [r0]
+.endif
 
     ldrb    r12, [r8, #12]
-    mov     lr, #0xff
     mov     lr, lr, lsl #1
 
     tst     r12, #1
@@ -1357,8 +1706,14 @@ FinalizeLineRGB555:
     addeq   r0, r0, #32*2
 
 .fl_no32colRGB555:
-.fl_loopRGB555:
 
+.if UNALIGNED_DRAWLINEDEST
+    @ this is basically for Gizmondo, which has unaligned odd lines in the framebuffer
+    tst     r0, #2              @ bit 1 of the destination address set (not word-aligned)?
+    bne     .fl_RGB555u         @ yes: use the unaligned variant of the loop
+.endif
+
+.fl_loopRGB555:
     ldr     r12, [r1], #4
     ldr     r7,  [r1], #4
 
@@ -1380,12 +1735,12 @@ FinalizeLineRGB555:
     ldrh    r6, [r3, r6]
     and     r12,lr, r7, lsr #15
     ldrh    r12,[r3, r12]
+    and     r7, lr, r7, lsr #23      @ top byte is the last use of r7 as source, so it can take the result
+    ldrh    r7, [r3, r7]             @ r7 = last looked-up halfword of this iteration
     orr     r8, r8, r6, lsl #16
 
-    and     r6, lr, r7, lsr #23
-    ldrh    r6, [r3, r6]             @ 1 cycle interlock here (r6)
     subs    r2, r2, #1
-    orr     r12,r12, r6, lsl #16
+    orr     r12,r12, r7, lsl #16     @ r12 = 7th | 8th looked-up halfword
 
     stmia   r0!, {r4,r5,r8,r12}
     bne     .fl_loopRGB555
@@ -1399,6 +1754,11 @@ FinalizeLineRGB555:
     mov     r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
     orr     r9, r9, #0x00e7
 
+.if UNALIGNED_DRAWLINEDEST
+    tst     r0, #2                  @ bit 1 of the destination address set (not word-aligned)?
+    bne     .fl_32scale_RGB555u     @ yes: use the unaligned variant of the scaling loop
+.endif
+
 .fl_loop32scale_RGB555:
     ldr     r12, [r1], #4
     ldr     r7,  [r1], #4
@@ -1455,6 +1815,121 @@ FinalizeLineRGB555:
     ldmfd   sp!, {r4-r8,lr}
     bx      lr
 
+.if UNALIGNED_DRAWLINEDEST
+    @ unaligned versions of loops
+    @ warning: starts drawing 2 bytes before dst
+
+.fl_RGB555u:
+    sub     r0, r0, #2              @ initial adjustment
+    mov     r8, #0                  @ r8=halfword carried into the next store; 0 for the very first one
+
+.fl_loopRGB555u:
+    ldr     r12, [r1], #4
+    ldr     r7,  [r1], #4
+
+    and     r6, lr, r12,lsl #1
+    ldrh    r6, [r3, r6]
+    and     r5, lr, r12,lsr #7
+    ldrh    r5, [r3, r5]
+    orr     r4, r8, r6, lsl #16     @ r4 = carried halfword | pixel 0
+
+    and     r6, lr, r12,lsr #15
+    ldrh    r6, [r3, r6]
+    and     r8, lr, r12,lsr #23
+    ldrh    r8, [r3, r8]
+    orr     r5, r5, r6, lsl #16     @ r5 = pixels 1, 2
+
+    and     r6, lr, r7, lsl #1
+    ldrh    r6, [r3, r6]
+    and     r12,lr, r7, lsr #7
+    ldrh    r12,[r3, r12]
+    orr     r6, r8, r6, lsl #16     @ r6 = pixels 3, 4
+
+    and     r8, lr, r7, lsr #15
+    ldrh    r8, [r3, r8]
+    and     r7, lr, r7, lsr #23
+
+    subs    r2, r2, #1
+    orr     r12,r12,r8, lsl #16     @ r12 = pixels 5, 6
+    ldrh    r8, [r3, r7]            @ r8 = pixel 7, carried into the next iteration (ldrh leaves the flags alone)
+
+    stmia   r0!, {r4,r5,r6,r12}
+    bne     .fl_loopRGB555u
+
+    strh    r8, [r0], #2            @ store the last carried pixel
+
+    ldmfd   sp!, {r4-r8,lr}
+    bx      lr
+
+
+.fl_32scale_RGB555u:
+    sub     r0, r0, #2              @ initial adjustment
+    mov     r4, #0                  @ r4=halfword carried into the next store; 0 for the very first one
+
+    @ r9  f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
+.fl_loop32scale_RGB555u:
+    ldr     r12, [r1], #4
+    ldr     r7,  [r1], #4
+
+    and     r6, lr, r12,lsl #1
+    ldrh    r6, [r3, r6]
+    and     r5, lr, r12,lsr #7
+    ldrh    r5, [r3, r5]
+    and     r6, r6, r9, lsl #2
+    orr     r4, r4, r6, lsl #16       @ r4 = pix_d -1, 0
+
+    and     r5, r5, r9, lsl #2
+    sub     r8, r5, r5, lsr #2        @ r8 = 3/4 pix_s 1
+    add     r6, r8, r6, lsr #2        @ r6 = (1/4 pix_s 0) + (3/4 pix_s 1)
+    orr     r5, r6, r5, lsl #15
+
+    and     r6, lr, r12,lsr #15
+    ldrh    r6, [r3, r6]
+    and     r12,lr, r12,lsr #23
+    ldrh    r12,[r3, r12]
+    and     r6, r6, r9, lsl #2
+    add     r5, r5, r6, lsl #15       @ r5 = pix_d 1, 2
+
+    and     r8, lr, r7, lsl #1
+    ldrh    r8, [r3, r8]
+    and     r10,lr, r7, lsr #7
+    ldrh    r10,[r3, r10]
+    and     r12,r12,r9, lsl #2
+    sub     r6, r6, r6, lsr #2        @ r6 = 3/4 pix_s 2
+    add     r6, r6, r12,lsr #2
+    orr     r6, r6, r12,lsl #16       @ r6 = pix_d 3, 4
+
+    and     r8, r8, r9, lsl #2
+    and     r10,r10,r9, lsl #2
+    sub     r12,r10,r10,lsr #2        @ r12 = 3/4 pix_s 5
+    orr     r8, r8, r8, lsl #14
+    add     r8, r8, r12,lsl #16       @ r8 = pix_d 5, 6
+    and     r12,lr, r7, lsr #15
+    ldrh    r12,[r3, r12]
+    and     r7, lr, r7, lsr #23
+    ldrh    r7, [r3, r7]
+    and     r12,r12,r9, lsl #2
+    add     r10,r10,r12
+    mov     r10,r10,    lsr #1
+    sub     r12,r12,r12,lsr #2        @ r12 = 3/4 pix_s 6
+    orr     r10,r10,r12,lsl #16
+    and     r7, r7, r9, lsl #2
+    add     r10,r10,r7, lsl #14       @ r10 = pix_d 7, 8
+
+    subs    r2, r2, #1
+
+    stmia   r0!, {r4,r5,r6,r8,r10}
+    mov     r4, r7                    @ carry the last (masked) pixel over; mov keeps the flags from subs for bne
+    bne     .fl_loop32scale_RGB555u
+
+    strh    r4, [r0], #2              @ store the last carried pixel
+
+    ldmfd   sp!, {r9,r10}             @ NOTE(review): the matching push is not visible in this hunk - presumably done on the 32col path
+    ldmfd   sp!, {r4-r8,lr}
+    bx      lr
+
+.endif @ UNALIGNED_DRAWLINEDEST
+
 
 @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@