lots of win32 port work

[picodrive.git] / Pico / Draw.s
diff --git a/Pico/Draw.s b/Pico/Draw.s

index 91a647c..3a3da81 100644 (file)
--- a/Pico/Draw.s
+++ b/Pico/Draw.s
@@ -6,6 +6,7 @@
  @ (c) Copyright 2007, Grazvydas "notaz" Ignotas\r
  @ All Rights Reserved\r
  \r
+.include "port_config.s"\r
  \r
  .extern Pico\r
  .extern PicoOpt\r
@@ -69,7 +70,7 @@
      streqb  r4, [r1,#\offs]\r
  .endm\r
  \r
-@ TileNorm (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits\r
+@ TileNormShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf, touches r3 high bits\r
  .macro TileNormShHP\r
      TilePixelShHP 12, 0         @ #0x0000f000\r
      TilePixelShHP  8, 1         @ #0x00000f00\r
@@ -81,7 +82,7 @@
      TilePixelShHP 16, 7         @ #0x000f0000\r
  .endm\r
  \r
-@ TileFlip (r1=pdest, r2=pixels8, r3=pal) r4: scratch, pat: register with helper pattern 0xf\r
+@ TileFlipShHP (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: register with helper pattern 0xf\r
  .macro TileFlipShHP\r
      TilePixelShHP 16, 0         @ #0x000f0000\r
      TilePixelShHP 20, 1         @ #0x00f00000\r
@@ -211,9 +212,9 @@
  @   int cells;   // 0x14\r
  @ };\r
  \r
-@ int DrawLayer(int plane, int *hcache, int maxcells, int sh)\r
+@ void DrawLayer(int plane_sh, int *hcache, int cellskip, int maxcells);\r
  \r
-.global DrawLayer @ int plane, int *hcache, int maxcells, int sh\r
+.global DrawLayer\r
  \r
  DrawLayer:\r
      stmfd   sp!, {r4-r11,lr}\r
@@ -221,10 +222,11 @@ DrawLayer:
      ldr     r11, =(Pico+0x22228)  @ Pico.video\r
      mov     r8, #1\r
  \r
-    ldrb    r7, [r11, #16]        @ ??hh??ww\r
+    ldrb    r7, [r11, #16]        @ ??vv??hh\r
  \r
      mov     r6, r1                @ hcache\r
-    orr     r9, r2, r3, lsl #31   @ r9=maxcells|(sh<<31)\r
+    orr     r9, r3, r0, lsl #30\r
+    orr     r9, r9, r2, lsl #8    @ r9=sh[31]|cellskip[15:8]|maxcells[7:0]  (tmp)\r
  \r
      mov     r1, r7, lsl #4\r
      orr     r1, r1, #0x00ff\r
@@ -244,7 +246,7 @@ DrawLayer:
      sub     r5, r5, #1            @ r5=xmask\r
  \r
      @ Find name table:\r
-    tst     r0,  r0\r
+    ands    r0,  r0, #1\r
      ldreqb  r12, [r11, #2]\r
      ldrneb  r12, [r11, #4]\r
  \r
@@ -309,14 +311,25 @@ DrawLayer:
      orrne   r10,r10, #1<<23 @ r10=(cells<<24|sh<<23|hi_not_empty<<22|had_output<<21|ty)\r
      movne   r3, #0x40       @ default to shadowed pal on sh mode\r
  \r
-    mvn     r9, #0          @ r9=prevcode=-1\r
-\r
      cmp     r7, #8\r
      addne   r10,r10, #0x01000000 @ we will loop cells+1 times if there is scroll\r
  \r
+    and     r9, r9, #0xff00\r
+    add     r8, r8, r9, lsr #8   @ tilex+=cellskip\r
+    add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;\r
+    sub     r10,r10,r9, lsl #16  @ cells-=cellskip\r
+\r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
      ldr     r11,=HighCol\r
      mov     r0, #0xf\r
+    ldr     r11,[r11]\r
+.else\r
+    ldr     r11,=HighCol\r
+    mov     r0, #0xf\r
+.endif\r
+\r
+    mvn     r9, #0               @ r9=prevcode=-1\r
      add     r1, r11, r7         @ r1=pdest\r
  \r
  \r
@@ -364,16 +377,17 @@ DrawLayer:
      beq     .DrawStrip_SingleColor @ tileline singlecolor \r
  \r
      tst     r9, #0x0800\r
-    beq     .DrawStrip_TileNorm\r
+    bne     .DrawStrip_TileFlip\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern\r
-    TileFlip r0\r
-    b       .dsloop\r
-\r
  .DrawStrip_TileNorm:\r
      TileNorm r0\r
      b       .dsloop\r
  \r
+.DrawStrip_TileFlip:\r
+    TileFlip r0\r
+    b       .dsloop\r
+\r
  .DrawStrip_SingleColor:\r
      and     r4, r2, #0xf\r
      orr     r4, r3, r4\r
@@ -431,7 +445,7 @@ DrawLayer:
      rsb     r8, r3, #0\r
      mov     r8, r8, lsr #3        @ r8=tilex=(-ts->hscroll)>>3\r
      bic     r8, r8, #0xff000000\r
-    orr     r8, r8, r5, lsl #25   @ r8=(xmask[31:25]|had_output[24]|tilex[15:0])\r
+    orr     r8, r8, r5, lsl #25   @ r8=(xmask[31:25]|had_output[24]|tilex[23:0])\r
  \r
      ldr     r4, =Scanline\r
      orr     r5, r1, r10, lsl #24\r
@@ -442,24 +456,34 @@ DrawLayer:
      add     r7, r1, #1            @ r7=dx=((ts->hscroll-1)&7)+1\r
  \r
      mov     r10,r9, lsl #16\r
-    tst     r0, r0\r
+    tst     r0, #1\r
      orrne   r10,r10, #0x8000\r
      tst     r9, #1<<31\r
      mov     r3, #0\r
      orr     r10,r10, #0xff000000 @ will be adjusted on entering loop\r
-    orrne   r10,r10, #1<<23 @ r10=(cells[31:24]|sh[23]|hi_not_empty[22]|cells_max[21:16]|plane[15]|ty[14:0])\r
+    orrne   r10,r10, #1<<23 @ r10=(cell[31:24]|sh[23]|hi_not_empty[22]|cells_max[21:16]|plane[15]|ty[14:0])\r
      movne   r3, #0x40       @ default to shadowed pal on sh mode\r
  \r
-    mvn     r9, #0          @ r9=prevcode=-1\r
+    cmp     r7, #8\r
+    subne   r10,r10, #0x01000000 @ have hscroll, start with negative cell\r
+\r
+    and     r9, r9, #0xff00\r
+    add     r8, r8, r9, lsr #8   @ tilex+=cellskip\r
+    add     r7, r7, r9, lsr #5   @ dx+=cellskip<<3;\r
+    add     r10,r10,r9, lsl #16  @ cell+=cellskip\r
  \r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
      ldr     r11,=HighCol\r
      mov     r0, #0xf\r
-    add     r1, r11, r7         @ r1=pdest\r
-\r
-    cmp     r7, #8\r
-    subne   r10,r10, #0x01000000 @ have hscroll, start with negative cell\r
+    ldr     r11,[r11]\r
+.else\r
+    ldr     r11,=HighCol\r
+    mov     r0, #0xf\r
+.endif\r
  \r
+    mvn     r9, #0               @ r9=prevcode=-1\r
+    add     r1, r11, r7          @ r1=pdest\r
  \r
      @ r4 & r7 are scratch in this loop\r
  .dsloop_vs_subr1:\r
@@ -527,16 +551,17 @@ DrawLayer:
      beq     .DrawStrip_vs_SingleColor @ tileline singlecolor \r
  \r
      tst     r9, #0x0800\r
-    beq     .DrawStrip_vs_TileNorm\r
+    bne     .DrawStrip_vs_TileFlip\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern\r
-    TileFlip r0\r
-    b       .dsloop_vs\r
-\r
  .DrawStrip_vs_TileNorm:\r
      TileNorm r0\r
      b       .dsloop_vs\r
  \r
+.DrawStrip_vs_TileFlip:\r
+    TileFlip r0\r
+    b       .dsloop_vs\r
+\r
  .DrawStrip_vs_SingleColor:\r
      and     r4, r2, #0xf\r
      orr     r4, r3, r4\r
@@ -628,10 +653,18 @@ DrawLayer:
  BackFill:\r
      stmfd   sp!, {r4-r9,lr}\r
  \r
+.if OVERRIDE_HIGHCOL\r
+    ldr     lr, =HighCol\r
+    mov     r0, r0, lsl #26\r
+    ldr     lr, [lr]\r
+    mov     r0, r0, lsr #26\r
+    add     lr, lr, #8\r
+.else\r
      ldr     lr, =(HighCol+8)\r
-\r
      mov     r0, r0, lsl #26\r
      mov     r0, r0, lsr #26\r
+.endif\r
+\r
      orr     r0, r0, r1, lsl #6\r
      orr     r0, r0, r0, lsl #8\r
      orr     r0, r0, r0, lsl #16\r
@@ -663,18 +696,25 @@ BackFill:
  @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r
  \r
  \r
-.global DrawTilesFromCache @ int *hc, int sh\r
+.global DrawTilesFromCache @ int *hc, int sh, int rlim\r
  \r
  DrawTilesFromCache:\r
      stmfd   sp!, {r4-r8,r11,lr}\r
  \r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
      ldr     r11,=HighCol\r
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram\r
      mov     r12,#0xf\r
+    ldr     r11,[r11]\r
+.else\r
+    ldr     r11,=HighCol\r
+    mov     r12,#0xf\r
+.endif\r
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram\r
  \r
      mvn     r5, #0         @ r5=prevcode=-1\r
-    movs    r8, r1\r
+    ands    r8, r1, #1\r
+    orr     r8, r8, r2, lsl #1\r
      bne     .dtfc_check_rendflags\r
  \r
      @ scratch: r4, r7\r
@@ -682,8 +722,8 @@ DrawTilesFromCache:
      ldr     r6, [r0], #4    @ read code\r
      movs    r1, r6, lsr #16 @ r1=dx;\r
      ldmeqfd sp!, {r4-r8,r11,pc} @ dx is never zero, this must be a terminator, return\r
-    bic     r1, r1, #0xfe00\r
-    add     r1, r11, r1     @ r1=pdest\r
+    bic     r4, r1, #0xfe00\r
+    add     r1, r11, r4     @ r1=pdest\r
  \r
      mov     r7, r6, lsl #16\r
      cmp     r5, r7, lsr #16\r
@@ -701,7 +741,10 @@ DrawTilesFromCache:
      ldr     r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels\r
  \r
  .dtfc_samecode:\r
-    tst     r8, r8\r
+    rsbs    r4, r4, r8, lsr #1\r
+    bmi     .dtfc_cut_tile\r
+\r
+    tst     r8, #1\r
      bne     .dtfc_shadow\r
  \r
      tst     r2, r2\r
@@ -711,16 +754,17 @@ DrawTilesFromCache:
      beq     .dtfc_SingleColor @ tileline singlecolor \r
  \r
      tst     r5, #0x0800\r
-    beq     .dtfc_TileNorm\r
+    bne     .dtfc_TileFlip\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlip r12\r
-    b       .dtfc_loop\r
-\r
  .dtfc_TileNorm:\r
      TileNorm r12\r
      b       .dtfc_loop\r
  \r
+.dtfc_TileFlip:\r
+    TileFlip r12\r
+    b       .dtfc_loop\r
+\r
  .dtfc_SingleColor:\r
      and     r4, r2, #0xf\r
      orr     r4, r3, r4\r
@@ -742,16 +786,17 @@ DrawTilesFromCache:
      beq     .dtfc_SingleColor @ tileline singlecolor \r
  \r
      tst     r5, #0x0800\r
-    beq     .dtfc_TileNormShHP\r
+    bne     .dtfc_TileFlipShHP\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlipShHP\r
-    b       .dtfc_loop\r
-\r
  .dtfc_TileNormShHP:\r
      TileNormShHP\r
      b       .dtfc_loop\r
  \r
+.dtfc_TileFlipShHP:\r
+    TileFlipShHP\r
+    b       .dtfc_loop\r
+\r
  .dtfc_shadow_blank:\r
      ldrb    r4, [r1]        @ 1ci\r
      ldrb    r12,[r1,#1]\r
@@ -788,13 +833,32 @@ DrawTilesFromCache:
      mov     r12, #0xf\r
      b       .dtfc_loop\r
  \r
+.dtfc_cut_tile:             @ tile crosses the right limit; here r4 = (r8>>1) - dx, which is negative\r
+    add     r4, r4, #7      @ 0-6 = (pixels to keep) - 1\r
+    mov     r4, r4, lsl #2  @ 4 bits per pixel nibble -> shift amount in bits\r
+    tst     r5, #0x0800     @ flipped?\r
+    rsbne   r4, r4, #24     @ flipped: mask gets inverted below, so shift by the complement to keep the same pixel count\r
+    mov     r12,#0xf<<28\r
+    mov     r12,r12,asr r4  @ sign-fill: top (r4/4)+1 nibbles set\r
+    mvnne   r12,r12         @ flipped: leftmost pixels sit in the low nibbles after the halfword swap\r
+    and     r2, r12, r2, ror #16 @ swap halfwords so pixel nibbles are contiguous, drop the hidden ones\r
+    mov     r2, r2, ror #16 @ swap back to the layout the Tile* macros expect\r
+    mov     r12,#0xf        @ restore helper pattern\r
+    tst     r8, #1          @ sh mode bit\r
+    bne     .dtfc_shadow\r
+    tst     r2, r2          @ nothing left to draw after clipping?\r
+    beq     .dtfc_loop\r
+    tst     r5, #0x0800\r
+    beq     .dtfc_TileNorm\r
+    b       .dtfc_TileFlip\r
+\r
  @ check if we have detected layer covered with hi-prio tiles:\r
  .dtfc_check_rendflags:\r
      ldr     r1, =rendstatus\r
      ldr     r2, [r1]\r
      tst     r2, #0xc0\r
      beq     .dtfc_loop\r
-    mov     r8, #0          @ sh/hi mode off\r
+    bic     r8, r8, #1      @ sh/hi mode off\r
      tst     r2, #0x80\r
      bne     .dtfc_loop      @ already processed\r
      orr     r2, r2, #0x80\r
@@ -838,11 +902,17 @@ DrawSpritesFromCache:
      stmfd   sp!, {r4-r11,lr}\r
  \r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
      ldr     r11,=HighCol\r
+    mov     r12,#0xf\r
+    ldr     r11,[r11]\r
+.else\r
+    ldr     r11,=HighCol\r
+    mov     r12,#0xf\r
+.endif\r
      ldr     lr, =(Pico+0x10000) @ lr=Pico.vram\r
      mov     r6, r1, lsl #31\r
      orr     r6, r6, #1<<30\r
-    mov     r12,#0xf\r
  \r
      mov     r10, r0\r
  \r
@@ -903,16 +973,17 @@ DrawSpritesFromCache:
      beq     .dsfc_SingleColor @ tileline singlecolor \r
  \r
      tst     r9, #0x10000\r
-    beq     .dsfc_TileNorm\r
+    bne     .dsfc_TileFlip\r
  \r
      @ TileFlip (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlip r12\r
-    b       .dsfc_inloop\r
-\r
  .dsfc_TileNorm:\r
      TileNorm r12\r
      b       .dsfc_inloop\r
  \r
+.dsfc_TileFlip:\r
+    TileFlip r12\r
+    b       .dsfc_inloop\r
+\r
  .dsfc_SingleColor:\r
      tst     r0, #1              @ not aligned?\r
      and     r4, r2, #0xf\r
@@ -931,16 +1002,17 @@ DrawSpritesFromCache:
      beq     .dsfc_singlec_sh\r
  \r
      tst     r9, #0x10000\r
-    beq     .dsfc_TileNorm_sh\r
+    bne     .dsfc_TileFlip_sh\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlipSh\r
-    b       .dsfc_inloop\r
-\r
  .dsfc_TileNorm_sh:\r
      TileNormSh\r
      b       .dsfc_inloop\r
  \r
+.dsfc_TileFlip_sh:\r
+    TileFlipSh\r
+    b       .dsfc_inloop\r
+\r
  .dsfc_singlec_sh:\r
      cmp     r2, #0xe0000000\r
      bcc     .dsfc_SingleColor   @ normal singlecolor tileline (carry inverted in ARM)\r
@@ -1004,9 +1076,15 @@ DrawSprite:
      bne     .dspr_cache       @ if(code&0x8000) // high priority - cache it\r
  \r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
      ldr     r11,=HighCol\r
-    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram\r
      mov     r12,#0xf\r
+    ldr     r11,[r11]\r
+.else\r
+    ldr     r11,=HighCol\r
+    mov     r12,#0xf\r
+.endif\r
+    ldr     lr, =(Pico+0x10000) @ lr=Pico.vram\r
  \r
      mov     r5, r5, lsl #4     @ delta<<=4; // Delta of address\r
      and     r4, r9, #0x6000\r
@@ -1047,17 +1125,18 @@ DrawSprite:
      beq     .dspr_SingleColor @ tileline singlecolor \r
  \r
      tst     r9, #0x0800\r
-    beq     .dspr_TileNorm\r
+    bne     .dspr_TileFlip\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlip r12\r
-    b       .dspr_loop\r
-\r
  @ scratch: r4, r7\r
  .dspr_TileNorm:\r
      TileNorm r12\r
      b       .dspr_loop\r
  \r
+.dspr_TileFlip:\r
+    TileFlip r12\r
+    b       .dspr_loop\r
+\r
  .dspr_SingleColor:\r
      and     r4, r2, #0xf\r
      orr     r4, r3, r4\r
@@ -1076,16 +1155,17 @@ DrawSprite:
      beq     .dspr_singlec_sh\r
  \r
      tst     r9, #0x0800\r
-    beq     .dspr_TileNorm_sh\r
+    bne     .dspr_TileFlip_sh\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r12: helper pattern\r
-    TileFlipSh\r
-    b       .dspr_loop\r
-\r
  .dspr_TileNorm_sh:\r
      TileNormSh\r
      b       .dspr_loop\r
  \r
+.dspr_TileFlip_sh:\r
+    TileFlipSh\r
+    b       .dspr_loop\r
+\r
  .dspr_singlec_sh:\r
      cmp     r2, #0xe0000000\r
      bcc     .dspr_SingleColor   @ normal tileline\r
@@ -1162,12 +1242,19 @@ DrawWindow:
      orr     r6, r6, r3, lsl #8    @ shadow mode\r
  \r
      sub     r8, r1, r0\r
-    mov     r8, r8, lsl #1        @ cells\r
-\r
-    mvn     r9, #0                @ r9=prevcode=-1\r
  \r
      @ cache some stuff to avoid mem access\r
+.if OVERRIDE_HIGHCOL\r
+    ldr     r11,=HighCol\r
+    mov     r8, r8, lsl #1        @ cells\r
+    ldr     r11,[r11]\r
+    mvn     r9, #0                @ r9=prevcode=-1\r
+    add     r11,r11,#8\r
+.else\r
      ldr     r11,=(HighCol+8)\r
+    mov     r8, r8, lsl #1        @ cells\r
+    mvn     r9, #0                @ r9=prevcode=-1\r
+.endif\r
      add     r1, r11, r0, lsl #4 @ r1=pdest\r
      mov     r0, #0xf\r
      b       .dwloop_enter\r
@@ -1213,16 +1300,17 @@ DrawWindow:
      beq     .dw_SingleColor @ tileline singlecolor \r
  \r
      tst     r9, #0x0800\r
-    beq     .dw_TileNorm\r
+    bne     .dw_TileFlip\r
  \r
      @ (r1=pdest, r2=pixels8, r3=pal) r4: scratch, r0: helper pattern\r
-    TileFlip r0\r
-    b       .dwloop\r
-\r
  .dw_TileNorm:\r
      TileNorm r0\r
      b       .dwloop\r
  \r
+.dw_TileFlip:\r
+    TileFlip r0\r
+    b       .dwloop\r
+\r
  .dw_SingleColor:\r
      and     r4, r0, r2         @ #0x0000000f\r
      orr     r4, r3, r4\r
@@ -1367,9 +1455,17 @@ FinalizeLineBGR444:
  \r
  \r
  .fl_noshBGR444:\r
+.if OVERRIDE_HIGHCOL\r
+    ldr     r1, =HighCol\r
+    mov     lr, #0xff\r
+    ldr     r1, [r1]\r
+    mov     lr, lr, lsl #1\r
+    add     r1, r1, #8\r
+.else\r
      ldr     r1, =(HighCol+8)\r
      mov     lr, #0xff\r
      mov     lr, lr, lsl #1\r
+.endif\r
  \r
  .fl_loopBGR444:\r
  \r
@@ -1439,14 +1535,16 @@ FinalizeLineBGR444:
      orr     \reg, \reg, r3           @ add blue back\r
  .endm\r
  \r
+.global vidConvCpyRGB565\r
+\r
  vidConvCpyRGB565: @ void *to, void *from, int pixels\r
      stmfd   sp!, {r4-r9,lr}\r
  \r
-    mov     r12, r2, lsr #3 @ repeats\r
+    mov     r12, r2, lsr #3  @ repeats\r
      mov     lr, #0x001c0000\r
      orr     lr, lr,  #0x01c  @ lr == pattern 0x001c001c\r
      mov     r8, #0x00030000\r
-    orr     r8, r8,  #0x003  @ lr == pattern 0x001c001c\r
+    orr     r8, r8,  #0x003\r
  \r
  .loopRGB565:\r
      ldmia   r1!, {r4-r7}\r
@@ -1521,9 +1619,17 @@ FinalizeLineRGB555:
      sub     r3, r3, #0x40*2\r
  \r
  .fl_noshRGB555:\r
+.if OVERRIDE_HIGHCOL\r
+    ldr     r1, =HighCol\r
+    ldr     r0, =DrawLineDest\r
+    ldr     r1, [r1]\r
+    ldr     r0, [r0]\r
+    add     r1, r1, #8\r
+.else\r
      ldr     r0, =DrawLineDest\r
      ldr     r1, =(HighCol+8)\r
      ldr     r0, [r0]\r
+.endif\r
  \r
      ldrb    r12, [r8, #12]\r
      mov     lr, #0xff\r
@@ -1541,8 +1647,14 @@ FinalizeLineRGB555:
      addeq   r0, r0, #32*2\r
  \r
  .fl_no32colRGB555:\r
-.fl_loopRGB555:\r
  \r
+.if UNALIGNED_DRAWLINEDEST\r
+    @ this is basically for Gizmondo, which has unaligned odd lines in the framebuffer\r
+    tst     r0, #2\r
+    bne     .fl_RGB555u\r
+.endif\r
+\r
+.fl_loopRGB555:\r
      ldr     r12, [r1], #4\r
      ldr     r7,  [r1], #4\r
  \r
@@ -1564,12 +1676,12 @@ FinalizeLineRGB555:
      ldrh    r6, [r3, r6]\r
      and     r12,lr, r7, lsr #15\r
      ldrh    r12,[r3, r12]\r
+    and     r7, lr, r7, lsr #23\r
+    ldrh    r7, [r3, r7]\r
      orr     r8, r8, r6, lsl #16\r
  \r
-    and     r6, lr, r7, lsr #23\r
-    ldrh    r6, [r3, r6]             @ 1 cycle interlock here (r6)\r
      subs    r2, r2, #1\r
-    orr     r12,r12, r6, lsl #16\r
+    orr     r12,r12, r7, lsl #16\r
  \r
      stmia   r0!, {r4,r5,r8,r12}\r
      bne     .fl_loopRGB555\r
@@ -1583,6 +1695,11 @@ FinalizeLineRGB555:
      mov     r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007\r
      orr     r9, r9, #0x00e7\r
  \r
+.if UNALIGNED_DRAWLINEDEST\r
+    tst     r0, #2\r
+    bne     .fl_32scale_RGB555u\r
+.endif\r
+\r
  .fl_loop32scale_RGB555:\r
      ldr     r12, [r1], #4\r
      ldr     r7,  [r1], #4\r
@@ -1639,6 +1756,121 @@ FinalizeLineRGB555:
      ldmfd   sp!, {r4-r8,lr}\r
      bx      lr\r
  \r
+.if UNALIGNED_DRAWLINEDEST\r
+    @ unaligned versions of loops\r
+    @ warning: starts writing 2 bytes before dst (that first halfword is stored as 0)\r
+\r
+.fl_RGB555u:                        @ variant of .fl_loopRGB555 taken when (r0 & 2) != 0\r
+    sub     r0, r0, #2              @ initial adjustment: step back one halfword so the stmia below stores whole words from a cleared bit 1\r
+    mov     r8, #0                  @ r8 = halfword carried into the low half of the next word; 0 for the very first one\r
+\r
+.fl_loopRGB555u:                    @ per iteration: 8 source bytes in, 4 words out, 1 pixel carried over\r
+    ldr     r12, [r1], #4           @ source bytes 0-3\r
+    ldr     r7,  [r1], #4           @ source bytes 4-7\r
+\r
+    and     r6, lr, r12,lsl #1      @ byte 0 * 2 (NOTE(review): lr is expected to be the 0x1fe offset mask; its setup is only partly visible here)\r
+    ldrh    r6, [r3, r6]            @ pixel 0 from the halfword table at r3\r
+    and     r5, lr, r12,lsr #7      @ byte 1 * 2\r
+    ldrh    r5, [r3, r5]            @ pixel 1\r
+    orr     r4, r8, r6, lsl #16     @ word 0 = carried pixel | pixel 0 << 16\r
+\r
+    and     r6, lr, r12,lsr #15     @ byte 2 * 2\r
+    ldrh    r6, [r3, r6]            @ pixel 2\r
+    and     r8, lr, r12,lsr #23     @ byte 3 * 2\r
+    ldrh    r8, [r3, r8]            @ pixel 3\r
+    orr     r5, r5, r6, lsl #16     @ word 1 = pixel 1 | pixel 2 << 16\r
+\r
+    and     r6, lr, r7, lsl #1      @ byte 4 * 2\r
+    ldrh    r6, [r3, r6]            @ pixel 4\r
+    and     r12,lr, r7, lsr #7      @ byte 5 * 2\r
+    ldrh    r12,[r3, r12]           @ pixel 5\r
+    orr     r6, r8, r6, lsl #16     @ word 2 = pixel 3 | pixel 4 << 16\r
+\r
+    and     r8, lr, r7, lsr #15     @ byte 6 * 2\r
+    ldrh    r8, [r3, r8]            @ pixel 6\r
+    and     r7, lr, r7, lsr #23     @ byte 7 * 2\r
+\r
+    subs    r2, r2, #1              @ loop counter; flags survive until the bne (orr/ldrh/stmia do not set them)\r
+    orr     r12,r12,r8, lsl #16     @ word 3 = pixel 5 | pixel 6 << 16\r
+    ldrh    r8, [r3, r7]            @ pixel 7 becomes the carry for the next iteration\r
+\r
+    stmia   r0!, {r4,r5,r6,r12}\r
+    bne     .fl_loopRGB555u\r
+\r
+    strh    r8, [r0], #2            @ flush the last carried pixel\r
+\r
+    ldmfd   sp!, {r4-r8,lr}         @ pairs with a register save that is outside this hunk\r
+    bx      lr\r
+\r
+\r
+.fl_32scale_RGB555u:                @ variant of .fl_loop32scale_RGB555 taken when (r0 & 2) != 0\r
+    sub     r0, r0, #2              @ initial adjustment\r
+    mov     r4, #0                  @ r4 low half = halfword carried from the previous group; 0 for the very first one\r
+\r
+    @ r9  f800 07e0 001f | e000 0780 001c | 3800 01e0 0007\r
+.fl_loop32scale_RGB555u:            @ per iteration: 8 source bytes in, 5 words out, 1 pixel carried over\r
+    ldr     r12, [r1], #4           @ source bytes 0-3\r
+    ldr     r7,  [r1], #4           @ source bytes 4-7\r
+\r
+    and     r6, lr, r12,lsl #1\r
+    ldrh    r6, [r3, r6]\r
+    and     r5, lr, r12,lsr #7\r
+    ldrh    r5, [r3, r5]\r
+    and     r6, r6, r9, lsl #2\r
+    orr     r4, r4, r6, lsl #16       @ r4 = pix_d -1, 0\r
+\r
+    and     r5, r5, r9, lsl #2\r
+    sub     r8, r5, r5, lsr #2        @ r8 = 3/4 pix_s 1\r
+    add     r6, r8, r6, lsr #2        @ r6 = (1/4 pix_s 0) + (3/4 pix_s 1)\r
+    orr     r5, r6, r5, lsl #15\r
+\r
+    and     r6, lr, r12,lsr #15\r
+    ldrh    r6, [r3, r6]\r
+    and     r12,lr, r12,lsr #23\r
+    ldrh    r12,[r3, r12]\r
+    and     r6, r6, r9, lsl #2\r
+    add     r5, r5, r6, lsl #15       @ r5 = pix_d 1, 2\r
+\r
+    and     r8, lr, r7, lsl #1\r
+    ldrh    r8, [r3, r8]\r
+    and     r10,lr, r7, lsr #7\r
+    ldrh    r10,[r3, r10]\r
+    and     r12,r12,r9, lsl #2\r
+    sub     r6, r6, r6, lsr #2        @ r6 = 3/4 pix_s 2\r
+    add     r6, r6, r12,lsr #2\r
+    orr     r6, r6, r12,lsl #16       @ r6 = pix_d 3, 4\r
+\r
+    and     r8, r8, r9, lsl #2\r
+    and     r10,r10,r9, lsl #2\r
+    sub     r12,r10,r10,lsr #2        @ r12 = 3/4 pix_s 5\r
+    orr     r8, r8, r8, lsl #14\r
+    add     r8, r8, r12,lsl #16       @ r8 = pix_d 5, 6\r
+    and     r12,lr, r7, lsr #15\r
+    ldrh    r12,[r3, r12]\r
+    and     r7, lr, r7, lsr #23\r
+    ldrh    r7, [r3, r7]\r
+    and     r12,r12,r9, lsl #2\r
+    add     r10,r10,r12\r
+    mov     r10,r10,    lsr #1\r
+    sub     r12,r12,r12,lsr #2        @ r12 = 3/4 pix_s 6\r
+    orr     r10,r10,r12,lsl #16\r
+    and     r7, r7, r9, lsl #2\r
+    add     r10,r10,r7, lsl #14       @ r10 = pix_d 7, 8\r
+\r
+    subs    r2, r2, #1                @ loop counter; flags survive until the bne (stmia/mov do not set them)\r
+\r
+    stmia   r0!, {r4,r5,r6,r8,r10}\r
+    mov     r4, r7                    @ masked pix_s 7 is carried into the low half of the next group's first word\r
+    bne     .fl_loop32scale_RGB555u\r
+\r
+    strh    r4, [r0], #2              @ flush the last carried pixel\r
+\r
+    ldmfd   sp!, {r9,r10}             @ pairs with a separate save of r9,r10 that is outside this hunk\r
+    ldmfd   sp!, {r4-r8,lr}           @ pairs with a register save that is outside this hunk\r
+    bx      lr\r
+\r
+.endif @ UNALIGNED_DRAWLINEDEST\r
+\r
  \r
  @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\r
  \r