+@ r4,r6 - counters; r5 - 32x data; r9,r10 - md,32x pal; r11 - md data
+@ r7,r8,r12,lr - temp. Below: fetch the next two 32x pixel indices (one byte each) and dispatch.
+ tst r5, #1 @ which byte holds which pixel depends on the parity of r5
+ ldreqb r8, [r5], #2 @ even r5: r8 = [r5], then r5 += 2
+ ldrb r7, [r5, #-1] @ even case: r7 = [old r5 + 1]; odd case: r7 = [r5 - 1]
+ ldrneb r8, [r5, #2]! @ odd r5: r5 += 2, then r8 = [new r5]; r7,r8 - pixel 0,1 index
+ subs r6, r6, #1 @ r6 counts pixel pairs (it is compared against byte counts as r6*2 further down)
+ blt 0b @ loop_outer, taken when the counter dropped below zero
+ cmp r7, r8
+ beq 5f @ check_fill @ +8; both pixels share one index, so look for a longer run
+
+3: @ no_fill: draw one pixel pair; on entry r7,r8 hold the 32x palette indices of pixel 0 and 1
+ mov r12,r7, lsl #1 @ index -> byte offset into the table of halfwords at r10
+ mov lr, r8, lsl #1
+ ldrh r7, [r10,r12] @ r7 = 32x palette entry for pixel 0
+ ldrh r8, [r10,lr] @ r8 = 32x palette entry for pixel 1
+ add r11,r11,#2 @ step over both MD pixels; they are read back at [r11,#-2] and [r11,#-1]
+
+ eor r12,r7, #0x20 @ invert bit 0x20 so the tst below yields NE when that bit is clear in r7
+ tst r12,#0x20 @ EQ: bit 0x20 is set in the entry, the 32x value is used without reading MD
+ ldrneb r12,[r11,#-2] @ MD pixel 0
+ eor lr, r8, #0x20 @ same preparation for pixel 1; does not touch the flags
+ cmpne r3, r12, lsl #26 @ MD has bg pixel? (r3 is set up outside this view)
+.if \do_md
+ mov r12,r12,lsl #1 @ MD pixel -> byte offset; no flag update, so NE/EQ from above still holds
+ ldrneh r7, [r9, r12] @ t = palmd[pmd[0]]
+ tst lr, #0x20 @ flags for pixel 0 are no longer needed from here on
+ ldrneb lr, [r11,#-1] @ MD pixel 1
+ strh r7, [r0], #2 @ unconditional store: r7 is the 32x entry or the value loaded from the r9 table
+ cmpne r3, lr, lsl #26 @ MD has bg pixel?
+ mov lr, lr, lsl #1
+ ldrneh r8, [r9, lr] @ t = palmd[pmd[1]]
+ strh r8, [r0], #2
+.else
+ streqh r7, [r0] @ store only when the 32x pixel is chosen; otherwise [r0] is left untouched
+ tst lr, #0x20
+ ldrneb lr, [r11,#-1] @ MD pixel 1
+ add r0, r0, #4 @ advance past both output pixels (add leaves the flags alone)
+ cmpne r3, lr, lsl #26 @ MD has bg pixel?
+ streqh r8, [r0, #-2] @ pixel 1 slot, addressed relative to the already advanced r0
+.endif
+ b 2b @ loop_inner
+
+5: @ check_fill: r7 == r8 here; test whether the same index continues so that a run can be filled
+ @ count pixels, align if needed
+ bic r12,r5, #1 @ r5 with bit 0 cleared, i.e. a halfword-aligned address
+ ldrh r12,[r12]
+ orr lr, r7, r7, lsl #8 @ index replicated into both bytes of a halfword
+ cmp r12,lr
+ bne 3b @ no_fill; that halfword does not repeat the index, draw the pair the normal way
+
+ tst r5, #1
+ sub lr, r5, #2 @ starting r5 (32x render data start)
+ addeq r5, r5, #2 @ even r5: step over the halfword that was just compared
+ addne r5, r5, #1 @ add for the check above; r5 is now even in both cases
+ add r6, r6, #1 @ restore from dec
+ orr r7, r7, r7, lsl #8 @ r7 = halfword pattern used by the scan loop
+6: @ scan loop, reads two halfwords per pass
+ sub r12,r5, lr @ bytes between the run start and the current read position
+ ldrh r8, [r5], #2
+ cmp r12,r6, lsl #1 @ limit check against r6*2 bytes
+ ldrh r12,[r5], #2 @ loads do not alter the flags, so the bge below still tests the cmp above
+ bge 7f @ count_done
+ cmp r8, r7
+ cmpeq r12,r7
+ beq 6b @ both halfwords match the pattern, keep scanning
+
+7: @ count_done
+ sub r5, r5, #4 @ undo readahead
+
+ @ fix alignment and check type
+ sub r8, r5, lr @ r8 = bytes from the run start to r5
+ tst r8, #1
+ subne r5, r5, #1 @ odd length: back up one byte so that r8 becomes even
+ subne r8, r8, #1
+
+ and r7, r7, #0xff @ drop the replicated upper byte, leaving the plain index
+ cmp r8, r6, lsl #1
+ mov r7, r7, lsl #1 @ index -> byte offset; no flag update, so movgt still uses the cmp above
+ movgt r8, r6, lsl #1 @ r8=count, limited to r6*2
+ ldrh r7, [r10,r7] @ r7 = 32x palette entry shared by the whole run
+ sub r6, r6, r8, lsr #1 @ adjust counter
+ tst r7, #0x20
+ beq 9f @ bg_mode
+
+ add r11,r11,r8 @ bit 0x20 set: MD pixels are not consulted here, step over r8 of them
+8: @ store r7 to the output, two halfwords per pass
+ subs r8, r8, #2
+ strgeh r7, [r0], #2
+ strgeh r7, [r0], #2
+ bgt 8b
+ b 2b @ loop_inner
+
+9: @ bg_mode:
+@ Fill path for a run whose 32x palette entry (r7) has bit 0x20 clear.
+@ Each MD pixel is tested on its own: the 32x value r7 is used only where
+@ the MD pixel shifted left by 26 equals r3 (r3 is set up outside this view).
+@ Two pixels are handled per pass; r8 holds the number of pixels left.
+@ Pixel 0 must be tested through r12 and pixel 1 through lr.
+ ldrb r12,[r11],#1 @ MD pixel 0
+ ldrb lr, [r11],#1 @ MD pixel 1
+ cmp r3, r12, lsl #26 @ MD pixel 0 is bg?
+.if \do_md
+ mov r12,r12,lsl #1 @ MD pixel -> byte offset; flags from the cmp are kept
+ ldrneh r12,[r9, r12] @ t = palmd[*pmd]
+ moveq r12,r7 @ bg: take the 32x value instead
+ cmp r3, lr, lsl #26 @ MD pixel 1 is bg?
+ mov lr, lr, lsl #1
+ ldrneh lr, [r9, lr]
+ moveq lr, r7
+ strh r12,[r0], #2
+ strh lr, [r0], #2
+.else
+ streqh r7, [r0] @ pixel 0: overwrite only where MD is bg
+ cmp r3, lr, lsl #26 @ MD pixel 1 is bg?
+ streqh r7, [r0, #2]
+ add r0, r0, #4 @ advance past both output pixels
+.endif
+ subs r8, r8, #2
+ bgt 9b @ bg_mode
+ b 2b @ loop_inner