ldmfd sp!,{r4-r11,pc}
+
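+@ Aspect-corrected upscaler, one row group at a time. Each uar_loop
+@ iteration reads a 6x3 block of 16bpp source pixels (three lines
+@ 240*2 bytes apart) and writes 32 output pixels to the rotated
+@ 240x320 destination, i.e. a 4:3 upscale on both axes. The
+@ unpack_lo/unpack_hi, do_3_5 and do_14_7_7_4 macros are defined
+@ earlier in this file; judging by their names, do_3_5 and
+@ do_14_7_7_4 produce 3:5 and 14:7:7:4 weighted averages (result in
+@ r2, in the unpacked form described below). The "@ n,m" comments on
+@ the stores name the two destination tile indices (0..31) packed
+@ into each word.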
+.global upscale_aspect_row @ void *dst, void *linesx4, u32 row
+upscale_aspect_row:
+ stmfd sp!,{r4-r11,lr}
+ mov lr, #0x0000001f
+ orr lr, lr, #0x0000f800 @ for "unpacked" form of
+ orr lr, lr, #0x07e00000 @ 00000ggg'ggg00000'rrrrr000'000bbbbb
+ mov r12, #0x00000001
+ orr r12,r12,#0x00000800
+ orr r12,r12,#0x00200000 @ rounding constant
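+ @ lr = 0x07e0f81f (mask for the unpacked form),
+ @ r12 = 0x00200801 (one rounding LSB per colour channel, unpacked positions)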
+
+ mov r8, #(240/6) @ cols (40 groups of 6 source pixels)
+
+ add r0, r0, #(240*320)*2
+ add r0, r0, #12*2
+ add r0, r0, r2, lsl #3
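+ @ r0 now points 12 + row*4 pixels past the end of the 240x320x16bpp
+ @ frame; the stores below walk back through it one 240-pixel line at
+ @ a time (the output is rotated)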
+
+uar_loop:
+ ldr r10,[r1]
+ ldr r11,[r1, #240*2*1]
+
+ unpack_lo r4, r10
+ unpack_hi r5, r10
+ unpack_lo r6, r11
+ unpack_hi r7, r11
+
+ ldr r11,[r1, #240*2*2]
+
+ do_3_5 r4, r6
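+ @ repack: in the unpacked form green sits 16 bits above its packed
+ @ position, so OR-ing a result with itself shifted by 16 yields a
+ @ packed RGB565 pixel; the same idiom recurs throughout the loop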
+ orr r2, r2, r2, lsr #16
+ mov r3, r10, lsl #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2]! @ 0,8
+
+ unpack_lo r10,r11
+ unpack_hi r9, r11
+
+ do_3_5 r10,r6
+ orr r2, r2, r2, lsl #16
+ mov r3, r11, lsl #16
+ orr r2, r3, r2, lsr #16
+ str r2, [r0, #4] @ 16,24
+
+ do_3_5 r4, r5
+ orr r3, r2, r2, lsl #16
+
+ do_14_7_7_4 r7, r5, r6, r4
+ orr r2, r2, r2, lsr #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2]! @ 1,9
+
+ ldr r11,[r1, #4]
+
+ do_14_7_7_4 r7, r6, r9, r10
+ orr r3, r2, r2, lsl #16
+
+ do_3_5 r10,r9
+ orr r2, r2, r2, lsr #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #4] @ 17,25
+
+ ldr r10,[r1, #240*2*1+4]
+
+ unpack_lo r4, r11
+ unpack_lo r6, r10
+
+ do_3_5 r4, r5
+ orr r3, r2, r2, lsl #16
+
+ do_14_7_7_4 r7, r5, r6, r4
+ orr r2, r2, r2, lsr #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2]! @ 2,10
+
+ do_3_5 r4, r6
+
+ ldr r4, [r1, #240*2*2+4]
+
+ orr r2, r2, r2, lsr #16
+ mov r3, r11, lsl #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2] @ 3,11
+
+ unpack_lo r5, r4
+
+ do_14_7_7_4 r7, r6, r9, r5
+ orr r3, r2, r2, lsl #16
+
+ do_3_5 r5, r9
+ orr r2, r2, r2, lsr #16
+ mov r2, r2, lsl #16
+ orr r2, r2, r3, lsr #16
+ str r2, [r0, #4] @ 18,26
+
+ do_3_5 r5, r6
+ orr r2, r2, r2, lsl #16
+ mov r3, r4, lsl #16
+ orr r2, r3, r2, lsr #16
+ str r2, [r0, #-240*2+4] @ 19,27
+
+ unpack_hi r5, r11
+ unpack_hi r6, r10
+ unpack_hi r7, r4
+
+ ldr r10,[r1, #8]
+
+ do_3_5 r5, r6
+ orr r2, r2, r2, lsr #16
+ mov r3, r11, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2*2]! @ 4,12
+
+ ldr r11,[r1, #240*2*1+8]
+
+ do_3_5 r7, r6
+ orr r2, r2, r2, lsl #16
+ mov r3, r4, lsr #16
+ mov r3, r3, lsl #16
+ orr r2, r3, r2, lsr #16
+ str r2, [r0, #4] @ 20,28
+
+ unpack_lo r4, r10
+ unpack_lo r9, r11
+
+ ldr r11,[r1, #240*2*2+8]
+
+ do_3_5 r5, r4
+ orr r3, r2, r2, lsl #16
+
+ do_14_7_7_4 r9, r4, r6, r5
+ orr r2, r2, r2, lsr #16
+ mov r2, r2, lsl #16
+ orr r2, r2, r3, lsr #16
+ str r2, [r0, #-240*2]! @ 5,13
+
+ unpack_lo r5, r11
+
+ do_14_7_7_4 r9, r5, r6, r7
+ orr r3, r2, r2, lsl #16
+
+ do_3_5 r7, r5
+ orr r2, r2, r2, lsr #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #4] @ 21,29
+
+ ldr r7, [r1, #240*2*1+8] @ reload middle line word (r11 was overwritten above)
+
+ unpack_hi r6, r10
+ unpack_hi r7, r7
+
+ do_3_5 r6, r4
+ orr r3, r2, r2, lsl #16
+
+ do_14_7_7_4 r9, r4, r7, r6
+ orr r2, r2, r2, lsr #16
+ mov r2, r2, lsl #16
+ orr r2, r2, r3, lsr #16
+ str r2, [r0, #-240*2]! @ 6,14
+
+ unpack_hi r4, r11
+
+ do_14_7_7_4 r9, r5, r7, r4
+ orr r3, r2, r2, lsl #16
+
+ do_3_5 r4, r5
+ orr r2, r2, r2, lsr #16
+ mov r3, r3, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #4] @ 22,30
+
+ do_3_5 r6, r7
+ orr r2, r2, r2, lsr #16
+ mov r3, r10, lsr #16
+ orr r2, r3, r2, lsl #16
+ str r2, [r0, #-240*2]! @ 7,15
+
+ do_3_5 r4, r7
+ orr r2, r2, r2, lsl #16
+ mov r3, r11, lsr #16
+ mov r3, r3, lsl #16
+ orr r2, r3, r2, lsr #16
+ str r2, [r0, #4] @ 23,31
+
+ subs r8, r8, #1
+ add r1, r1, #12 @ advance source by 6 pixels
+ bne uar_loop
+
+ ldmfd sp!,{r4-r11,pc}
+
+
+@ bonus function
+
+@ input: r2-r5
+@ output: r7,r8
+@ trash: r6
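+@ rb_line_low packs the low halfword (first pixel) of each of r2-r5
+@ into r7,r8 as four consecutive output pixels; rb_line_hi does the
+@ same with the high halfwords (second pixels)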
+.macro rb_line_low
+ mov r6, r2, lsl #16
+ mov r7, r3, lsl #16
+ orr r7, r7, r6, lsr #16
+ mov r6, r4, lsl #16
+ mov r8, r5, lsl #16
+ orr r8, r8, r6, lsr #16
+.endm
+
+.macro rb_line_hi
+ mov r6, r2, lsr #16
+ mov r7, r3, lsr #16
+ orr r7, r6, r7, lsl #16
+ mov r6, r4, lsr #16
+ mov r8, r5, lsr #16
+ orr r8, r6, r8, lsl #16
+.endm
+
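+@ Plain 90-degree rotation blit: each iteration reads a 4x4 block of
+@ 16bpp pixels from four source lines (240*2 bytes apart) and writes
+@ four pixels per destination line, stepping back one 240-pixel line
+@ after every stmia; 240/4 iterations cover the full source lines.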
+.global do_rotated_blit @ void *dst, void *linesx4, u32 y
+do_rotated_blit:
+ stmfd sp!,{r4-r8,lr}
+
+ add r0, r0, #(240*320)*2
+ sub r0, r0, #(240*40)*2
+ sub r0, r0, #(240-40+4)*2 @ y starts from 4
+ add r0, r0, r2, lsl #1
+
+ mov lr, #240/4 @ 60 iterations, 4 pixels of each line per pass
+
+rotated_blit_loop:
+ ldr r2, [r1, #240*0*2]
+ ldr r3, [r1, #240*1*2]
+ ldr r4, [r1, #240*2*2]
+ ldr r5, [r1, #240*3*2]
+ rb_line_low
+ stmia r0, {r7,r8}
+ sub r0, r0, #240*2
+ rb_line_hi
+ stmia r0, {r7,r8}
+ sub r0, r0, #240*2
+
+ ldr r2, [r1, #240*0*2+4]
+ ldr r3, [r1, #240*1*2+4]
+ ldr r4, [r1, #240*2*2+4]
+ ldr r5, [r1, #240*3*2+4]
+ rb_line_low
+ stmia r0, {r7,r8}
+ sub r0, r0, #240*2
+ rb_line_hi
+ stmia r0, {r7,r8}
+ sub r0, r0, #240*2
+
+ subs lr, lr, #1
+ add r1, r1, #8 @ advance source by 4 pixels
+ bne rotated_blit_loop
+
+ ldmfd sp!,{r4-r8,pc}
+
@ vim:filetype=armasm