+.global memset32 @ int *dest, int c, int count
+
+memset32:
+ stmfd sp!, {lr}
+
+ mov r3, r1
+ subs r2, r2, #4
+ bmi mst32_fin
+
+ mov r12,r1
+ mov lr, r1
+
+mst32_loop:
+ subs r2, r2, #4
+ stmia r0!, {r1,r3,r12,lr}
+ bpl mst32_loop
+
+mst32_fin:
+ tst r2, #1
+ strne r1, [r0], #4
+
+ tst r2, #2
+ stmneia r0!, {r1,r3}
+
+ ldmfd sp!, {lr}
+ bx lr
+
+
+@ warning: this code relies on palette being strictly RGB555, i.e. bit5=0
+.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines
+
+soft_scale:
+ stmfd sp!,{r4-r11,lr}
+ mov lr, #0xff
+ mov lr, lr, lsl #1
+ mov r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
+ orr r9, r9, #0x00e7
+
+ mov r11,r3 @ r11= line counter
+ mov r3, r1 @ r3 = pal base
+
+ mov r12,#320
+ mul r2, r12,r2
+ add r4, r0, r2, lsl #1 @ r4 = dst_start
+ add r5, r0, r2 @ r5 = src_start
+ mul r12,r11,r12
+ add r0, r4, r12,lsl #1 @ r0 = dst_end
+ add r1, r5, r12 @ r1 = src_end
+
+ mov r2, r11
+
+soft_scale_loop:
+ sub r1, r1, #64 @ skip borders
+ orr r2, r2, #(256/8-1)<<24
+
+soft_scale_loop_line:
+ ldr r12, [r1, #-8]!
+ ldr r7, [r1, #4]
+
+ and r4, lr, r12,lsl #1
+ ldrh r4, [r3, r4]
+ and r5, lr, r12,lsr #7
+ ldrh r5, [r3, r5]
+ and r11,r4, r9, lsl #2
+ orr r4, r4, r11,lsl #14 @ r4[31:16] = 1/4 pix_s 0
+ and r11,r5, r9, lsl #2
+ sub r6, r5, r11,lsr #2 @ r6 = 3/4 pix_s 1
+ add r4, r4, r6, lsl #16 @ pix_d 0, 1
+ and r6, lr, r12,lsr #15
+ ldrh r6, [r3, r6]
+ and r12,lr, r12,lsr #23
+ ldrh r12,[r3, r12]
+
+ mov r11,r6, ror #11
+ adds r5, r11,r5, ror #11
+ mov r5, r5, ror #22
+ bic r5, r5, #0xff000000
+ bic r5, r5, #0x0420 @ set the green bits as they should be
+ orrcs r5, r5, #0x0400
+
+ and r11,r6, r9, lsl #2
+ sub r6, r6, r11,lsr #2 @ r6 = 3/4 pix_s 2
+ orr r5, r5, r6, lsl #16
+
+ and r6, lr, r7, lsl #1
+ ldrh r6, [r3, r6]
+ and r11,r12,r9, lsl #2
+ add r5, r5, r11,lsl #14 @ pix_d 2, 3
+ orr r6, r12,r6, lsl #16 @ pix_d 4, 5
+
+ and r12,lr, r7, lsr #7
+ ldrh r12,[r3, r12]
+ and r10,lr, r7, lsr #15
+ ldrh r10,[r3, r10]
+ and r11,r12,r9, lsl #2
+ sub r8, r12,r11,lsr #2 @ r8 = 3/4 pix_s 1
+ and r11,r6, r9, lsl #18
+ add r8, r8, r11,lsr #18
+ and r7, lr, r7, lsr #23
+ ldrh r7, [r3, r7]
+
+ mov r11,r10,ror #11
+ adds r12,r11,r12,ror #11
+ mov r12,r12,ror #22
+ bic r12,r12,#0x0420
+ orrcs r12,r12,#0x0400
+ orr r8, r8, r12,lsl #16 @ pix_d 6, 7
+
+ and r11,r10,r9, lsl #2
+ sub r10,r10,r11,lsr #2 @ r10= 3/4 pix_s 2
+ and r11,r7, r9, lsl #2
+ add r10,r10,r11,lsr #2 @ += 1/4 pix_s 3
+ orr r10,r10,r7, lsl #16 @ pix_d 8, 9
+
+ subs r2, r2, #1<<24
+
+ stmdb r0!, {r4,r5,r6,r8,r10}
+ bpl soft_scale_loop_line
+
+ add r2, r2, #1<<24
+ subs r2, r2, #1
+ bne soft_scale_loop
+
+ ldmfd sp!,{r4-r11,lr}
+ bx lr
+
+
+@ void convert2RGB555(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);
+
+.global convert2RGB555
+
+convert2RGB555:
+ stmfd sp!,{r4-r8,lr}
+ mov lr, #0xff
+ mov lr, lr, lsl #1