+@ bswap reg: swap the two bytes inside each halfword of reg (0x12345678 -> 0x34127856)
+@ r4=temp (clobbered), lr=0x00ff00ff (must be set up by the user); flags are not modified
+.macro bswap reg
+ and r4, \reg, lr @ r4 = low byte of each halfword
+ and \reg, lr, \reg, lsr #8 @ reg = high byte of each halfword, moved down 8 bits
+ orr \reg, \reg, r4, lsl #8 @ merge in the low bytes, moved up 8 bits
+.endm
+
+
+@ memcpy16bswap: copy count halfwords from src to dest, swapping the two
+@ bytes of every halfword on the way.
+@ dest must be halfword aligned, src can be unaligned
+@ in: r0 = dest, r1 = src, r2 = count (in halfwords; count <= 0 copies nothing)
+@ clobbers r0-r3, r12 and the flags; r4 and lr are preserved
+.global memcpy16bswap @ unsigned short *dest, void *src, int count
+
+memcpy16bswap:
+ cmp r2, #0 @ count <= 0: return before touching memory, otherwise the
+ bxle lr @ tail code and the count-down loops below would still copy
+
+ tst r1, #1 @ odd src address: only byte loads are possible
+ bne mcp16bs_cant_align2
+
+ eor r3, r0, r1
+ tst r3, #2 @ dest and src differ in bit 1: they can never be word
+ bne mcp16bs_cant_align @ aligned at the same time, so go halfword by halfword
+
+ tst r0, #2
+ beq mcp16bs_aligned
+ ldrh r3, [r1], #2 @ one leading halfword makes both pointers word aligned
+ sub r2, r2, #1
+ orr r3, r3, r3, lsl #16 @ duplicate the halfword into the upper half, then shift
+ mov r3, r3, lsr #8 @ so that bits 15..0 hold it with its bytes swapped
+ strh r3, [r0], #2
+
+mcp16bs_aligned:
+ stmfd sp!, {r4,lr} @ bswap uses r4 as temp and lr as mask
+ mov lr, #0xff
+ orr lr, lr, lr, lsl #16 @ lr = 0x00ff00ff
+
+ subs r2, r2, #4 @ from here on r2 = remaining halfwords - 4
+ bmi mcp16bs_fin4 @ fewer than 4 halfwords left
+
+mcp16bs_loop: @ 2 words (4 halfwords) per iteration
+ ldmia r1!, {r3,r12}
+ subs r2, r2, #4 @ bswap leaves the flags alone, so bpl below tests this result
+ bswap r3
+ bswap r12
+ stmia r0!, {r3,r12}
+ bpl mcp16bs_loop
+
+mcp16bs_fin4: @ r2 is -4..-1 here; its low 2 bits equal the remaining count
+ tst r2, #2
+ beq mcp16bs_fin2
+ ldr r3, [r1], #4 @ 2 halfwords left over: copy them as one word
+ bswap r3
+ str r3, [r0], #4
+
+mcp16bs_fin2:
+ ldmfd sp!, {r4,lr}
+ ands r2, r2, #1
+ bxeq lr
+ @ one halfword left: fall through into the halfword loop with r2 = 1
+
+mcp16bs_cant_align: @ src is halfword aligned; one halfword per iteration
+ ldrh r3, [r1], #2
+ subs r2, r2, #1
+ orr r3, r3, r3, lsl #16 @ same byte swap as in the lead-in copy above
+ mov r3, r3, lsr #8
+ strh r3, [r0], #2
+ bne mcp16bs_cant_align
+ bx lr
+
+ @ worst case: src is odd, so build each halfword from two byte loads
+mcp16bs_cant_align2:
+ ldrb r3, [r1], #1
+ ldrb r12,[r1], #1
+ subs r2, r2, #1
+ mov r3, r3, lsl #8 @ first byte in memory becomes the high byte
+ orr r3, r3, r12 @ second byte becomes the low byte
+ strh r3, [r0], #2
+ bne mcp16bs_cant_align2
+ bx lr
+
+
+
.global memcpy32 @ int *dest, int *src, int count
memcpy32: