blit320_640:
stmfd sp!, {r4-r8,lr}
mov r12, #40
+ bic r1, r1, #3
0:
ldmia r1!, {r2-r8,lr}
lhw_str r2, r3
blit320_512:
stmfd sp!, {r4-r8,lr}
mov r12, #32
+ bic r1, r1, #3
0:
ldmia r1!, {r2-r8,lr}
lsl r2, #16
blit320_368:
stmfd sp!, {r4-r8,lr}
mov r12, #23
+ bic r1, r1, #3
0:
ldmia r1!, {r2-r8,lr}
unaligned_str r2, r3 @ 1,2
* See the COPYING file in the top-level directory.
*/
+#include <stdint.h>
#include "cspace.h"
/*
|| (defined(__GNUC__) && __GNUC__ >= 5)) \
&& __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
-#include <stdint.h>
#include <assert.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
void bgr555_to_rgb565(void *dst_, const void *src_, int bytes)
{
- const unsigned int *src = src_;
+ // source can be misaligned, but it's very rare, so just force alignment
+ const unsigned int *src = (const void *)((intptr_t)src_ & ~3);
unsigned int *dst = dst_;
unsigned int x, p, r, g, b;
orr lr, lr, lsl #16
blt 1f
+ @ src can be unaligned, but that's very rare, so just force it.
+ @ The manual says unaligned ldm should fault, and it does on
+ @ cortex-a78's 32bit mode, but curiously on cortex-a8 it just
+ @ works and loads the data correctly.
+ bic r1, r1, #3
+
0:
ldmia r1!, {r3-r10}
subs r2, #4*8
pld [r1, #64*2]
@ Pulls 15-bit BGR color values (which are actually 16 bits) into q0-q3.
@ example: q0 = 0111 1110 0101 0011
- vldmia r1!, {q0-q3}
+ vld1.16 {d0-d3}, [r1]!
+ vld1.16 {d4-d7}, [r1]!
@ Shift BGR color 1 bit to the left, discarding MSB and preparing for vbit.
@ MSB is used for transparency (not needed here, and can mess with green).
@ example: q0 = 1111 1100 1010 0110
vdup.16 q14, r3
0:
pld [r1, #64*2]
- vldmia r1!, {q0-q3}
+ vld1.16 {d0-d3}, [r1]!
+ vld1.16 {d4-d7}, [r1]!
vand.u16 q8, q0, q14
vand.u16 q9, q1, q14
vand.u16 q10, q2, q14
bx lr
-@ vim:filetype=armasm
+@ vim:filetype=armasm:expandtab
#endif
else
{
- src = (void *)((uintptr_t)src & ~3); // align for the blitter
-
for (; h1-- > 0; dest += dstride * 2, src += stride)
{
bgr555_to_rgb565(dest, src, w * 2);