[gitweb navigation residue — not part of the source file]
summary | shortlog | log | commit | commitdiff | tree | raw | patch | inline | side by side (from parent 1: 587a5b1)
Preloading a couple of cache lines ahead seems to give the best results —
close to a 30% speedup.
@ bgr555_to_rgb565 -- convert BGR555 pixels to RGB565 (ARM32 NEON).
@ NOTE(review): every statement in this fragment appears twice (duplicate
@ .global/label pair, duplicate constant setup, duplicate loop body). This
@ looks like a side-by-side diff-extraction artifact, not valid standalone
@ source -- as written, the repeated label is a symbol redefinition error.
@ Code left byte-identical for review.
@ NOTE(review): fragment is truncated -- the loop close, the btr16_end64
@ target, and the function return are not visible here.
@ Presumed register roles (from usage below -- TODO confirm against caller):
@   r1 = source pointer (post-incremented 64 bytes per iteration)
@   r2 = byte count (64 bytes = 32 pixels processed per loop pass)
.global bgr555_to_rgb565
bgr555_to_rgb565:
.global bgr555_to_rgb565
bgr555_to_rgb565:
mov r3, #0x07c0            @ presumably a green-field mask/constant -- TODO confirm
vdup.16 q15, r3            @ broadcast constant to all eight 16-bit lanes of q15
subs r2, r2, #64           @ at least 64 bytes left?
blt btr16_end64            @ no: handle the sub-64-byte tail (target not in view)
0:
mov r3, #0x07c0            @ (duplicate of the setup above -- diff artifact)
vdup.16 q15, r3
subs r2, r2, #64
blt btr16_end64
0:
vldmia r1!, {q0-q3}        @ load 64 source bytes, advance r1
vshl.u16 q4, q0, #11       @ shift low 5-bit field to the top of each 16-bit lane
vshl.u16 q5, q1, #11
vldmia r1!, {q0-q3}        @ (duplicate of the loop body above -- diff artifact)
vshl.u16 q4, q0, #11
vshl.u16 q5, q1, #11
@ bgr888_to_rgb888 -- swap R and B channels of 24-bit pixels (ARM32 NEON).
@ NOTE(review): statements duplicated throughout (diff-extraction artifact);
@ the repeated label would be a symbol redefinition error as standalone
@ source. Code left byte-identical. Fragment is truncated -- loop close and
@ return are not visible here.
@ Presumed register roles (from usage below -- TODO confirm):
@   r1 = source pointer (64-bit aligned, per the :64 hints)
@   r2 = byte count, converted to iteration count below
.global bgr888_to_rgb888
bgr888_to_rgb888:
.global bgr888_to_rgb888
bgr888_to_rgb888:
@ r2 /= 48
mov r2, r2, lsr #4         @ r2 /= 16 ...
movw r3, #0x5556           @ ... then r3 = 0x55555556, the reciprocal magic
movt r3, #0x5555           @ constant for unsigned divide-by-3
umull r12,r2, r3, r2       @ r2 = high word of r2*0x55555556, i.e. r2/3 -> total /48
0:
@ r2 /= 48                 @ (duplicate of the setup above -- diff artifact)
mov r2, r2, lsr #4
movw r3, #0x5556
movt r3, #0x5555
umull r12,r2, r3, r2
0:
vld3.8 {d0-d2}, [r1, :64]! @ de-interleave 8 BGR pixels: d0=B, d1=G, d2=R (presumed order)
vld3.8 {d3-d5}, [r1, :64]! @ second batch of 8 pixels into d3-d5
vswp d0, d2                @ swap B and R planes -> RGB order
vld3.8 {d0-d2}, [r1, :64]! @ (duplicate of the loop body above -- diff artifact)
vld3.8 {d3-d5}, [r1, :64]!
vswp d0, d2
@ bgr888_to_rgb565 -- convert 24-bit BGR pixels to RGB565 (ARM32 NEON).
@ NOTE(review): statements duplicated throughout (diff-extraction artifact);
@ the repeated label would be a symbol redefinition error as standalone
@ source. The duplicated "r2 /= 48" setup here is also incomplete relative
@ to bgr888_to_rgb888 above (movt/umull lines missing) -- further evidence
@ this is mangled diff output. Code left byte-identical. Fragment is
@ truncated -- loop close and return are not visible here.
@ Presumed register roles (from usage below -- TODO confirm):
@   r1 = source pointer (64-bit aligned, per the :64 hints)
@   r2 = byte count, being converted to an iteration count
.global bgr888_to_rgb565
bgr888_to_rgb565:
.global bgr888_to_rgb565
bgr888_to_rgb565:
@ r2 /= 48
mov r2, r2, lsr #4         @ r2 /= 16 (start of the /48 computation)
movw r3, #0x5556           @ low half of the divide-by-3 magic constant
@ r2 /= 48                 @ (duplicate, and the movt/umull steps are missing)
mov r2, r2, lsr #4
movw r3, #0x5556
mov r3, #0x07e0            @ 0x07e0 = RGB565 green-field mask (bits 5..10)
vdup.16 q15, r3            @ broadcast green mask to all 16-bit lanes of q15
0:
mov r3, #0x07e0            @ (duplicate of the setup above -- diff artifact)
vdup.16 q15, r3
0:
vld3.8 {d1-d3}, [r1, :64]! @ de-interleave 8 BGR pixels into d1/d2/d3 planes
vld3.8 {d5-d7}, [r1, :64]! @ second batch of 8 pixels into d5-d7
vld3.8 {d1-d3}, [r1, :64]! @ (duplicate of the loop body above -- diff artifact)
vld3.8 {d5-d7}, [r1, :64]!